In [1]:
import io
import sys
import os
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from utils.text_analysis import TextAnalysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
#Llamadando a esta clase procedemos a predeterminar el lenguaje como español, el análisis se hará en español
ta = TextAnalysis('es')

Error load_sapcy: [E050] Can't find model 'es_core_news_md'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.


## Lectura y observación de la generación de nuestro csv

In [3]:
data_raw = pd.read_csv('Users_Clasifiaction.csv', sep='|')
data_raw.head()

Unnamed: 0,User,Type,Tweets
0,acfdc9e43ed5efa10a903453261a3c12,0,['Ameniza la tarde con un poco de música en @S...
1,40adbb05f96fdd96f767b5967458faf1,0,"['¡VIERNES! https://t.co/De3i7hWUbJ', '@JacobG..."
2,4aa2fb302140ec35cc6bc8a0d7d35f6,0,['@mariafaber @ShiOque https://t.co/636iLpGRpt...
3,58db587d884d22afefbcd37aa26af458,0,['@elPixelPerdido 🤔 Nos das más info por MD?'...
4,dd9494d1bff7fa477cc03fea5294a510,0,['Tenemos el remedio perfecto para la depresió...


## Erradicación de semejanzas en el diccionario

In [4]:
setting = {'url': True, 'mention': True, 'emoji': True, 'hashtag': True, 'stopwords': False, 'relabel': True} 
list_sentences = []
#Iterando con barra progresiva
for row in tqdm(data_raw['Tweets'].to_list()):
    #Erradicación mediante el metodo de clean_text
    #Se pasa como parametro los Twetts existentes de nuestro csv y se comparan
    text = ta.clean_text(row, **setting)
    #print('Text org: {0} \nTex clean: {1}'.format(row, text))
    list_sentences.append(text)

100%|█████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:06<00:00, 431.91it/s]


## Asignación de valores a nuestras variables a trabajar

In [5]:
x = list_sentences
y = data_raw['Type'].to_list()

## Conteo de bots y personas reales

In [6]:
print('**Replica y_train:', sorted(Counter(y).items()))

**Replica y_train: [(0, 1500), (1, 1500)]


## Entrenamiento del algoritmo con un tamaño de prueba del 30%

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=8675309)
print('**Replica train: {0}, size {1}'.format(sorted(Counter(y_train).items()), len(y_train))) #70% para entrenar
print('**Replica test: {0}, size {1}'.format(sorted(Counter(y_test).items()), len(y_test))) #30% para test

**Replica train: [(0, 1059), (1, 1041)], size 2100
**Replica test: [(0, 441), (1, 459)], size 900


# Features in Bag of Words

In [8]:
#Determinando los valores del ngrama que va hasta 3grama, recibe los acentos, los pasa a minúsculas y analiza solo las palabras
vec = CountVectorizer(min_df=5, ngram_range=(1,3), max_features=5000, strip_accents='unicode', lowercase =True, analyzer='word')
vec.fit(x_train)#Ajusta los valores de los datos para poder obtener una mayor precisión

#Transforma los valores manteniendo el margen del tamaño de el original
x_train = vec.transform(x_train)
x_test = vec.transform(x_test)

In [9]:
print(vec.get_feature_names())

['abado', 'abajo', 'abierto', 'abogado', 'aborto', 'about', 'abrazo', 'abre', 'abril', 'abrir', 'abuela', 'abuso', 'abusos', 'aca', 'acaba', 'acaba de', 'acabar', 'acabo', 'acabo de', 'acaso', 'acceso', 'accidente', 'accidente de', 'accion', 'acciones', 'acerca', 'acevedo', 'acevedo pfeiffer', 'acevedo pfeiffer url', 'actitud', 'actividad', 'acto', 'actor', 'actos', 'actriz', 'actual', 'actuar', 'acuerdo', 'acuerdo con', 'acusa', 'adelante', 'ademas', 'adios', 'administracion', 'adventista', 'adventistas', 'adventistas cl', 'aeropuerto', 'afecta', 'aficogt', 'aficogt url', 'aficogt url reporte', 'afirma', 'afirma que', 'ag', 'agencia', 'agenda', 'agosto', 'agp', 'agp deportes', 'agp deportes url', 'agresion', 'agua', 'ah', 'ahi', 'ahora', 'ahora que', 'ahora si', 'ahorras', 'ahorras eur', 'ahorras eur emoji', 'ahre', 'aire', 'aires', 'al', 'al de', 'al de san', 'al final', 'al gobierno', 'al hashtag', 'al lado', 'al menos', 'al mention', 'al mundo', 'al no', 'al no de', 'al pais', 'al 

In [10]:
print(x_train.toarray())

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# RandomOverSampler

In [11]:
ros = RandomOverSampler(random_state=1000)
x_train, y_train = ros.fit_resample(x_train, y_train)
x_test, y_test = ros.fit_resample(x_test, y_test)
print('**RandomOverSampler train:', sorted(Counter(y_train).items()))
print('**RandomOverSampler test:', sorted(Counter(y_test).items()))

**RandomOverSampler train: [(0, 1059), (1, 1059)]
**RandomOverSampler test: [(0, 459), (1, 459)]


In [12]:
classifier = LogisticRegression(C=10, solver='lbfgs', multi_class='multinomial',max_iter=1000) 
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Metrics

In [13]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

cv_score = np.mean(cross_val_score(classifier, x_train, y_train, cv=5, scoring='accuracy'))

[[430  29]
 [  9 450]]
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       459
           1       0.94      0.98      0.96       459

    accuracy                           0.96       918
   macro avg       0.96      0.96      0.96       918
weighted avg       0.96      0.96      0.96       918



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy: {}%'.format(round(accuracy, 2)*100))
print('Recall: {}%'.format(round(recall, 2)*100))
print('Precision: {}%'.format(round(precision, 2)*100))
print('F1: {}%'.format(round(f1, 2)*100))

Accuracy: 96.0%
Recall: 96.0%
Precision: 96.0%
F1: 96.0%
