# WhatsApp

## Wrapping

In [1]:
# %load basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Notebook-wide pandas display settings: show up to 200 columns and
## render floats with two decimals
pd.options.display.max_columns = 200
pd.set_option('display.float_format', '{:.2f}'.format)

## Base directory of the data; a chat file name is appended to it further down
file = '/home/ef/Documents/Diplomado/data/'

In [2]:
import time
## Wall-clock start; the last cell subtracts it from the end time to report the total run time
start = time.time()

In [3]:
## Build the chat path in its own variable instead of `file += ...`, so that
## re-running this cell does not append the file name to `file` a second time
chat_file = file + 'WhatsApp Chat with Naps 🐻🐼🐯.txt'
import whatsapp as wa
df = wa.read_chat(chat_file)

## Transform the chat and get the variable types: the results are used below as
## the categorical column list, the numeric column list and the author mapping
df,cat,num,autores = wa.TAD().transform(df)

## Treat outliers, one numeric column at a time
for col in num:
    df = wa.outlier(df,col)

[nltk_data] Downloading package stopwords to /home/ef/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.model_selection import train_test_split

## Set up y = f(X): the target is the author column passed through the
## `autores` mapping
df['OBJETIVO'] = df['Autor'].replace(autores)

## Features: the cleaned message text plus the categorical and numeric columns
feature_cols = ['Mensaje_limpio'] + cat + num
X = df[feature_cols]
y = df['OBJETIVO']

## Split into train (77% of the rows) and test, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.77, random_state = 22)

### Preprocesamiento

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

## Dummies for the categorical columns (dense output, categories unseen at
## fit time are ignored). The dense-output flag is called `sparse_output` in
## recent scikit-learn releases and `sparse` in older ones, so try the new
## spelling first and fall back to the old one.
try:
    ohe = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

## Min-max scaling for the numeric columns
mm_x = MinMaxScaler()

## Word weights for the text column: unigrams only, terms present in at
## least 10 messages, at most 100 features
cv = TfidfVectorizer(ngram_range = (1, 1), 
                     min_df = 10, 
                     max_features = 100)

## Apply one transformation per column type
prep = ColumnTransformer(transformers=[('OHE', ohe, cat),
                                       ('Scale', mm_x, num), 
                                       ('CountV', cv, 'Mensaje_limpio')])

In [6]:
## Top words per author (takes the chat frame and the text vectorizer defined above)
wa.words(df,cv)

Unnamed: 0,EF,Iván Jardón,Kevin Bacon
0,jaja,jaja,jaja
1,si,si,si
2,abuebo,we,jardon
3,pa,wey,mas
4,ah,voy,ah
5,mas,verga,we
6,amigo,bien,bien
7,we,asi,amigo
8,brob,mas,bro
9,bien,amigos,solo


### Modelos

In [None]:
## Metric shared by every search: ROC-AUC for a two-author chat, accuracy otherwise
scoring = 'roc_auc' if len(autores) == 2 else 'accuracy'

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

## Values shared by every sub-grid: C from 0.1 to 11.0 in steps of 0.1, and
## class_weight as the None object (the string 'None' is not a valid value)
c_values = [x+y/10 for x in range(11) for y in range(1,11)]
weight_options = [None, 'balanced']

## One sub-grid per penalty, listing only the solvers that support it, so no
## candidate is wasted on (or aborted by) an unsupported penalty/solver pair.
## 'elasticnet' is only available with 'saga' and needs an explicit l1_ratio;
## the three ratios below are a coarse starting grid.
param_logreg = [{'penalty':['l2'],
                 'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                 'C':c_values,
                 'class_weight':weight_options},
                {'penalty':['l1'],
                 'solver':['liblinear', 'saga'],
                 'C':c_values,
                 'class_weight':weight_options},
                {'penalty':['elasticnet'],
                 'solver':['saga'],
                 'l1_ratio':[0.25, 0.5, 0.75],
                 'C':c_values,
                 'class_weight':weight_options}
                ]

## Exhaustive 4-fold search over the valid combinations, using all cores
from sklearn.model_selection import GridSearchCV
search_logreg = GridSearchCV(param_grid = param_logreg, 
                             cv = 4, 
                             n_jobs = -1, 
                             scoring = scoring,
                             estimator = logreg,
                             verbose = 5)

In [None]:
from sklearn.ensemble import RandomForestClassifier 
forest = RandomForestClassifier()

## Search space for the forest.
## 'auto' was dropped from max_features: for classifiers it meant the same as
## 'sqrt' (so it only doubled the odds of sampling that option) and newer
## scikit-learn releases no longer accept it.
param_forest = {'n_estimators': list(range(1400, 1500, 10)),
                'max_features': ['sqrt', 'log2'],
                'criterion': ['gini', 'entropy'],
                'class_weight': ['balanced', None],
                'min_samples_split': list(range(10, 22)),
                ## floats: 0.001 .. 0.005
                'min_samples_leaf': [x/1000 for x in range(1, 6)]
               }

## Randomized 4-fold search: 30 sampled candidates, fixed seed, all cores
from sklearn.model_selection import RandomizedSearchCV
search_forest = RandomizedSearchCV(param_distributions = param_forest, 
                                   cv = 4, 
                                   n_jobs = -1, 
                                   scoring = scoring,
                                   estimator = forest,
                                   verbose = 5,
                                   n_iter = 30,
                                   random_state = 22)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

ada = AdaBoostClassifier()

## Search space: 50..500 estimators in steps of 50, learning rate 0.01..1.10
param_ada = dict(n_estimators = list(range(50,550,50)),
                 learning_rate = [step/100 for step in range(1,111)])

## Randomized 4-fold search: 50 sampled candidates, fixed seed, all cores
search_ada = RandomizedSearchCV(estimator = ada,
                                param_distributions = param_ada,
                                n_iter = 50,
                                scoring = scoring,
                                cv = 4,
                                n_jobs = -1,
                                verbose = 5,
                                random_state = 22)

In [None]:
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier()

## Search space for the boosted trees.
## subsample and colsample_bytree are fractions and must stay within (0, 1];
## the upper bound is therefore 1.00 (range(50,101)) instead of 1.10, which
## XGBoost rejects.
## NOTE(review): 'count:poisson' is a count-regression objective and
## 'multi:softmax' only applies with more than two classes - confirm that
## both are really wanted for this classifier.
param_xgb = {'learning_rate':[x/100 for x in range(1,111)],
             'n_estimators':[x for x in range(1,111)],
             'max_depth':[x for x in range(1,11)], 
             'min_child_weight':[x for x in range(1,111)],
             'objective':['count:poisson','multi:softmax'],
             'subsample':[x/100 for x in range(50,101)], 
             'colsample_bytree':[x/100 for x in range(50,101)], 
            }

## Randomized 4-fold search: 600 sampled candidates, fixed seed, all cores
from sklearn.model_selection import RandomizedSearchCV
search_xgb = RandomizedSearchCV(param_distributions = param_xgb, 
                                cv = 4, 
                                n_jobs = -1, 
                                scoring = scoring, 
                                estimator = xgb, 
                                verbose = 5,
                                n_iter = 600,
                                random_state = 22)

### Voting

In [None]:
from sklearn.ensemble import VotingClassifier

## Soft-voting ensemble whose members are the four hyper-parameter searches
members = [('LogReg', search_logreg),
           ('Forest', search_forest),
           ('ADA', search_ada),
           ('XGB',search_xgb)]
vc = VotingClassifier(estimators = members, voting = 'soft')

### Pipeline

In [None]:
from sklearn.pipeline import Pipeline

## Chain the preprocessing and the voting ensemble into one estimator
pipeline_steps = [('preproc', prep), ('modelo', vc)]
modelo = Pipeline(steps = pipeline_steps)

## Fit on the training split, then show the score on the test split
modelo.fit(X_train, y_train)
modelo.score(X_test, y_test)

In [None]:
## Variables most used to tell the authors apart
wa.top_variables(vc,X_train)

In [None]:
## Accuracy on the training set
from sklearn.metrics import confusion_matrix

## Inverse of `autores` (code -> author name), built once and used to label
## both axes of the matrix
code_to_author = {code: name for name, code in autores.items()}
axis_labels = [code_to_author[n] for n in sorted(np.unique(y_train))]

## Confusion matrix expressed as a share of all training rows
train_shares = confusion_matrix(y_true = y_train,
                                y_pred = modelo.predict(X_train))/len(y_train)
cm = pd.DataFrame(train_shares, index = axis_labels, columns = axis_labels)
display(cm)

## Accuracy is the sum of the diagonal of the confusion matrix
'Accuracy de {:.2%}'.format(np.asarray(cm).trace())

In [None]:
## Same check on the test set
from sklearn.metrics import confusion_matrix

## Predict once and reuse the result
y_pred_test = modelo.predict(X_test)

## Label set = every class present in the truth OR in the predictions, passed
## explicitly to confusion_matrix. Taking the labels from y_test alone breaks
## the frame construction (shape mismatch) whenever the model predicts a
## class that does not occur in y_test.
test_labels = sorted(set(y_test) | set(y_pred_test))
code_to_author = {code: name for name, code in autores.items()}
axis_labels = [code_to_author[n] for n in test_labels]

## Confusion matrix expressed as a share of all test rows
cm = pd.DataFrame(confusion_matrix(y_true = y_test,
                                   y_pred = y_pred_test,
                                   labels = test_labels)/len(y_test), 
                  index = axis_labels, 
                  columns = axis_labels)
display(cm)

## Accuracy is the sum of the diagonal of the confusion matrix
'Accuracy de {:.2%}'.format(np.asarray(cm).trace())

In [None]:
## Persist the whole pipeline `modelo` (preprocessing transformers and models)
import pickle
with open('modelo_whatsapp_naps.pkl', "wb") as pkl_out:
    pickle.dump(obj = modelo, file = pkl_out)

## Validación

In [None]:
## Open the pickle holding everything needed for validation
## NOTE: pickle.load can execute arbitrary code; only load files produced by
## this notebook or another trusted source
import pickle    
with open('modelo_whatsapp_naps.pkl', "rb") as pkl_in:
    modelo = pickle.load(pkl_in)

## Ready to use: show the transformers of the preprocessing step ...
display('Transformadores:')
preproc_step = modelo.named_steps['preproc']
display([spec[1] for spec in preproc_step.transformers])

## ... and the best estimator found by each fitted search in the ensemble
display('Modelos:')
voting_step = modelo.named_steps['modelo']
[search.best_estimator_ for search in voting_step.estimators_]

In [None]:
## Largest value of the 'Fecha' column in the chat currently held in `df`
df['Fecha'].max()

In [None]:
# %load basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Same pandas display settings as at the top of the notebook
pd.options.display.max_columns = 200
pd.set_option('display.float_format', '{:.2f}'.format)

## Chat export used for validation
file = '/home/ef/Documents/Diplomado/data/WhatsApp Chat with Naps_val.txt'

## The whole process above can live in a module of functions and classes
import whatsapp as wa

## Read the validation chat, then transform it and get the variable types
df = wa.read_chat(file)
df,cat,num,autores = wa.TAD().transform(df)

## Set up y = f(X) exactly as for training
df['OBJETIVO'] = df['Autor'].replace(autores)
feature_cols = ['Mensaje_limpio'] + cat + num
X = df[feature_cols]
y = df['OBJETIVO']

## Predict on the new data and decode the numeric predictions back to author
## names.
## The predictions are given X's index so the join lines every prediction up
## with its own row; a fresh 0..n-1 index would misalign them whenever `df`
## is not indexed 0..n-1.
## NOTE(review): `autores` was rebuilt from the validation chat; the decoding
## is only right if it assigns the same codes as during training - confirm.
code_to_author = {code: name for name, code in autores.items()}
estimated = pd.DataFrame(modelo.predict(X),
                         columns = ['Estimado'],
                         index = X.index
                        ).replace(code_to_author)
val = df.join(estimated)

## How accurate is the model on the validation chat?
from sklearn.metrics import confusion_matrix

## Author names in `autores` order, passed explicitly as `labels`: without it
## confusion_matrix orders rows/columns by sorted label, which need not match
## the order of `autores` used for the index/columns (mislabelled axes, or a
## shape mismatch when an author is missing from the data).
author_names = list(autores)
cm = pd.DataFrame(confusion_matrix(y_true = val['Autor'],
                                   y_pred = val['Estimado'],
                                   labels = author_names)/len(val), 
                  index = author_names, 
                  columns = author_names)
display(cm)

## Accuracy is the sum of the diagonal of the confusion matrix
'Accuracy de {:.2%}'.format(np.asarray(cm).trace())

## Fin

In [None]:
## Tone played when the code finishes running
from IPython.lib.display import Audio
import numpy as np

framerate = 4410
play_time_seconds = 1

## Evenly spaced time axis over the play time, then the sum of two sine waves
sample_count = framerate*play_time_seconds
t = np.linspace(0, play_time_seconds, sample_count)
first_wave = np.sin(5*np.pi*300*t)
second_wave = np.sin(2*np.pi*240*t)
audio_data = first_wave + second_wave

## Keep the following line after the code that should trigger the sound
Audio(audio_data, rate=framerate, autoplay=True)

In [None]:
## Total time taken to run the modelling
import math
end = time.time()
tiempo_tot = end - start

## Whole minutes, and the remaining fraction of a minute converted to seconds
whole_minutes = math.floor(tiempo_tot/60)
leftover_seconds = 60*(tiempo_tot/60 - whole_minutes)

minutes_text = str(int(whole_minutes))
seconds_text = '{:.2f}'.format(leftover_seconds)
minutes_text + " minutos con " + seconds_text + " segundos"