In [None]:
# Execute the two earlier notebooks of this series before anything else runs.
# NOTE(review): names used further down but never defined in this notebook
# (X_train, y_train, X_test, y_test, TextProcessing, separator) are presumably
# created by these runs — confirm against the two notebooks.
%run 1_import_analysis.ipynb
%run 2_preprocessing.ipynb

## Configuración del pipeline y modelos a utilizar para la predicción de los sentimientos

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Preprocessing stages shared by every model pipeline, listed in execution order.
# Each entry is a (step_name, transformer) pair as expected by sklearn's Pipeline.
common_steps = [
    ('text_processing', TextProcessing()),  # transformer defined outside this cell
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Optional stage, currently disabled:
    # ('feature_selection', SelectKBest(chi2, k=1000)),
]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier


In [None]:
# Candidate models.
# Each *_pipe is a two-element list: [pipeline, param_grid]. The pipeline is the
# shared preprocessing steps followed by one classifier; the grid uses sklearn's
# "<step_name>__<parameter>" naming to address the classifier's hyper-parameters.
# The step instances in common_steps are shared by all four pipelines; that is
# safe here because GridSearchCV clones the estimator before every fit.

# One seed for the CV splitter and for the randomized tree-based estimators, so
# that a fresh "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 0

LR_pipe = [
    Pipeline(common_steps + [('lr', LogisticRegression())]),
    {'lr__C': [0.01, 0.1, 1], 'lr__penalty': ['l2']},
]
DT_pipe = [
    Pipeline(common_steps + [('dt', tree.DecisionTreeClassifier(random_state=RANDOM_STATE))]),
    {'dt__max_depth': [512, 1024]},
]
MNB_pipe = [
    Pipeline(common_steps + [('mnb', MultinomialNB())]),
    {'mnb__alpha': [1, 10, 20]},
]
RF_pipe = [
    Pipeline(common_steps + [('rf', RandomForestClassifier(random_state=RANDOM_STATE))]),
    {'rf__n_estimators': [100, 200], 'rf__max_depth': [5, 10]},
]

# Shuffled, stratified 3-fold splitter reused by every grid search.
skf = StratifiedKFold(n_splits=3, random_state=RANDOM_STATE, shuffle=True)

# Fitted GridSearchCV objects are collected here by the fitting cell.
models = []


In [None]:
# Run a grid search for every candidate [pipeline, param_grid] pair.
pipelines = [LR_pipe, DT_pipe, MNB_pipe, RF_pipe]

# Reset the result list here so that re-running this cell is idempotent;
# otherwise each re-run would append a second copy of every search and the
# evaluation / selection cells would process duplicates.
models = []

for pipe in pipelines:
    # The last pipeline step is the classifier; its step name labels the model.
    print('fitting', pipe[0].steps[-1][0])
    GS_CV = GridSearchCV(pipe[0], pipe[1], cv=skf, verbose=10, n_jobs=3)
    GS_CV.fit(X_train, y_train)
    models.append(GS_CV)
    # best_score_ is the mean cross-validated score of the best parameter set.
    print('best score:', GS_CV.best_score_)
    print('best params:', GS_CV.best_params_)
    separator()

In [None]:
## Models evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Report test-set metrics for every fitted grid search in `models`.
for model in models:
    print(model.best_estimator_)
    # GridSearchCV.predict delegates to the refitted best estimator.
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    # The report and the matrix are multi-line; start them on their own line so
    # the first row is not shifted out of alignment by the label.
    print('Classification report: ', classification_report(y_test, y_pred), sep='\n')
    print('Confusion matrix: ', confusion_matrix(y_test, y_pred), sep='\n')



In [None]:
# Pick the grid search with the highest mean cross-validated score.
if not models:
    # Fail with an explanation instead of an opaque IndexError on models[0].
    raise ValueError('no fitted models to choose from; run the fitting cell first')

# max() returns the first maximal element, so ties keep the earliest model,
# the same outcome as scanning with a strict ">" comparison.
best_model = max(models, key=lambda fitted: fitted.best_score_)

best_model
        