# Importaciones

In [29]:
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
import time

In [30]:
def error_measures(Yestimado, Yteorico):
    
    CM = confusion_matrix(Yteorico, Yestimado)

    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    
    sens = TP/(TP+FN)
    esp = TN/(TN+FP)
    
    return sens, esp

In [31]:
def classification_error(y_est, y_real):
    err = 0
    for y_e, y_r in zip(y_est, y_real):

        if y_e != y_r:
            err += 1

    return err/np.size(y_est)

In [32]:
df = pd.read_excel('Opiniones_Hoteles_Medellin.xlsx')
df.columns=['texto','sentimiento']

In [33]:
X=df.iloc[:,0]
y=df.iloc[:,1]

In [34]:
vector=CountVectorizer(ngram_range=(1, 2))

In [35]:
vector.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [36]:
bagOfWords=vector.transform(X)

# Regresión logística

In [37]:
from sklearn.linear_model import LogisticRegression
import scipy as sc

In [38]:
def RegresionLogistica(Cvalue):    
    kf = KFold(n_splits=10)
    lr = LogisticRegression(C = Cvalue, max_iter=200)
    acc = []
    sens = []
    esp = []
    Errores = np.ones(10)
    tiempo_i = time.time()
    
    j=0
    for train_index, test_index in kf.split(bagOfWords):
        Xtrain, Xtest = bagOfWords[train_index], bagOfWords[test_index]
        Ytrain, Ytest = y[train_index], y[test_index]  

        lr.fit(Xtrain,Ytrain)
        Yest = lr.predict(Xtest)
        s, e = error_measures(Yest, Ytest)
        sens.append(s); esp.append(e)
        acc.append(lr.score(Xtest,Ytest))   
        
        Errores[j] = classification_error(Yest, Ytest)
        j+=1

    #print("Accuracy: ", np.mean(acc), "+/-", np.std(acc))
    #print("Sensitivity: ", np.mean(sens), "+/-", np.std(sens))
    #print("Especificity: ", np.mean(esp), "+/-", np.std(esp))
    return np.mean(acc), np.std(acc), np.mean(sens), np.std(sens), np.mean(esp), np.std(esp), np.mean(Errores), time.time()-tiempo_i

In [39]:
import pandas as pd
import qgrid
randn = np.random.randn

tasas = pd.Series(['1.0', '0.1','0.001'])

df_types = pd.DataFrame({
    'Tasa de aprendizaje' : tasas})

df_types["Accuracy"] = ""
df_types["Int_Accuracy"] = ""
df_types["Sensibility"] = ""
df_types["Int_Sensibility"] = ""
df_types["Especificity"] = ""
df_types["Int_Especificity"] = ""
df_types["Error validación"] = ""
df_types["Tiempo ejecución"] = ""
df_types.set_index(['Tasa de aprendizaje'], inplace=True)
i=0

for eta in [1.0, 0.1, 0.001]:
    Acc, IntAcc, Sen, IntSen, Esp, IntEsp, error, tiempo=RegresionLogistica(eta)       
    df_types["Accuracy"][i] = Acc
    df_types["Int_Accuracy"][i] = IntAcc
    df_types["Sensibility"][i] = Sen
    df_types["Int_Sensibility"][i] = IntSen
    df_types["Especificity"][i] = Esp
    df_types["Int_Especificity"][i] = IntEsp
    df_types["Error validación"][i] = error
    df_types["Tiempo ejecución"][i] = tiempo
    i=i+1
        

# df_types["Error_Entrenamiento"][2] = "0.0"
# df_types["Error_Prueba"][2] = "0.5"
#df_types.sort_index(inplace=True)
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [40]:
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,Accuracy,Int_Accuracy,Sensibility,Int_Sensibility,Especificity,Int_Especificity,Error validación,Tiempo ejecución
Tasa de aprendizaje,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,0.848605,0.0593352,,,,,0.151395,0.20188
0.1,0.840799,0.0663257,,,,,0.159201,0.157913
0.001,0.641516,0.191019,,,,,0.358484,0.10094


# Árbol de decisión


In [41]:
from sklearn.tree import DecisionTreeClassifier as DT

In [42]:
def arbol_decision(prof):
    if(prof == 0):
        DT_model = DT(max_depth=None) 
    else:
        DT_model = DT(max_depth = prof)
    
    acc = []
    sens = []
    esp = []
    Errores = np.ones(100)
    tiempo_i = time.time()
    
    j=0
    for i in range(100):

        Xtrain,Xtest,Ytrain,Ytest = train_test_split(bagOfWords,y)   #Realiza una única partición de la base de datos

        DT_model.fit(Xtrain,Ytrain)
        Yest = DT_model.predict(Xtest)
        s, e = error_measures(Yest,Ytest)
        sens.append(s); esp.append(e)
        acc.append(DT_model.score(Xtest,Ytest))
        
        Errores[j] = classification_error(Yest, Ytest)
        j+=1

    #print("Accuracy: ", np.mean(acc), "+/-", np.std(acc))
    #print("Sensitivity: ", np.mean(sens), "+/-", np.std(sens))
    #print("Especificity: ", np.mean(esp), "+/-", np.std(esp))
    return np.mean(acc), np.std(acc), np.mean(sens), np.std(sens), np.mean(esp), np.std(esp),np.mean(Errores), time.time()-tiempo_i

In [43]:
randn = np.random.randn

df_types = pd.DataFrame({
    'Maxima profundidad' : pd.Series([0, 5,10,20,30,50,60,70])})

df_types["Accuracy"] = ""
df_types["Int_Accuracy"] = ""
df_types["Sensibility"] = ""
df_types["Int_Sensibility"] = ""
df_types["Especificity"] = ""
df_types["Int_Especificity"] = ""
df_types["Error validación"] = ""
df_types["Tiempo ejecución"] = ""
df_types.set_index(['Maxima profundidad'], inplace=True)

for i in df_types.index:
    Acc, IntAcc, Sen, IntSen, Esp, IntEsp, error, tiempo=arbol_decision(i)       
    df_types["Accuracy"][i] = Acc
    df_types["Int_Accuracy"][i] = IntAcc
    df_types["Sensibility"][i] = Sen
    df_types["Int_Sensibility"][i] = IntSen
    df_types["Especificity"][i] = Esp
    df_types["Int_Especificity"][i] = IntEsp
    df_types["Error validación"][i] = error
    df_types["Tiempo ejecución"][i] = tiempo
        
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [44]:
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,Accuracy,Int_Accuracy,Sensibility,Int_Sensibility,Especificity,Int_Especificity,Error validación,Tiempo ejecución
Maxima profundidad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.802171,0.0348489,0.779088,0.0503619,0.825685,0.0602123,0.197829,4.72729
5,0.800465,0.0365185,0.7507,0.0625269,0.852246,0.0840893,0.199535,2.72745
10,0.80155,0.0313546,0.759509,0.0543796,0.844628,0.055609,0.19845,3.75385
20,0.805736,0.0309655,0.783405,0.0533935,0.829808,0.0579112,0.194264,4.38047
30,0.803643,0.0359117,0.794622,0.0524795,0.8141,0.0606948,0.196357,4.63034
50,0.806977,0.0347456,0.786265,0.0477601,0.828607,0.0599895,0.193023,4.58037
60,0.800233,0.0319405,0.775901,0.0507578,0.826427,0.0587196,0.199767,4.5554
70,0.805891,0.0303995,0.789353,0.0513622,0.822103,0.0524168,0.194109,4.94414


# Random forest


In [45]:
from sklearn.ensemble import RandomForestClassifier as RF

In [46]:
def random_forest(est, carac):
    
    RF_model = RF(n_estimators=est, max_features =carac)
    acc = []
    sens = []
    esp = []
    Errores = np.ones(100)
    tiempo_i = time.time()
    
    j=0

    for i in range(100):
        Xtrain,Xtest,Ytrain,Ytest = train_test_split(bagOfWords,y)   #Realiza una única partición de la base de datos

        RF_model.fit(Xtrain,Ytrain)
        Yest = RF_model.predict(Xtest)
        s, e = error_measures(Yest,Ytest)
        sens.append(s); esp.append(e)
        acc.append(RF_model.score(Xtest,Ytest))
        Errores[j] = classification_error(Yest, Ytest)
        j+=1
        
    return np.mean(acc), np.std(acc), np.mean(sens), np.std(sens), np.mean(esp), np.std(esp),np.mean(Errores), time.time()-tiempo_i

In [47]:
randn = np.random.randn

df_types = pd.DataFrame({
    'Numero de arboles' : pd.Series([5,5,5,5,5,5,10,10,10,10,10,10,20,20,20,20,20,20,50,50,50,50,50,50,100,100,100,100,100,100]), 
    'Variables analizadas por nodo' : pd.Series([50,100,150,200,250,300,50,100,150,200,250,300,50,100,150,200,250,300,50,100,150,200,250,300,50,100,150,200,250,300])})

df_types["Accuracy"] = ""
df_types["Int_Accuracy"] = ""
df_types["Sensibility"] = ""
df_types["Int_Sensibility"] = ""
df_types["Especificity"] = ""
df_types["Int_Especificity"] = ""
df_types["Error validación"] = ""
df_types["Tiempo ejecución"] = ""
df_types.set_index(['Numero de arboles', 'Variables analizadas por nodo'], inplace=True)

for i in df_types.index:
    Acc, IntAcc, Sen, IntSen, Esp, IntEsp, error, tiempo=random_forest(i[0], i[1])       
    df_types["Accuracy"][i] = Acc
    df_types["Int_Accuracy"][i] = IntAcc
    df_types["Sensibility"][i] = Sen
    df_types["Int_Sensibility"][i] = IntSen
    df_types["Especificity"][i] = Esp
    df_types["Int_Especificity"][i] = IntEsp
    df_types["Error validación"][i] = error
    df_types["Tiempo ejecución"][i] = tiempo
        
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [48]:
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Int_Accuracy,Sensibility,Int_Sensibility,Especificity,Int_Especificity,Error validación,Tiempo ejecución
Numero de arboles,Variables analizadas por nodo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5,50,0.786357,0.0425343,0.822353,0.0704729,0.75276,0.0958447,0.213643,3.18019
5,100,0.791395,0.0382147,0.810732,0.0643881,0.774449,0.0778693,0.208605,3.23714
5,150,0.810078,0.035583,0.824075,0.0634696,0.797756,0.0698174,0.189922,3.485
5,200,0.804729,0.036733,0.818463,0.0653143,0.793343,0.06826,0.195271,3.86678
5,250,0.809225,0.0340008,0.829444,0.0655292,0.790531,0.0704338,0.190775,4.1806
5,300,0.809845,0.0329153,0.806363,0.0657303,0.814561,0.0723311,0.190155,4.13861
10,50,0.820388,0.0338087,0.800816,0.0671238,0.842629,0.0742029,0.179612,5.30397
10,100,0.831085,0.0309941,0.802674,0.0521096,0.863315,0.0593756,0.168915,6.09848
10,150,0.832171,0.036432,0.803702,0.0709142,0.865229,0.0681324,0.167829,6.33838
10,200,0.836202,0.0377807,0.799591,0.0723005,0.876155,0.0579356,0.163798,6.93101


# Support vector machine

In [49]:
from sklearn import svm

In [50]:
def SVM (ker, c, gam): 
    
    if gam == 0:
        gam = 'auto'
    
    svm_model =svm.SVC(kernel=ker, C=c, gamma=gam)
    acc = []
    sens = []
    esp = []
    Errores = np.ones(100)
    tiempo_i = time.time()
    
    j=0

    for i in range(100):

        Xtrain,Xtest,Ytrain,Ytest = train_test_split(bagOfWords,y)   #Realiza una única partición de la base de datos

        svm_model.fit(Xtrain,Ytrain)
        Yest = svm_model.predict(Xtest)
        s, e = error_measures(Yest,Ytest)
        sens.append(s); esp.append(e)
        acc.append(svm_model.score(Xtest,Ytest))
        Errores[j] = classification_error(Yest, Ytest)
        j+=1

    return np.mean(acc), np.std(acc), np.mean(sens), np.std(sens), np.mean(esp), np.std(esp),np.mean(Errores), time.time()-tiempo_i

In [51]:
randn = np.random.randn

df_types = pd.DataFrame({
    'Kernel' : pd.Series(['linear','linear','linear','linear','linear','linear','linear','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf', 'rbf']),
    'C' : pd.Series([0.001,0.01,0.1,1,10,100,200,0.001,0.001,0.001,0.001,0.01,0.01,0.01,0.01,0.1,0.1,0.1,0.1,1,1,1,1,10,10,10,10,100,100,100,100,200,200,200,200]),
    'gamma' : pd.Series([0,0,0,0,0,0,0,0.001,0.01,0.1,1,0.001,0.01,0.1,1,0.001,0.01,0.1,1,0.001,0.01,0.1,1,0.001,0.01,0.1,1,0.001,0.01,0.1,1,0.001,0.01,0.1,1])})

df_types["Accuracy"] = ""
df_types["Int_Accuracy"] = ""
df_types["Sensibility"] = ""
df_types["Int_Sensibility"] = ""
df_types["Especificity"] = ""
df_types["Int_Especificity"] = ""
df_types["Error validación"] = ""
df_types["Tiempo ejecución"] = ""
df_types.set_index(['Kernel','C','gamma'], inplace=True)

for i in df_types.index:
    Acc, IntAcc, Sen, IntSen, Esp, IntEsp, error, tiempo=SVM(i[0], i[1], i[2])       
    df_types["Accuracy"][i] = Acc
    df_types["Int_Accuracy"][i] = IntAcc
    df_types["Sensibility"][i] = Sen
    df_types["Int_Sensibility"][i] = IntSen
    df_types["Especificity"][i] = Esp
    df_types["Int_Especificity"][i] = IntEsp
    df_types["Error validación"][i] = error
    df_types["Tiempo ejecución"][i] = tiempo
        
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [52]:
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Accuracy,Int_Accuracy,Sensibility,Int_Sensibility,Especificity,Int_Especificity,Error validación,Tiempo ejecución
Kernel,C,gamma,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
linear,0.001,0.0,0.615426,0.0962109,0.902022,0.224224,0.354837,0.282498,0.384574,10.8988
linear,0.01,0.0,0.803023,0.0400423,0.933274,0.0324979,0.673946,0.0662476,0.196977,7.95344
linear,0.1,0.0,0.871163,0.0287508,0.904292,0.0382995,0.839947,0.0503959,0.128837,7.12892
linear,1.0,0.0,0.881395,0.0266175,0.894198,0.039705,0.870449,0.04176,0.118605,7.24083
linear,10.0,0.0,0.875814,0.0260811,0.889475,0.0413947,0.863173,0.0444681,0.124186,7.23485
linear,100.0,0.0,0.877597,0.0269494,0.891442,0.040678,0.864878,0.0403132,0.122403,7.30681
linear,200.0,0.0,0.88031,0.0254358,0.885681,0.0399255,0.875667,0.0417564,0.11969,7.18888
rbf,0.001,0.001,0.472791,0.0242053,0.62,0.485386,0.38,0.485386,0.527209,10.2561
rbf,0.001,0.01,0.467519,0.0269137,0.62,0.485386,0.38,0.485386,0.532481,10.415
rbf,0.001,0.1,0.47031,0.0244415,0.54,0.498397,0.46,0.498397,0.52969,11.9292


# Red neuronal

In [53]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold

In [54]:
def red_neuronal(nCO, nPC):
    if(nCO==1):
        mlp=MLPClassifier(activation='tanh',max_iter = 500,hidden_layer_sizes=(nPC))
    else:
        mlp=MLPClassifier(activation='tanh',max_iter = 500,hidden_layer_sizes=(nPC,nPC))
        
    acc = []
    sens = []
    esp = []
    Errores = np.ones(100)
    tiempo_i = time.time()
    
    j=0

    for i in range(100):

        Xtrain,Xtest,Ytrain,Ytest = train_test_split(bagOfWords,y)   #Realiza una única partición de la base de datos

        mlp.fit(Xtrain,Ytrain)
        Yest = mlp.predict(Xtest)
        s, e = error_measures(Yest,Ytest)
        sens.append(s); esp.append(e)
        acc.append(mlp.score(Xtest,Ytest))
        Errores[j] = classification_error(Yest, Ytest)
        j+=1

    return np.mean(acc), np.std(acc), np.mean(sens), np.std(sens), np.mean(esp), np.std(esp),np.mean(Errores), time.time()-tiempo_i

In [55]:
randn = np.random.randn

df_types = pd.DataFrame({
    'N. de capas ocultas' : pd.Series([1,1,1,1,1,2,2,2,2,2]),
    'Neuronas por capa' : pd.Series([20,24,28,32,36,20,24,28,32,36])})

df_types["Accuracy"] = ""
df_types["Int_Accuracy"] = ""
df_types["Sensibility"] = ""
df_types["Int_Sensibility"] = ""
df_types["Especificity"] = ""
df_types["Int_Especificity"] = ""
df_types["Error validación"] = ""
df_types["Tiempo ejecución"] = ""
df_types.set_index(['N. de capas ocultas','Neuronas por capa'], inplace=True)

for i in df_types.index:
    Acc, IntAcc, Sen, IntSen, Esp, IntEsp, error, tiempo=red_neuronal(i[0], i[1])       
    print(i[0], i[1])
    df_types["Accuracy"][i] = Acc
    df_types["Int_Accuracy"][i] = IntAcc
    df_types["Sensibility"][i] = Sen
    df_types["Int_Sensibility"][i] = IntSen
    df_types["Especificity"][i] = Esp
    df_types["Int_Especificity"][i] = IntEsp
    df_types["Error validación"][i] = error
    df_types["Tiempo ejecución"][i] = tiempo
        
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

1 20
1 24
1 28
1 32
1 36
2 20
2 24
2 28
2 32
2 36


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [56]:
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Int_Accuracy,Sensibility,Int_Sensibility,Especificity,Int_Especificity,Error validación,Tiempo ejecución
N. de capas ocultas,Neuronas por capa,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,20,0.892326,0.0235357,0.924016,0.0319172,0.861255,0.0442534,0.107674,760.853
1,24,0.894574,0.0246604,0.923712,0.0340945,0.867072,0.0427562,0.105426,772.201
1,28,0.895349,0.027176,0.927015,0.032485,0.864443,0.049903,0.104651,839.394
1,32,0.891473,0.0264476,0.924838,0.0320284,0.858225,0.0477324,0.108527,971.446
1,36,0.894109,0.0243717,0.926542,0.0329527,0.863052,0.0447215,0.105891,1030.9
2,20,0.902403,0.0268332,0.925485,0.0371121,0.879929,0.0476317,0.0975969,501.831
2,24,0.903178,0.0235254,0.925731,0.032058,0.880733,0.0463686,0.0968217,538.825
2,28,0.903023,0.0240059,0.925914,0.0344023,0.880924,0.0410637,0.0969767,573.21
2,32,0.902171,0.0249458,0.915581,0.0350105,0.888567,0.0438096,0.0978295,611.918
2,36,0.908915,0.0256488,0.919194,0.0327905,0.899328,0.0411841,0.0910853,598.158
