# Concatenating the best features

In this notebook are tested models with concatenated best features.

To not repeat, all the functions used for feature extraction are saved in "feature_extraction.py" which is imported. These functions are defined locally and explained more in other notebooks.

Only Catalan dataset.

In [1]:
import numpy as np

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

In [3]:
import get_train_test
import feature_extraction

In [4]:
k = [1,2,3,4]
random_states = [0,1,2,4,5,42,50,60,70,100]

#### BOW frequencies TF-IDF + bigrams TF-IDF

In [7]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True)
    X_train=np.concatenate((X_train1,X_train2),axis=1)
    X_test=np.concatenate((X_test1,X_test2),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [8]:
print("BOW frequencies TF-IDF + bigrams TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + bigrams TF-IDF, sentences
------
SVM accuracy:  0.710191895858439
SVM precision:  0.7112871969296871
SVM recall:  0.7308228730822873
SVM F1:  0.711199209071092
SVM kappa:  0.5095985172532329
------
Multinomial NB accuracy:  0.6528851889478975
Multinomial NB precision:  0.7691434061676741
Multinomial NB recall:  0.6596931659693166
Multinomial NB F1:  0.5862900356871699
Multinomial NB kappa:  0.29836410085705234
------
Logistic regression accuracy:  0.7221703906201549
Logistic regression precision:  0.6944829900120596
Logistic regression recall:  0.7168758716875872
Logistic regression F1:  0.6965340129927217
Logistic regression kappa:  0.48582541146048597


#### BOW frequencies TF-IDF + bigrams POS TF-IDF

In [9]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True,POS=True)
    X_train=np.concatenate((X_train1,X_train2),axis=1)
    X_test=np.concatenate((X_test1,X_test2),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [12]:
print("BOW frequencies TF-IDF + bigrams POS TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + bigrams POS TF-IDF, sentences
------
SVM accuracy:  0.68768428109856
SVM precision:  0.704525756753175
SVM recall:  0.7112970711297071
SVM F1:  0.7044975125648788
SVM kappa:  0.4898989551828431
------
Logistic regression accuracy:  0.7057298982170547
Logistic regression precision:  0.7007298826488837
Logistic regression recall:  0.7099023709902371
Logistic regression F1:  0.7034218975115305
Logistic regression kappa:  0.49084354131672747


#### BOW frequencies TF-IDF + BOW frequences POS TF-IDF

In [16]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True,POS=True)
    X_train=np.concatenate((X_train1,X_train2),axis=1)
    X_test=np.concatenate((X_test1,X_test2),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [17]:
print("BOW frequencies TF-IDF + BOW frequencies POS TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + BOW frequencies POS TF-IDF, sentences
------
SVM accuracy:  0.6898489527526654
SVM precision:  0.6991403218346467
SVM recall:  0.704323570432357
SVM F1:  0.6863726802336388
SVM kappa:  0.4546335197815722
------
Logistic regression accuracy:  0.7123820820764839
Logistic regression precision:  0.6900198343814615
Logistic regression recall:  0.705718270571827
Logistic regression F1:  0.692800998965621
Logistic regression kappa:  0.4736854922315844


#### BOW frequencies TF-IDF + tri-grams TF-IDF

In [18]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=3,tf_idf=True,POS=True)
    X_train=np.concatenate((X_train1,X_train2),axis=1)
    X_test=np.concatenate((X_test1,X_test2),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [20]:
print("BOW frequencies TF-IDF + tri-gram POS TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + tri-gram POS TF-IDF, sentences
------
SVM accuracy:  0.6838952646491971
SVM precision:  0.6991163447956431
SVM recall:  0.698744769874477
SVM F1:  0.6965245627136598
SVM kappa:  0.47528408898405583
------
Logistic regression accuracy:  0.7036074321784824
Logistic regression precision:  0.6980302803348265
Logistic regression recall:  0.7140864714086471
Logistic regression F1:  0.7021160946005418
Logistic regression kappa:  0.490857006280071


#### BOW frequencies TF-IDF + bigram POS TF-IDF + bigram TF-IDF

In [27]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True,POS=True)
    X_train3,y_train,X_test3,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True)
    X_train=np.concatenate((X_train1,X_train2,X_train3),axis=1)
    X_test=np.concatenate((X_test1,X_test2,X_test3),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [28]:
print("BOW frequencies TF-IDF + bigram POS TF-IDF + bigram TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + bigram POS TF-IDF + bigram TF-IDF, sentences
------
SVM accuracy:  0.704768553275253
SVM precision:  0.722252591853742
SVM recall:  0.7336122733612274
SVM F1:  0.7147004388910586
SVM kappa:  0.5122257879533691
------
Logistic regression accuracy:  0.7193751878355448
Logistic regression precision:  0.7091101963201413
Logistic regression recall:  0.7224546722454672
Logistic regression F1:  0.7112832997119448
Logistic regression kappa:  0.5057981254805795


#### BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF

In [5]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True)
    X_train3,y_train,X_test3,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=3,tf_idf=True)
    X_train=np.concatenate((X_train1,X_train2,X_train3),axis=1)
    X_test=np.concatenate((X_test1,X_test2,X_test3),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [6]:
print("BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF, sentences
------
SVM accuracy:  0.7148159636560675
SVM precision:  0.707993166089176
SVM recall:  0.7252440725244073
SVM F1:  0.708158735673631
SVM kappa:  0.5016335948966919
------
Logistic regression accuracy:  0.7260087254870371
Logistic regression precision:  0.7069182938168294
Logistic regression recall:  0.7252440725244073
Logistic regression F1:  0.7014961998126128
Logistic regression kappa:  0.494465023890052


In [7]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.68663594, 0.25806452, 0.05529954],
       [0.06632653, 0.87755102, 0.05612245],
       [0.12962963, 0.62037037, 0.25      ]])

In [8]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.67281106, 0.28110599, 0.04608295],
       [0.07397959, 0.89540816, 0.03061224],
       [0.15740741, 0.62962963, 0.21296296]])

#### BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF + bigrams POS tagged TF-IDF

In [32]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True)
    X_train3,y_train,X_test3,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=3,tf_idf=True)
    X_train4,y_train,X_test4,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True,POS=True)
    X_train=np.concatenate((X_train1,X_train2,X_train3,X_train4),axis=1)
    X_test=np.concatenate((X_test1,X_test2,X_test3,X_test4),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [34]:
print("BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF + bigrams POS tagged TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF + bigrams POS tagd TF-IDF, sentences
------
SVM accuracy:  0.7062724547195708
SVM precision:  0.7226897137519892
SVM recall:  0.7350069735006973
SVM F1:  0.7189669960197933
SVM kappa:  0.5194929333046456
------
Logistic regression accuracy:  0.7249784535680409
Logistic regression precision:  0.708891736637406
Logistic regression recall:  0.7238493723849372
Logistic regression F1:  0.7088758019327636
Logistic regression kappa:  0.5020623763626415


#### BOW frequencies TF-IDF + word2vec TF-IDF

In [5]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_w2v_train_test_features(k=i,language="cat",tf_idf=True)
    X_train=np.concatenate((X_train1,X_train2),axis=1)
    X_test=np.concatenate((X_test1,X_test2),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [6]:
print("BOW frequencies TF-IDF + word2vec TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + word2vec TF-IDF, sentences
------
SVM accuracy:  0.6838439824333049
SVM precision:  0.6940465930618602
SVM recall:  0.7071129707112971
SVM F1:  0.6986186420131161
SVM kappa:  0.48518148745862855
------
Logistic regression accuracy:  0.6987378171558034
Logistic regression precision:  0.6871983353777521
Logistic regression recall:  0.700139470013947
Logistic regression F1:  0.692010952098867
Logistic regression kappa:  0.47406399050172976


#### BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF + bigrams

In [32]:
results_svm = []
results_lr = []
for i in k:
    X_train1,y_train,X_test1,y_test = feature_extraction.get_LP_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    X_train2,y_train,X_test2,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True)
    X_train3,y_train,X_test3,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=3,tf_idf=True)
    X_train4,y_train,X_test4,y_test = feature_extraction.get_LP_ngram_train_test_features(language="cat",k=i,n=2)
    X_train=np.concatenate((X_train1,X_train2,X_train3,X_train4),axis=1)
    X_test=np.concatenate((X_test1,X_test2,X_test3,X_test4),axis=1)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances
train on 2193 instances, test on 729 instances


In [33]:
print("BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF + bigrams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW frequencies TF-IDF + bigram TF-IDF + trigram TF-IDF + bigrams, sentences
------
SVM accuracy:  0.6906925352246045
SVM precision:  0.6855407701613135
SVM recall:  0.701534170153417
SVM F1:  0.6831046998544436
SVM kappa:  0.45695659560010193
------
Logistic regression accuracy:  0.7036250536886096
Logistic regression precision:  0.6859469823462988
Logistic regression recall:  0.700139470013947
Logistic regression F1:  0.6898210762477062
Logistic regression kappa:  0.47150903873646255


### add sentiment words 

In [7]:
with open("TMF/sentiment-lexicons-kaggle/positive_words_es.txt") as f:
    pos_words = f.readlines()
    pos_words = [x.strip() for x in pos_words] 
    
with open("TMF/sentiment-lexicons-kaggle/negative_words_es.txt") as f:
    neg_words = f.readlines()
    neg_words = [x.strip() for x in neg_words] 

In [9]:
sent_words = np.concatenate((pos_words,neg_words),axis=0)

In [20]:
def sentiment_lexicon_features(sentence):
    sentence_words = feature_extraction.get_word_list(sentence)
    bag = np.zeros(len(sent_words))
    for w in sentence_words:
        for i,word in enumerate(sent_words):
            if word == w: 
                bag[i] += 1
    return np.array(bag)

In [29]:
def get_sentiment_lexicon_train_test_features(tf_idf=False,k=1):
    train,test = get_train_test.get_train_test(k,lemmatize=True,language="cat")
    train_corpus = " ".join([i[0] for i in train])
    X_train = []
    y_train = []
    for sentence in train:
        X_train.append(sentiment_lexicon_features(sentence[0]))
        y_train.append(sentence[1])
    X_test = []
    y_test = []
    for sentence in test:
        X_test.append(sentiment_lexicon_features(sentence[0]))
        y_test.append(sentence[1])
    #if tf_idf == True:
    #    transformer = TfidfTransformer(smooth_idf=False)
    #    X_train = transformer.fit_transform(X_train).toarray()
    #    X_test = transformer.transform(X_test).toarray() 
    return X_train,y_train,X_test,y_test

In [30]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_sentiment_lexicon_train_test_features(k=i,tf_idf=True)
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [31]:
print("Sentiment lexicon BOW TF-IDF, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

Sentiment lexicon BOW TF-IDF, sentences
------
SVM accuracy:  0.5363144069384469
SVM precision:  0.450047286375123
SVM recall:  0.5439330543933054
SVM F1:  0.43395080733621316
SVM kappa:  0.035227553287795144
------
Logistic regression accuracy:  0.513891344665668
Logistic regression precision:  0.4503577093727184
Logistic regression recall:  0.5411436541143654
Logistic regression F1:  0.43745930294483926
Logistic regression kappa:  0.03686479777235219
