# BOW and *n*-gram feature extraction and models

In this nodebook are results from different bag-of-words and *n*-gram models and setups while working with the original sentences, comments, and their English translations. Both Catalan and Spanish.

In [1]:
import os, re, nltk, numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from nltk import bigrams, trigrams, ngrams

In [16]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

In [3]:
import get_train_test

Imported file *get_train_test.py*. This file contains 3 functions:

* **get_train_test(k=1,lemmatize=False,POS=False,language="es")**

Reads data from *txt* files. Returns two arrays of tuples (train and test), with sentences and their labels ('pos','neg' or 'neu').

*lemmatize* - if *True* returns word lemmas

*POS* - if *True* reurns words in form "*lemma_POStag*"

* **get_train_test_comments(k=1,language="es")**

Reads data from *MongoDB* (as sentence order in comments is saved there). Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').

* **get_english_train_test(k=1,language="es")**

Reads data from pre-created *txt* files with sentences translated to English. Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').


For all the functions train-test split is 3/4 to 1/4, selection order depending on parameter *k*.

*k* - takes values 1 to 4 - changes the selection of train-test split (used for cross-validation).

*language* - 'es' or 'cat'

### Create feature vectors

In [4]:
def get_word_list(text,remove_stopwords=False,repeat=False):
    if remove_stopwords == True:
        #with open("data/es_stopwords.txt") as f:
        #    es_stopwords = f.readlines()
        #es_stopwords = [x.strip() for x in es_stopwords] 
        with open("data/ca_stopwords.txt") as f:
            ca_stopwords = f.readlines()
        ca_stopwords = [x.strip() for x in ca_stopwords] 
        es_stopwords = set(stopwords.words("spanish"))
        for stopword in es_stopwords:
            text = re.sub(stopword, '', text)
        for stopword in ca_stopwords:
            text = re.sub(stopword, '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    punctuation = ['.',',','%','&','\'','/','+','!']
    rx = '[' + re.escape(''.join(punctuation)) + ']'
    text = re.sub(rx, '', text)
    if repeat:
        tokens = nltk.wordpunct_tokenize(text)
    else:
        tokens = sorted(set(nltk.wordpunct_tokenize(text)))
    remove_from_vocabulary = ['-',')','(','(?)','1','=','[',']','][',':','<','>',';',':)','?']
    for i in remove_from_vocabulary:
        if i in tokens:
            tokens.remove(i)
    return tokens

In [5]:
def tokenize(text):
    tokens = get_word_list(text,remove_stopwords=False,repeat=True)
    return tokens

This function is to create BOW feature vectors. Parameters - sentence to convert and vocabulary.

remove_stopwords - removes spanish stopwords (like, "la", "el", ...) (I downloaded this list from github)

repeat - keeps all the instances of the same word (if False then only keeps distinct words). Use False if BOW vectors with only 1 and 0 are needed (word either appears or not), and True if BOW vectors with word frequencies are needed.

In [6]:
def BOW(sentence, vocabulary,remove_stopwords=False,repeat=False):
    sentence_words = get_word_list(sentence,remove_stopwords,repeat)
    bag = np.zeros(len(vocabulary))
    for w in sentence_words:
        for i,word in enumerate(vocabulary):
            if word == w: 
                bag[i] += 1
    return np.array(bag)

In [7]:
def get_2grams_vocab(tokens):
    bigr = list(bigrams(tokens))
    fdist = nltk.FreqDist(bigr)
    bi_grams = [x[0] for x in list(fdist.items()) if x[1] >= 1]
    return bi_grams

def get_3grams_vocab(tokens):
    trigr = list(trigrams(tokens))
    fdist = nltk.FreqDist(trigr)
    tri_grams = [x[0] for x in list(fdist.items()) if x[1] >= 1]
    return tri_grams

In [8]:
def ngrams_to_features(sentence, ngramlist, n=2):
    tokens = tokenize(sentence)
    if n==3:
        sentence_ngrams = list(trigrams(tokens))
    elif n == 4 or n==5:
        sentence_ngrams = list(ngrams(tokens, n))
    else:
        sentence_ngrams = list(bigrams(tokens))
    bag = np.zeros(len(ngramlist))
    for w in sentence_ngrams:
        for i,word in enumerate(ngramlist):
            if word == w: 
                bag[i] += 1
                
    return np.array(bag)

This function creates BOW feature vectores.

tf_idf

remove_stopwords - removes spanish stopwords (like, "la", "el", ...) (I downloaded this list from github)

repeat - keeps all the instances of the same word (if False then only keeps distinct words). Use False if BOW vectors with only 1 and 0 are needed (word either appears or not), and True if BOW vectors with word frequencies are needed.

In [9]:
def get_BOW_train_test_features(language="es",tf_idf=False,remove_stopwords=False,repeat=False,full_comments=False,english=False,k=1):
    if full_comments:
        train,test = get_train_test.get_train_test_comments(k=k,language=language)
    elif english:
        train,test = get_train_test.get_english_train_test(k=k,language=language)
    else:
        train,test = get_train_test.get_train_test(k=k,language=language)
    train_corpus = " ".join([i[0] for i in train])
    vocabulary = get_word_list(train_corpus)
    X_train = []
    y_train = []
    for sentence in train:
        bow = BOW(sentence[0],vocabulary,remove_stopwords,repeat)
        X_train.append(bow)
        y_train.append(sentence[1])
    X_test = []
    y_test = []
    for sentence in test:
        bow = BOW(sentence[0],vocabulary,remove_stopwords,repeat)
        X_test.append(bow)
        y_test.append(sentence[1])
    if tf_idf == True:
        transformer = TfidfTransformer(smooth_idf=False)
        X_train = transformer.fit_transform(X_train).toarray()
        X_test = transformer.transform(X_test).toarray() 
    return X_train,y_train,X_test,y_test

In [10]:
def get_ngram_train_test_features(language="es",n=2,tf_idf=False,full_comments=False,english=False,k=1):
    if full_comments:
        train,test = get_train_test.get_train_test_comments(k,language=language)
    elif english:
        train,test = get_train_test.get_english_train_test(k,language=language)
    else:
        train,test = get_train_test.get_train_test(k,language=language)
    train_corpus = " ".join([i[0] for i in train])
    tokens = tokenize(train_corpus)
    if n==3:
        ngramlist = get_3grams_vocab(tokens)
    elif n > 3:
        ngramlist = list(ngrams(tokens, n))
    else:
        ngramlist = get_2grams_vocab(tokens)
    X_train = []
    y_train = []
    for sentence in train:
        f = ngrams_to_features(sentence[0],ngramlist,n)
        X_train.append(f)
        y_train.append(sentence[1])
    X_test = []
    y_test = []
    for sentence in test:
        f = ngrams_to_features(sentence[0],ngramlist,n)
        X_test.append(f)
        y_test.append(sentence[1])
    if tf_idf == True:
        transformer = TfidfTransformer()
        X_train = transformer.fit_transform(X_train).toarray()
        X_test = transformer.transform(X_test).toarray() 
    
    return X_train,y_train,X_test,y_test

## Experiments for Catalan

### Train models, sentence level

In [15]:
k = [1,2,3,4]
random_states = [0,1,2,4,5,42,50,60,70,100]

#### BTO features, balanced class weights, sentences

In [200]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [204]:
print("BOW BTO features (balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Multinomial NB precision: ", precision_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB recall: ", recall_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB F1: ", f1_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB kappa: ", cohen_kappa_score(y_test, predicted_nb))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO features (balanced class weights), sentences
------
SVM accuracy:  0.6470072659171813
SVM precision:  0.6989130004629689
SVM recall:  0.6721536351165981
SVM F1:  0.6802387595681454
SVM kappa:  0.4455391665473293
------
Multinomial NB accuracy:  0.6923389798066801
Multinomial NB precision:  0.6735450136192391
Multinomial NB recall:  0.6995884773662552
Multinomial NB F1:  0.6426575719849508
Multinomial NB kappa:  0.41216171434883453
------
Logistic regression accuracy:  0.6803700138300129
Logistic regression precision:  0.702304647031837
Logistic regression recall:  0.700960219478738
Logistic regression F1:  0.7016125740405577
Logistic regression kappa:  0.4923041536485786


In [205]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.59728507, 0.21719457, 0.18552036],
       [0.05778894, 0.7839196 , 0.15829146],
       [0.11818182, 0.46363636, 0.41818182]])

In [206]:
C = confusion_matrix(y_test, predicted_nb,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.60633484, 0.39366516, 0.        ],
       [0.05276382, 0.93969849, 0.00753769],
       [0.07272727, 0.90909091, 0.01818182]])

In [207]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.7239819 , 0.16289593, 0.11312217],
       [0.10552764, 0.77889447, 0.11557789],
       [0.19090909, 0.43636364, 0.37272727]])

#### BTO features, not balanced class weights, sentences

In [213]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [214]:
print("BOW BTO features (not balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO features (not balanced class weights), sentences
------
SVM accuracy:  0.6776321591896401
SVM precision:  0.6878437956000542
SVM recall:  0.6941015089163237
SVM F1:  0.6815270092714738
SVM kappa:  0.44823337745647074
------
Logistic regression accuracy:  0.702950277632347
Logistic regression precision:  0.6906909087256016
Logistic regression recall:  0.7133058984910837
Logistic regression F1:  0.6966524905032667
Logistic regression kappa:  0.48899755501222497


In [215]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.59276018, 0.29864253, 0.10859729],
       [0.05025126, 0.86934673, 0.08040201],
       [0.10909091, 0.62727273, 0.26363636]])

In [216]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.73303167, 0.21719457, 0.04977376],
       [0.10050251, 0.83668342, 0.06281407],
       [0.18181818, 0.59090909, 0.22727273]])

#### BTO features + remove stop-words, balanced class weights, sentences


In [217]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,remove_stopwords=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [218]:
print("BOW BTO features + removed stop-words (balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Multinomial NB precision: ", precision_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB recall: ", recall_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB F1: ", f1_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB kappa: ", cohen_kappa_score(y_test, predicted_nb))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO features + removed stop-words (balanced class weights), sentences
------
SVM accuracy:  0.43097622626426396
SVM precision:  0.5498772242933014
SVM recall:  0.5363511659807956
SVM F1:  0.453590082566995
SVM kappa:  0.11296628291249977
------
Multinomial NB accuracy:  0.548939855394737
Multinomial NB precision:  0.526882335009907
Multinomial NB recall:  0.5500685871056241
Multinomial NB F1:  0.4016668492105644
Multinomial NB kappa:  0.016012147998172743
------
Logistic regression accuracy:  0.5150614844463961
Logistic regression precision:  0.5164098163825087
Logistic regression recall:  0.5198902606310014
Logistic regression F1:  0.5130385459908494
Logistic regression kappa:  0.17836407021340306


In [219]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.05429864, 0.73303167, 0.21266968],
       [0.01256281, 0.88442211, 0.10301508],
       [0.01818182, 0.73636364, 0.24545455]])

In [220]:
C = confusion_matrix(y_test, predicted_nb,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.02714932, 0.97285068, 0.        ],
       [0.00502513, 0.99246231, 0.00251256],
       [0.        , 1.        , 0.        ]])

In [221]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.28054299, 0.47058824, 0.24886878],
       [0.17336683, 0.6959799 , 0.13065327],
       [0.19090909, 0.44545455, 0.36363636]])

#### BTO features + remove stop-words, not balanced class weights, sentences


In [222]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,remove_stopwords=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [223]:
print("BOW BTO features + removed stop-words (not balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO features + removed stop-words (not balanced class weights), sentences
------
SVM accuracy:  0.5149675172968986
SVM precision:  0.5175135792419744
SVM recall:  0.5445816186556928
SVM F1:  0.4099530636016058
SVM kappa:  0.022192056431575402
------
Logistic regression accuracy:  0.5431155810012779
Logistic regression precision:  0.4185298291933165
Logistic regression recall:  0.5349794238683128
Logistic regression F1:  0.42846376440353007
Logistic regression kappa:  0.022869343455310998


In [224]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.04524887, 0.9321267 , 0.02262443],
       [0.00753769, 0.97236181, 0.0201005 ],
       [0.00909091, 0.99090909, 0.        ]])

In [225]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.11312217, 0.88687783, 0.        ],
       [0.08040201, 0.91708543, 0.00251256],
       [0.06363636, 0.93636364, 0.        ]])

#### Term frequency features (balanced class weights), sentences


In [226]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,repeat=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [227]:
print("BOW term frequency features (balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Multinomial NB precision: ", precision_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB recall: ", recall_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB F1: ", f1_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB kappa: ", cohen_kappa_score(y_test, predicted_nb))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW term frequency features (balanced class weights), sentences
------
SVM accuracy:  0.6333852662512034
SVM precision:  0.6768081910785957
SVM recall:  0.663923182441701
SVM F1:  0.6688930285315153
SVM kappa:  0.4299215765131712
------
Multinomial NB accuracy:  0.6923389798066801
Multinomial NB precision:  0.6702752893286386
Multinomial NB recall:  0.6995884773662552
Multinomial NB F1:  0.6432119815183186
Multinomial NB kappa:  0.4148312679370595
------
Logistic regression accuracy:  0.6830966093011996
Logistic regression precision:  0.6911070723769087
Logistic regression recall:  0.6872427983539094
Logistic regression F1:  0.6887806123257557
Logistic regression kappa:  0.4732608881663387


In [228]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.63800905, 0.22624434, 0.13574661],
       [0.08291457, 0.76130653, 0.15577889],
       [0.16363636, 0.47272727, 0.36363636]])

In [229]:
C = confusion_matrix(y_test, predicted_nb,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.6199095 , 0.3800905 , 0.        ],
       [0.06030151, 0.9321608 , 0.00753769],
       [0.08181818, 0.9       , 0.01818182]])

In [230]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.74208145, 0.16289593, 0.09502262],
       [0.11557789, 0.74874372, 0.13567839],
       [0.21818182, 0.42727273, 0.35454545]])

#### Term frequency features (not balanced class weights), sentences

In [231]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,repeat=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [232]:
print("BOW term frequency features (not balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW term frequency features (not balanced class weights), sentences
------
SVM accuracy:  0.6701347253419503
SVM precision:  0.6736103932440763
SVM recall:  0.6872427983539094
SVM F1:  0.6767494300073893
SVM kappa:  0.44457699672183737
------
Logistic regression accuracy:  0.700218052576567
Logistic regression precision:  0.6967306600429606
Logistic regression recall:  0.7187928669410151
Logistic regression F1:  0.7026499977633002
Logistic regression kappa:  0.49985441913233797


In [233]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.64253394, 0.2760181 , 0.08144796],
       [0.0879397 , 0.82663317, 0.08542714],
       [0.15454545, 0.57272727, 0.27272727]])

In [234]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.73755656, 0.2081448 , 0.05429864],
       [0.09798995, 0.84170854, 0.06030151],
       [0.2       , 0.56363636, 0.23636364]])

#### BTO + tf-idf features (balanced class weights), sentences


In [235]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [236]:
print("BOW BTO + tf_idf features (balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Multinomial NB precision: ", precision_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB recall: ", recall_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB F1: ", f1_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB kappa: ", cohen_kappa_score(y_test, predicted_nb))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO + tf_idf features (balanced class weights), sentences
------
SVM accuracy:  0.6772585893386927
SVM precision:  0.7006937572828149
SVM recall:  0.700960219478738
SVM F1:  0.6718919294068729
SVM kappa:  0.43150574675638265
------
Multinomial NB accuracy:  0.6420212835828177
Multinomial NB precision:  0.7614348559152432
Multinomial NB recall:  0.635116598079561
Multinomial NB F1:  0.5516376906221301
Multinomial NB kappa:  0.23788525479284095
------
Logistic regression accuracy:  0.7063721268007633
Logistic regression precision:  0.6999359333874701
Logistic regression recall:  0.7160493827160493
Logistic regression F1:  0.6981368229191002
Logistic regression kappa:  0.4829521508968494


In [237]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.54751131, 0.40723982, 0.04524887],
       [0.02512563, 0.92964824, 0.04522613],
       [0.06363636, 0.75454545, 0.18181818]])

In [238]:
C = confusion_matrix(y_test, predicted_nb,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.29864253, 0.70135747, 0.        ],
       [0.00753769, 0.99246231, 0.        ],
       [0.01818182, 0.96363636, 0.01818182]])

In [239]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.66515837, 0.29411765, 0.04072398],
       [0.06532663, 0.87437186, 0.06030151],
       [0.13636364, 0.61818182, 0.24545455]])

#### BTO + tf-idf features (not balanced class weights), sentences


In [240]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [241]:
print("BOW BTO + tf_idf features (not balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO + tf_idf features (not balanced class weights), sentences
------
SVM accuracy:  0.7068168170704017
SVM precision:  0.7795037297888661
SVM recall:  0.6954732510288066
SVM F1:  0.6440544381272382
SVM kappa:  0.38898390512974435
------
Logistic regression accuracy:  0.7091137344975313
Logistic regression precision:  0.7585030124334202
Logistic regression recall:  0.7242798353909465
Logistic regression F1:  0.6837755838680968
Logistic regression kappa:  0.467726643636324


In [242]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.45701357, 0.54298643, 0.        ],
       [0.01256281, 0.98743719, 0.        ],
       [0.05454545, 0.82727273, 0.11818182]])

In [243]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.6199095 , 0.3800905 , 0.        ],
       [0.04773869, 0.94723618, 0.00502513],
       [0.09090909, 0.78181818, 0.12727273]])

#### Term frequency + tf-idf features (balanced class weights), sentences


In [244]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [245]:
print("BOW term frequency + tf-idf features (balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Multinomial NB precision: ", precision_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB recall: ", recall_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB F1: ", f1_score(y_test, predicted_nb, average='weighted'))
print("Multinomial NB kappa: ", cohen_kappa_score(y_test, predicted_nb))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW term frequency + tf-idf features (balanced class weights), sentences
------
SVM accuracy:  0.6734281261552377
SVM precision:  0.6780474066566856
SVM recall:  0.6858710562414266
SVM F1:  0.6625377420511425
SVM kappa:  0.4148272779851727
------
Multinomial NB accuracy:  0.6399692999986865
Multinomial NB precision:  0.7614348559152432
Multinomial NB recall:  0.635116598079561
Multinomial NB F1:  0.5516376906221301
Multinomial NB kappa:  0.23788525479284095
------
Logistic regression accuracy:  0.7070589361211036
Logistic regression precision:  0.7004895588321132
Logistic regression recall:  0.720164609053498
Logistic regression F1:  0.7030367435862461
Logistic regression kappa:  0.49469950256870265


In [246]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.53846154, 0.3800905 , 0.08144796],
       [0.04522613, 0.89949749, 0.05527638],
       [0.08181818, 0.70909091, 0.20909091]])

In [247]:
C = confusion_matrix(y_test, predicted_nb,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.29864253, 0.70135747, 0.        ],
       [0.00753769, 0.99246231, 0.        ],
       [0.01818182, 0.96363636, 0.01818182]])

In [248]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.69230769, 0.25791855, 0.04977376],
       [0.06532663, 0.86934673, 0.06532663],
       [0.16363636, 0.6       , 0.23636364]])

#### Term frequency + tf-idf features (not balanced class weights), sentences


In [251]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="cat",k=i,repeat=True,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [252]:
print("BOW term frequency + tf-idf features (not balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW term frequency + tf-idf features (not balanced class weights), sentences
------
SVM accuracy:  0.7029135445928778
SVM precision:  0.7704666782231113
SVM recall:  0.6872427983539094
SVM F1:  0.6341748322252306
SVM kappa:  0.37120936989286357
------
Logistic regression accuracy:  0.7073971803287302
Logistic regression precision:  0.7411210027120291
Logistic regression recall:  0.7146776406035665
Logistic regression F1:  0.6742361921387212
Logistic regression kappa:  0.4497654013215907


In [253]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.43438914, 0.56561086, 0.        ],
       [0.01507538, 0.98492462, 0.        ],
       [0.06363636, 0.81818182, 0.11818182]])

In [254]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.60180995, 0.39366516, 0.00452489],
       [0.05527638, 0.93969849, 0.00502513],
       [0.1       , 0.77272727, 0.12727273]])

#### bigrams, sentences

In [15]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=2)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [16]:
print("bigrams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

bigrams, sentences
------
SVM accuracy:  0.6246482917025553
SVM precision:  0.6690478941419349
SVM recall:  0.6310013717421125
SVM F1:  0.6449321594603259
SVM kappa:  0.3943798988270609
------
Logistic regression accuracy:  0.6451152094486948
Logistic regression precision:  0.6612941032655779
Logistic regression recall:  0.6584362139917695
Logistic regression F1:  0.6598142649168305
Logistic regression kappa:  0.42137877614252517


#### bigrams + TF-IDF, sentences

In [17]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [18]:
print("bigrams + tf-idf, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

bigrams + tf-idf, sentences
------
SVM accuracy:  0.5969569280482794
SVM precision:  0.6479739255215056
SVM recall:  0.6310013717421125
SVM F1:  0.5550990998218441
SVM kappa:  0.24162934775042344
------
Logistic regression accuracy:  0.6663368668359295
Logistic regression precision:  0.6705187770955745
Logistic regression recall:  0.6844993141289437
Logistic regression F1:  0.6295228774025692
Logistic regression kappa:  0.3833916218929623


#### trigrams, sentences

In [19]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=3)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [20]:
print("trigrams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

trigrams, sentences
------
SVM accuracy:  0.5563049940044924
SVM precision:  0.5630427463598949
SVM recall:  0.5665294924554184
SVM F1:  0.5308880665299017
SVM kappa:  0.18347989749369975
------
Logistic regression accuracy:  0.5937799658096562
Logistic regression precision:  0.5559731822876984
Logistic regression recall:  0.6063100137174211
Logistic regression F1:  0.5533165190048753
Logistic regression kappa:  0.23178348369188062


#### trigrams + TF-IDF, sentences

In [21]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=3,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [22]:
print("trigrams + tf-idf, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

trigrams + tf-idf, sentences
------
SVM accuracy:  0.4754867245012657
SVM precision:  0.5699352785122451
SVM recall:  0.5829903978052127
SVM F1:  0.4791438769582689
SVM kappa:  0.11968062920812716
------
Logistic regression accuracy:  0.590015181113119
Logistic regression precision:  0.5624802654788682
Logistic regression recall:  0.6021947873799726
Logistic regression F1:  0.5170534036286406
Logistic regression kappa:  0.17855644146034422


#### 4-grams, sentences

In [23]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=4)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [24]:
print("4-grams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

4-grams, sentences
------
SVM accuracy:  0.516495386555426
SVM precision:  0.5492522787116231
SVM recall:  0.5610425240054869
SVM F1:  0.46395029135102156
SVM kappa:  0.09134463699358486
------
Logistic regression accuracy:  0.5681048378773463
Logistic regression precision:  0.5265652906256473
Logistic regression recall:  0.5692729766803841
Logistic regression F1:  0.47960519945618607
Logistic regression kappa:  0.10958888123884092


#### 4-grams + TF-IDF, sentences

In [25]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=4,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [26]:
print("4-grams + tf-idf, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

4-grams + tf-idf, sentences
------
SVM accuracy:  0.4123323087489374
SVM precision:  0.6362937707966947
SVM recall:  0.5679012345679012
SVM F1:  0.4413778505642243
SVM kappa:  0.06909006883467506
------
Logistic regression accuracy:  0.559550214956305
Logistic regression precision:  0.5315398352742676
Logistic regression recall:  0.5624142661179699
Logistic regression F1:  0.44945088320409204
Logistic regression kappa:  0.06936790923824976


### Train models, comment level

#### BTO features, comments

In [28]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,language="cat",full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [29]:
print("BOW BTO features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO features, comments
SVM accuracy:  0.6227525125628142
Logistic regression accuracy:  0.6614698492462311


In [30]:
C = confusion_matrix(y_test, predicted_svm,['POSITIVE', 'NEGATIVE','NEUTRAL'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.68382353, 0.16176471, 0.15441176],
       [0.16666667, 0.703125  , 0.13020833],
       [0.31428571, 0.28571429, 0.4       ]])

In [31]:
C = confusion_matrix(y_test, predicted_lr,['POSITIVE', 'NEGATIVE','NEUTRAL'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.72794118, 0.125     , 0.14705882],
       [0.16145833, 0.71354167, 0.125     ],
       [0.25714286, 0.27142857, 0.47142857]])

#### Term frequency features, comments


In [35]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,language="cat",repeat=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [36]:
print("BOW term frequency features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency features, comments
SVM accuracy:  0.6067126256281408
Logistic regression accuracy:  0.6439667085427135


#### BTO + tf-idf features, comments

In [37]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,language="cat",tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [38]:
print("BOW BTO + tf_idf features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO + tf_idf features, comments
SVM accuracy:  0.6536774497487438
Logistic regression accuracy:  0.6821419597989948


#### Term frequency + tf-idf features, comments

In [39]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,language="cat",repeat=True,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [40]:
print("BOW term frequency + tf-idf features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency + tf-idf features, comments
SVM accuracy:  0.651169283919598
Logistic regression accuracy:  0.671485552763819


#### bigrams, comments

In [41]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=2,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [42]:
print("bigram features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigram features, comments
SVM accuracy:  0.608953203517588
Logistic regression accuracy:  0.6132600502512563


#### bigrams TF-IDF, comments

In [43]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [44]:
print("bigram + tf-idf features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigram + tf-idf features, comments
SVM accuracy:  0.6088872487437185
Logistic regression accuracy:  0.6420760050251257


### Train models, English translations

#### BTO features, English

In [12]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,english=True,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2182 instances, test on 729 instances
train on 2184 instances, test on 727 instances
train on 2183 instances, test on 728 instances
train on 2184 instances, test on 727 instances


In [13]:
print("BOW BTO features, English")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO features, English
SVM accuracy:  0.6441682370659643
Logistic regression accuracy:  0.6794815033451398


#### Term frequency features, English

In [15]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,repeat=True,english=True,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2182 instances, test on 729 instances
train on 2184 instances, test on 727 instances
train on 2183 instances, test on 728 instances
train on 2184 instances, test on 727 instances


In [16]:
print("BOW term frequency features, English")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency features, English
SVM accuracy:  0.6425967971422517
Logistic regression accuracy:  0.6805117231253595


#### BTO + tf-idf features, English

In [18]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,tf_idf=True,english=True,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2182 instances, test on 729 instances
train on 2184 instances, test on 727 instances
train on 2183 instances, test on 728 instances
train on 2184 instances, test on 727 instances


In [19]:
print("BOW BTO + tf_idf features, English")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO + tf_idf features, English
SVM accuracy:  0.6917590931795476
Logistic regression accuracy:  0.7052701086791997


#### Term frequency + tf-idf features, English

In [21]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(k=i,repeat=True,tf_idf=True,english=True,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2182 instances, test on 729 instances
train on 2184 instances, test on 727 instances
train on 2183 instances, test on 728 instances
train on 2184 instances, test on 727 instances


In [22]:
print("BOW term frequency + tf-idf features, English")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency + tf-idf features, English
SVM accuracy:  0.692852885750613
Logistic regression accuracy:  0.7045795113976933


#### bigrams, English

In [23]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",english=True,k=i,n=2,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [24]:
print("bigrams, English")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigrams, English
SVM accuracy:  0.608953203517588
Logistic regression accuracy:  0.6132600502512563


#### bigrams + TF-IDF, English

In [25]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="cat",english=True,k=i,n=2,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1198 instances, test on 400 instances
train on 1200 instances, test on 398 instances


In [26]:
print("bigrams + tf-idf, English")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigrams + tf-idf, English
SVM accuracy:  0.6088872487437185
Logistic regression accuracy:  0.6420760050251257


## Experiments for Spanish

### Train models, sentence level

In [11]:
k = [1,2,3,4]
random_states = [0,1,2,4,5,42,50,60,70,100]

#### BTO features, sentences

In [17]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [18]:
print("BOW BTO features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO features, sentences
SVM accuracy:  0.6819768632268632
Multinomial NB accuracy:  0.7428337428337428
Logistic regression accuracy:  0.7301733551733551


In [19]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.23809524, 0.61904762, 0.14285714],
       [0.1       , 0.81818182, 0.08181818],
       [0.05882353, 0.58823529, 0.35294118]])

In [20]:
C = confusion_matrix(y_test, predicted_nb,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.02941176, 0.97058824, 0.        ]])

In [21]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.07142857, 0.88095238, 0.04761905],
       [0.03636364, 0.95454545, 0.00909091],
       [0.05882353, 0.91176471, 0.02941176]])

#### BTO features + remove stop-words, sentences

In [23]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,remove_stopwords=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [24]:
print("BOW BTO features, removed stopwords, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO features, removed stopwords, sentences
SVM accuracy:  0.7227537196287197
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.7394638957138957


#### Term frequency features, sentences


In [25]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,repeat=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [26]:
print("BOW term frequency features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency features, sentences
SVM accuracy:  0.6872002684502685
Multinomial NB accuracy:  0.7428337428337428
Logistic regression accuracy:  0.7335517335517335


#### BTO + tf-idf features, sentences


In [27]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [28]:
print("BOW BTO + tf_idf features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO + tf_idf features, sentences
SVM accuracy:  0.7371721152971152
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.7453589953589954


#### Term frequency + tf-idf features, sentences

In [29]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,repeat=True,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [30]:
print("BOW term frequency + tf-idf features, sentenes")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency + tf-idf features, sentenes
SVM accuracy:  0.7393700518700519
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.7470424970424969


#### bigrams, sentences

In [33]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=2)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [34]:
print("bigram features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigram features, sentences
SVM accuracy:  0.6796267858767859
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.7259958822458823


#### bigrams + TF-IDF, sentences

In [35]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=2,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [36]:
print("bigram + tf_idf features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigram + tf_idf features, sentences
SVM accuracy:  0.6870043338793338
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.7470510283010283


#### trigrams, sentences

In [37]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=3)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [38]:
print("trigram features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

trigram features, sentences
SVM accuracy:  0.7160316566566567
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.7470510283010283


#### trigrams + TF-IDF, sentences

In [39]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=3,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances
train on 889 instances, test on 297 instances
train on 890 instances, test on 296 instances


In [40]:
print("trigram + tf_idf features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

trigram + tf_idf features, sentences
SVM accuracy:  0.6787196287196287
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.7478899353899354


### Spanish, comment level

#### BTO features, comments

In [41]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [42]:
print("BTO features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BTO features, comments
SVM accuracy:  0.6503940003551897
Multinomial NB accuracy:  0.7179399147544597
Logistic regression accuracy:  0.6813247506016792


#### Term frequency features, comments


In [43]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,repeat=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [44]:
print("BOW term frequency features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency features, comments
SVM accuracy:  0.6384657104101217
Multinomial NB accuracy:  0.7195122417984849
Logistic regression accuracy:  0.6941655490437435


#### BTO + TF_IDF features, comments

In [45]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [46]:
print("BOW BTO + TF-IDF features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO + TF-IDF features, comments
SVM accuracy:  0.6910597438347019
Multinomial NB accuracy:  0.7147550721708831
Logistic regression accuracy:  0.7163373506518956


#### BOW term frequency features + TF-IDF, comments

In [47]:
results_svm = []
results_nb = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,repeat=True,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_nb = MultinomialNB().fit(X_train, y_train)
    predicted_nb = clf_nb.predict(X_test)
    results_nb.append(np.mean(predicted_nb == y_test))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [48]:
print("BOW term frequency + TF_IDF features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW term frequency + TF_IDF features, comments
SVM accuracy:  0.6944069479402055
Multinomial NB accuracy:  0.7147550721708831
Logistic regression accuracy:  0.7179196291329083


#### bigrams, comments

In [31]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=2,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [32]:
print("bigram features, full comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigram features, full comments
SVM accuracy:  0.5889018129987201
Multinomial NB accuracy:  0.7436754936754937
Logistic regression accuracy:  0.6655517229764901


#### bigrams + TF-IDF, comments

In [49]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=2,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [50]:
print("bigram + tf-idf features, full comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigram + tf-idf features, full comments
SVM accuracy:  0.6731881877973949
Multinomial NB accuracy:  0.7147550721708831
Logistic regression accuracy:  0.7147550721708831


#### trigrams, comments

In [51]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=3,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [52]:
print("trigram features, full comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

trigram features, full comments
SVM accuracy:  0.5379867018182041
Multinomial NB accuracy:  0.7147550721708831
Logistic regression accuracy:  0.6988710477485256


#### trigrams + TF-IDF, comments

In [53]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=3,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances
train on 473 instances, test on 158 instances
train on 475 instances, test on 156 instances


In [54]:
print("trigram + tf-idf features, full comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

trigram + tf-idf features, full comments
SVM accuracy:  0.6726462937786678
Multinomial NB accuracy:  0.7147550721708831
Logistic regression accuracy:  0.7115702295873063


### English translations, Spanish

#### BOW BTO features, English

In [55]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_BOW_train_test_features(language="es",k=i,english=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 885 instances, test on 296 instances
train on 886 instances, test on 295 instances
train on 885 instances, test on 296 instances
train on 887 instances, test on 294 instances


In [56]:
print("BOW BTO features, English translations")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

BOW BTO features, English translations
SVM accuracy:  0.6103026011137391
Multinomial NB accuracy:  0.7147550721708831
Logistic regression accuracy:  0.6425679570958022


#### bigrams + TF-IDF, English


In [57]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_ngram_train_test_features(language="es",k=i,n=1,tf_idf=True,english=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 885 instances, test on 296 instances
train on 886 instances, test on 295 instances
train on 885 instances, test on 296 instances
train on 887 instances, test on 294 instances


In [58]:
print("bigram + tf-idf features, English")
print("SVM accuracy: ", np.mean(results_svm))
print("Multinomial NB accuracy: ", np.mean(results_nb))
print("Logistic regression accuracy: ", np.mean(results_lr))

bigram + tf-idf features, English
SVM accuracy:  0.701798389534467
Multinomial NB accuracy:  0.7147550721708831
Logistic regression accuracy:  0.7332438781742656
