# BOW feature extraction and models, lemmatized texts, POS tags

In this nodebook are results from different bag-of-words models and setups while working with the lemmatized sentences and POS tags, only Catalan language.

In [1]:
import os, re, nltk, numpy as np
from nltk import bigrams, trigrams, ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

In [3]:
import get_train_test

Imported file *get_train_test.py*. This file contains 3 functions:

* **get_train_test(k=1,lemmatize=False,POS=False,language="es")**

Reads data from *txt* files. Returns two arrays of tuples (train and test), with sentences and their labels ('pos','neg' or 'neu').

*lemmatize* - if *True* returns word lemmas

*POS* - if *True* reurns words in form "*lemma_POStag*"

* **get_train_test_comments(k=1,language="es")**

Reads data from *MongoDB* (as sentence order in comments is saved there). Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').

* **get_english_train_test(k=1,language="es")**

Reads data from pre-created *txt* files with sentences translated to English. Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').


For all the functions train-test split is 3/4 to 1/4, selection order depending on parameter *k*.

*k* - takes values 1 to 4 - changes the selection of train-test split (used for cross-validation).

*language* - 'es' or 'cat'

### Create feature vectors

This function returns a list of words, with removed punctuation and numbers.

*repeat* - returns all words, else only retuns the unique words

In [4]:
def get_word_list(text,remove_stopwords=False,repeat=False):
    if remove_stopwords == True:
        #with open("/data/es_stopwords.txt") as f:
        #    es_stopwords = f.readlines()
        #es_stopwords = [x.strip() for x in es_stopwords] 
        with open("/data/ca_stopwords.txt") as f:
            ca_stopwords = f.readlines()
        ca_stopwords = [x.strip() for x in ca_stopwords] 
        es_stopwords = set(stopwords.words("spanish"))
        for stopword in es_stopwords:
            text = re.sub(stopword, '', text)
        for stopword in ca_stopwords:
            text = re.sub(stopword, '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    punctuation = ['.',',','%','&','\'','/','+','!']
    rx = '[' + re.escape(''.join(punctuation)) + ']'
    text = re.sub(rx, '', text)
    if repeat:
        tokens = nltk.wordpunct_tokenize(text)
    else:
        tokens = sorted(set(nltk.wordpunct_tokenize(text)))
    remove_from_vocabulary = ['-',')','(','(?)','1','=','[',']','][',':','<','>',';',':)','?']
    for i in remove_from_vocabulary:
        if i in tokens:
            tokens.remove(i)
    return tokens

This function is to create BOW feature vectors. Parameters - sentence to convert and vocabulary.

remove_stopwords - removes spanish stopwords (like, "la", "el", ...) (I downloaded this list from github)

repeat - keeps all the instances of the same word (if False then only keeps distinct words). Use False if BOW vectors with only 1 and 0 are needed (word either appears or not), and True if BOW vectors with word frequencies are needed.

In [5]:
def BOW(sentence, vocabulary,remove_stopwords=False,repeat=False):
    sentence_words = get_word_list(sentence,remove_stopwords,repeat)
    bag = np.zeros(len(vocabulary))
    for w in sentence_words:
        for i,word in enumerate(vocabulary):
            if word == w: 
                bag[i] += 1
    return np.array(bag)

This function creates feature vectors.

tf_idf

In [6]:
# LP - lemmatized or POS tagges
def get_LP_BOW_train_test_features(language="es",POS=False,tf_idf=False,remove_stopwords=False,repeat=False,k=1):
    if POS==True:
        train,test = get_train_test.get_train_test(k,POS=True,language=language)
    else:
        train,test = get_train_test.get_train_test(k,lemmatize=True,language=language)
    train_corpus = " ".join([i[0] for i in train])
    vocabulary = get_word_list(train_corpus)
    X_train = []
    y_train = []
    for sentence in train:
        bow = BOW(sentence[0],vocabulary,remove_stopwords,repeat)
        X_train.append(bow)
        y_train.append(sentence[1])
    X_test = []
    y_test = []
    for sentence in test:
        bow = BOW(sentence[0],vocabulary,remove_stopwords,repeat)
        X_test.append(bow)
        y_test.append(sentence[1])
    if tf_idf == True:
        transformer = TfidfTransformer(smooth_idf=False)
        X_train = transformer.fit_transform(X_train).toarray()
        X_test = transformer.transform(X_test).toarray() 
    return X_train,y_train,X_test,y_test

## Experiments for Catalan (lemmatized texts)

### Train models, sentence level

In [7]:
k = [1,2,3,4]
random_states = [0,1,2,4,5,42,50,60,70,100]

#### BTO features, sentences

In [79]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [80]:
print("BOW BTO features (balanced class weights), sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO features (balanced class weights), sentences
------
SVM accuracy:  0.6503919018502462
SVM precision:  0.6719726103640418
SVM recall:  0.6736401673640168
SVM F1:  0.6720964742239768
SVM kappa:  0.44725137875822807
------
Logistic regression accuracy:  0.6889259364472542
Logistic regression precision:  0.6931593643519732
Logistic regression recall:  0.694560669456067
Logistic regression F1:  0.6937691149164187
Logistic regression kappa:  0.476684196844568


In [81]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.74654378, 0.1797235 , 0.07373272],
       [0.12755102, 0.7372449 , 0.13520408],
       [0.25      , 0.4537037 , 0.2962963 ]])

In [82]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.71889401, 0.17511521, 0.10599078],
       [0.0994898 , 0.78061224, 0.11989796],
       [0.14814815, 0.51851852, 0.33333333]])

#### BTO features + remove stop-words, sentences


In [89]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,remove_stopwords=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [90]:
print("BOW BTO features + removed stop-words, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW BTO features + removed stop-words, sentences
------
SVM accuracy:  0.4682421652696003
SVM precision:  0.4703038397363161
SVM recall:  0.5216178521617852
SVM F1:  0.46192110849664725
SVM kappa:  0.09949689495576775
------
Logistic regression accuracy:  0.5265918402696115
Logistic regression precision:  0.5343856906381905
Logistic regression recall:  0.5369595536959554
Logistic regression F1:  0.5247407506492933
Logistic regression kappa:  0.1884107382733391


In [94]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.0921659 , 0.75576037, 0.15207373],
       [0.08418367, 0.81632653, 0.0994898 ],
       [0.03703704, 0.64814815, 0.31481481]])

In [95]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.29493088, 0.52534562, 0.1797235 ],
       [0.14285714, 0.71683673, 0.14030612],
       [0.06481481, 0.56481481, 0.37037037]])

#### BTO + TF-IDF features, sentences


In [83]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [88]:
print("BOW + tf-idf features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW + tf-idf features, sentences
------
SVM accuracy:  0.6973512004005623
SVM precision:  0.6908587847840124
SVM recall:  0.7126917712691772
SVM F1:  0.6929284936806823
SVM kappa:  0.4767257718810337
------
Logistic regression accuracy:  0.7113223202598765
Logistic regression precision:  0.6870159480928991
Logistic regression recall:  0.7071129707112971
Logistic regression F1:  0.6926048580325866
Logistic regression kappa:  0.47649858668177913


In [91]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.0921659 , 0.75576037, 0.15207373],
       [0.08418367, 0.81632653, 0.0994898 ],
       [0.03703704, 0.64814815, 0.31481481]])

In [86]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.69585253, 0.23502304, 0.06912442],
       [0.08163265, 0.84438776, 0.07397959],
       [0.15740741, 0.61111111, 0.23148148]])

#### Term frequency features, sentences


In [96]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,repeat=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [97]:
print("BOW term frequency features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW term frequency features, sentences
------
SVM accuracy:  0.6421400040090788
SVM precision:  0.6877062812916511
SVM recall:  0.6820083682008368
SVM F1:  0.6846909038701812
SVM kappa:  0.4609857989297267
------
Logistic regression accuracy:  0.6850659635840506
Logistic regression precision:  0.6932451885562246
Logistic regression recall:  0.6861924686192469
Logistic regression F1:  0.6894942911121518
Logistic regression kappa:  0.4705969218652578


In [98]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.70506912, 0.19354839, 0.10138249],
       [0.09693878, 0.75765306, 0.14540816],
       [0.17592593, 0.46296296, 0.36111111]])

In [99]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.7235023 , 0.17050691, 0.10599078],
       [0.10714286, 0.75      , 0.14285714],
       [0.14814815, 0.47222222, 0.37962963]])

#### Term frequency + TF_IDF features, sentences


In [100]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,repeat=True, tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [101]:
print("BOW term frequency + tf-idf features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

BOW term frequency + tf-idf features, sentences
------
SVM accuracy:  0.6982999444399793
SVM precision:  0.6870776709926885
SVM recall:  0.705718270571827
SVM F1:  0.6902652565861838
SVM kappa:  0.47013704771277764
------
Logistic regression accuracy:  0.7116827813771831
Logistic regression precision:  0.6868153824455833
Logistic regression recall:  0.705718270571827
Logistic regression F1:  0.6923557190599705
Logistic regression kappa:  0.47555923777961895


In [102]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.65437788, 0.25806452, 0.0875576 ],
       [0.07653061, 0.8622449 , 0.06122449],
       [0.14814815, 0.61111111, 0.24074074]])

In [103]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.69124424, 0.22580645, 0.08294931],
       [0.08673469, 0.84183673, 0.07142857],
       [0.14814815, 0.61111111, 0.24074074]])

### POS tags

#### BTO features, sentences

In [16]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,POS=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [17]:
print("POS tags BOW BTO features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags BOW BTO features, sentences
------
SVM accuracy:  0.49928289507337226
SVM precision:  0.5103135154848756
SVM recall:  0.5523012552301255
SVM F1:  0.4885927486416796
SVM kappa:  0.14152875222397532
------
Logistic regression accuracy:  0.5482662936108198
Logistic regression precision:  0.5291048837350969
Logistic regression recall:  0.5481171548117155
Logistic regression F1:  0.5298447693676717
Logistic regression kappa:  0.19890201974571275


In [18]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.11059908, 0.75576037, 0.13364055],
       [0.07142857, 0.8622449 , 0.06632653],
       [0.03703704, 0.64814815, 0.31481481]])

In [19]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.26728111, 0.5483871 , 0.1843318 ],
       [0.16071429, 0.74744898, 0.09183673],
       [0.12037037, 0.49074074, 0.38888889]])

#### Term frequency features, sentences

In [20]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,POS=True,repeat=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [21]:
print("POS tags BOW term frequency features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags BOW term frequency features, sentences
------
SVM accuracy:  0.46550760134415664
SVM precision:  0.5301752261114877
SVM recall:  0.4672245467224547
SVM F1:  0.45191150859277746
SVM kappa:  0.1568730664450293
------
Logistic regression accuracy:  0.5458335354951
Logistic regression precision:  0.5541091414874991
Logistic regression recall:  0.5550906555090656
Logistic regression F1:  0.5525088019558462
Logistic regression kappa:  0.24349077197856717


In [22]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.0921659 , 0.39170507, 0.51612903],
       [0.05102041, 0.64285714, 0.30612245],
       [0.05555556, 0.36111111, 0.58333333]])

In [23]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.359447  , 0.43778802, 0.20276498],
       [0.19642857, 0.69387755, 0.10969388],
       [0.2037037 , 0.35185185, 0.44444444]])

#### BTO + TF-IDF features, sentences

In [24]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,POS=True,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [25]:
print("POS tags BOW BTO + tf-idf features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags BOW BTO + tf-idf features, sentences
------
SVM accuracy:  0.5175885599197395
SVM precision:  0.5396906244694359
SVM recall:  0.5634588563458857
SVM F1:  0.4706302594563601
SVM kappa:  0.13301423207083574
------
Logistic regression accuracy:  0.5643906091260575
Logistic regression precision:  0.5364311763341733
Logistic regression recall:  0.5606694560669456
Logistic regression F1:  0.5322772070273003
Logistic regression kappa:  0.19924340192588608


In [26]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.05069124, 0.83870968, 0.11059908],
       [0.02295918, 0.92091837, 0.05612245],
       [0.00925926, 0.69444444, 0.2962963 ]])

In [27]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.23963134, 0.62211982, 0.13824885],
       [0.13010204, 0.78826531, 0.08163265],
       [0.09259259, 0.52777778, 0.37962963]])

#### Term frequency + TF_IDF features, sentences

In [28]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_BOW_train_test_features(language="cat",k=i,POS=True,tf_idf=True,repeat=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [29]:
print("POS tags BOW term frequency + tf-idf features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags BOW term frequency + tf-idf features, sentences
------
SVM accuracy:  0.5277511570340341
SVM precision:  0.5328267447263357
SVM recall:  0.5578800557880056
SVM F1:  0.4677253110438668
SVM kappa:  0.12694650800113705
------
Logistic regression accuracy:  0.5738105881901232
Logistic regression precision:  0.549465879035555
Logistic regression recall:  0.5732217573221757
Logistic regression F1:  0.5400341554671492
Logistic regression kappa:  0.2099401158791947


In [30]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.05529954, 0.81105991, 0.13364055],
       [0.02806122, 0.91326531, 0.05867347],
       [0.        , 0.72222222, 0.27777778]])

In [31]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.24423963, 0.63133641, 0.12442396],
       [0.10459184, 0.82142857, 0.07397959],
       [0.11111111, 0.55555556, 0.33333333]])