# *n*-gram feature extraction and models, lemmatized texts, POS tags

In this nodebook are results from different *n*-gram models and setups while working with the lemmatized sentences and POS tags, only Catalan language.

In [1]:
import os, re, nltk, numpy as np
from nltk import bigrams, trigrams, ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

In [3]:
import get_train_test

Imported file *get_train_test.py*. This file contains 3 functions:

* **get_train_test(k=1,lemmatize=False,POS=False,language="es")**

Reads data from *txt* files. Returns two arrays of tuples (train and test), with sentences and their labels ('pos','neg' or 'neu').

*lemmatize* - if *True* returns word lemmas

*POS* - if *True* reurns words in form "*lemma_POStag*"

* **get_train_test_comments(k=1,language="es")**

Reads data from *MongoDB* (as sentence order in comments is saved there). Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').

* **get_english_train_test(k=1,language="es")**

Reads data from pre-created *txt* files with sentences translated to English. Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').


For all the functions train-test split is 3/4 to 1/4, selection order depending on parameter *k*.

*k* - takes values 1 to 4 - changes the selection of train-test split (used for cross-validation).

*language* - 'es' or 'cat'

### Create feature vectors

In [4]:
def get_word_list(text,remove_stopwords=False,repeat=False):
    if remove_stopwords == True:
        #with open("/data/es_stopwords.txt") as f:
        #    es_stopwords = f.readlines()
        #es_stopwords = [x.strip() for x in es_stopwords] 
        with open("/data/ca_stopwords.txt") as f:
            ca_stopwords = f.readlines()
        ca_stopwords = [x.strip() for x in ca_stopwords] 
        es_stopwords = set(stopwords.words("spanish"))
        for stopword in es_stopwords:
            text = re.sub(stopword, '', text)
        for stopword in ca_stopwords:
            text = re.sub(stopword, '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    punctuation = ['.',',','%','&','\'','/','+','!']
    rx = '[' + re.escape(''.join(punctuation)) + ']'
    text = re.sub(rx, '', text)
    if repeat:
        tokens = nltk.wordpunct_tokenize(text)
    else:
        tokens = sorted(set(nltk.wordpunct_tokenize(text)))
    remove_from_vocabulary = ['-',')','(','(?)','1','=','[',']','][',':','<','>',';',':)','?']
    for i in remove_from_vocabulary:
        if i in tokens:
            tokens.remove(i)
    return tokens

In [5]:
def tokenize(text):
    tokens = get_word_list(text,remove_stopwords=False,repeat=True)
    return tokens

These two functions are used to get vocabulary from bigram and trigram tokens. Can be changed to select bigrams or trigrams that appear more times than a given threshold.

In [6]:
def get_2grams_vocab(tokens):
    bigr = list(bigrams(tokens))
    fdist = nltk.FreqDist(bigr)
    bi_grams = [x[0] for x in list(fdist.items()) if x[1] >= 1]
    return bi_grams

def get_3grams_vocab(tokens):
    trigr = list(trigrams(tokens))
    fdist = nltk.FreqDist(trigr)
    tri_grams = [x[0] for x in list(fdist.items()) if x[1] >= 1]
    return tri_grams

This function creates feature vectors from ngrams.

Ngramlist is a set of all ngrams (vocabulary).

n - parameter for n-grams (2grams and 3grams use special functions above, larger n ngrams use nltk function ngrams)


In [7]:
def ngrams_to_features(sentence, ngramlist, n=2):
    tokens = tokenize(sentence)
    if n==3:
        sentence_ngrams = list(trigrams(tokens))
    elif n == 4 or n==5:
        sentence_ngrams = list(ngrams(tokens, n))
    else:
        sentence_ngrams = list(bigrams(tokens))
    bag = np.zeros(len(ngramlist))
    for w in sentence_ngrams:
        for i,word in enumerate(ngramlist):
            if word == w: 
                bag[i] += 1
                
    return np.array(bag)

This function creates feature vectors from ngrams.

n - parameter n for ngram

tf_idf

In [8]:
def get_LP_ngram_train_test_features(language="es",POS=False,n=2,tf_idf=False,full_comments=False,k=1):
    if POS==True:
        train,test = get_train_test.get_train_test(k,POS=True,language=language)
    else:
        train,test = get_train_test.get_train_test(k,lemmatize=True,language=language)
    train_corpus = " ".join([i[0] for i in train])
    tokens = tokenize(train_corpus)
    if n==3:
        ngramlist = get_3grams_vocab(tokens)
    elif n > 3:
        ngramlist = list(ngrams(tokens, n))
    else:
        ngramlist = get_2grams_vocab(tokens)
    X_train = []
    y_train = []
    for sentence in train:
        f = ngrams_to_features(sentence[0],ngramlist,n)
        X_train.append(f)
        y_train.append(sentence[1])
    X_test = []
    y_test = []
    for sentence in test:
        f = ngrams_to_features(sentence[0],ngramlist,n)
        X_test.append(f)
        y_test.append(sentence[1])
    if tf_idf == True:
        transformer = TfidfTransformer()
        X_train = transformer.fit_transform(X_train).toarray()
        X_test = transformer.transform(X_test).toarray() 
    
    return X_train,y_train,X_test,y_test

## Experiments for Catalan (lemmatized texts)

### Train models, sentence level

In [9]:
k = [1,2,3,4]
random_states = [0,1,2,4,5,42,50,60,70,100]

#### bigrams, sentences

In [11]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=2)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [12]:
print("bi-grams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

bi-grams, sentences
------
SVM accuracy:  0.6465922336877264
SVM precision:  0.669636372452305
SVM recall:  0.6750348675034867
SVM F1:  0.6713324322250834
SVM kappa:  0.4349977340521236
------
Logistic regression accuracy:  0.6668967586503735
Logistic regression precision:  0.6615268204039437
Logistic regression recall:  0.6569037656903766
Logistic regression F1:  0.6590321522096558
Logistic regression kappa:  0.4180962812425769


In [13]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.64976959, 0.2764977 , 0.07373272],
       [0.10459184, 0.78061224, 0.11479592],
       [0.15740741, 0.5       , 0.34259259]])

In [14]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.64976959, 0.25345622, 0.09677419],
       [0.125     , 0.73469388, 0.14030612],
       [0.17592593, 0.43518519, 0.38888889]])

#### bigrams + TF-IDF, sentences

In [15]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=2,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [16]:
print("bi-grams + tf-idf, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

bi-grams + tf-idf, sentences
------
SVM accuracy:  0.6536821612498218
SVM precision:  0.6605367262395463
SVM recall:  0.6847977684797768
SVM F1:  0.6423214463435816
SVM kappa:  0.39302383074998315
------
Logistic regression accuracy:  0.6794754921707944
Logistic regression precision:  0.6585482909053342
Logistic regression recall:  0.6861924686192469
Logistic regression F1:  0.6353287702447842
Logistic regression kappa:  0.39193314939617363


In [17]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.56221198, 0.41935484, 0.01843318],
       [0.05357143, 0.91836735, 0.02806122],
       [0.11111111, 0.80555556, 0.08333333]])

In [18]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.57603687, 0.41474654, 0.00921659],
       [0.06377551, 0.92346939, 0.0127551 ],
       [0.13888889, 0.81481481, 0.0462963 ]])

#### trigrams, sentences

In [19]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=3)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [20]:
print("tri-grams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

tri-grams, sentences
------
SVM accuracy:  0.5732557565074381
SVM precision:  0.5375803319933273
SVM recall:  0.5718270571827058
SVM F1:  0.5466119944063426
SVM kappa:  0.21402633016614359
------
Logistic regression accuracy:  0.6035565717759458
Logistic regression precision:  0.5571885215717153
Logistic regression recall:  0.595536959553696
Logistic regression F1:  0.561950918768401
Logistic regression kappa:  0.24102058694699957


In [21]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.46082949, 0.49308756, 0.04608295],
       [0.16581633, 0.76020408, 0.07397959],
       [0.19444444, 0.69444444, 0.11111111]])

In [22]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.4562212 , 0.51152074, 0.03225806],
       [0.13010204, 0.81122449, 0.05867347],
       [0.17592593, 0.73148148, 0.09259259]])

#### trigrams + TF-IDF, sentences

In [23]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=3,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [24]:
print("tri-grams + tf-idf, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

tri-grams + tf-idf, sentences
------
SVM accuracy:  0.557560270441209
SVM precision:  0.5982428099496642
SVM recall:  0.6178521617852162
SVM F1:  0.5629812678938507
SVM kappa:  0.2422531299900489
------
Logistic regression accuracy:  0.6084567013940458
Logistic regression precision:  0.5794773304067494
Logistic regression recall:  0.6136680613668062
Logistic regression F1:  0.5412452996949276
Logistic regression kappa:  0.2141486402538667


In [25]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.359447  , 0.61290323, 0.02764977],
       [0.06377551, 0.91071429, 0.0255102 ],
       [0.07407407, 0.85185185, 0.07407407]])

In [26]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.32258065, 0.65898618, 0.01843318],
       [0.05357143, 0.93877551, 0.00765306],
       [0.06481481, 0.91666667, 0.01851852]])

#### 4-grams, sentences

In [27]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=4)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [28]:
print("4-grams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

4-grams, sentences
------
SVM accuracy:  0.4953629607215222
SVM precision:  0.5255951387295758
SVM recall:  0.5718270571827058
SVM F1:  0.5048760204128034
SVM kappa:  0.14026090692496984
------
Logistic regression accuracy:  0.567164305602418
Logistic regression precision:  0.5437925461180383
Logistic regression recall:  0.5774058577405857
Logistic regression F1:  0.5168692736537541
Logistic regression kappa:  0.15786025777691637


In [29]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.26728111, 0.71428571, 0.01843318],
       [0.08163265, 0.8877551 , 0.03061224],
       [0.08333333, 0.87962963, 0.03703704]])

In [30]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.28110599, 0.70046083, 0.01843318],
       [0.08163265, 0.88265306, 0.03571429],
       [0.08333333, 0.85185185, 0.06481481]])

#### 4-grams + TF-IDF, sentences

In [31]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=4,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [32]:
print("4-grams + tf-idf, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

4-grams + tf-idf, sentences
------
SVM accuracy:  0.4962508718676295
SVM precision:  0.5590759837028739
SVM recall:  0.5760111576011158
SVM F1:  0.49183089249491985
SVM kappa:  0.12657330047484516
------
Logistic regression accuracy:  0.5738342097380285
Logistic regression precision:  0.5598588428870056
Logistic regression recall:  0.5760111576011158
Logistic regression F1:  0.4779091600459741
Logistic regression kappa:  0.11044724953169194


In [33]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.1797235 , 0.80184332, 0.01843318],
       [0.04081633, 0.93622449, 0.02295918],
       [0.06481481, 0.87037037, 0.06481481]])

In [34]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.16129032, 0.82488479, 0.01382488],
       [0.03571429, 0.95663265, 0.00765306],
       [0.0462963 , 0.92592593, 0.02777778]])

#### 5-grams, sentences

In [35]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=5)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [36]:
print("5-grams, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

5-grams, sentences
------
SVM accuracy:  0.47049681629682694
SVM precision:  0.5524071121515042
SVM recall:  0.5648535564853556
SVM F1:  0.44544870417716875
SVM kappa:  0.0669708584799028
------
Logistic regression accuracy:  0.5671731822015176
Logistic regression precision:  0.5996004557361386
Logistic regression recall:  0.5815899581589958
Logistic regression F1:  0.47887088411463896
Logistic regression kappa:  0.11433371213992805


In [37]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.10138249, 0.88940092, 0.00921659],
       [0.02295918, 0.9744898 , 0.00255102],
       [0.00925926, 0.98148148, 0.00925926]])

In [38]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.17050691, 0.82488479, 0.00460829],
       [0.03316327, 0.96683673, 0.        ],
       [0.02777778, 0.96296296, 0.00925926]])

#### 5-grams + TF-IDF, sentences

In [39]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=5,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [40]:
print("5-grams + tf-idf, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

5-grams + tf-idf, sentences
------
SVM accuracy:  0.476421278290353
SVM precision:  0.55336052970475
SVM recall:  0.5690376569037657
SVM F1:  0.4595008443130009
SVM kappa:  0.0857761822233225
------
Logistic regression accuracy:  0.5570263137870786
Logistic regression precision:  0.5916027011027162
Logistic regression recall:  0.5606694560669456
Logistic regression F1:  0.42778934755581977
Logistic regression kappa:  0.047399121863572424


In [41]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.12903226, 0.85253456, 0.01843318],
       [0.02040816, 0.96683673, 0.0127551 ],
       [0.00925926, 0.98148148, 0.00925926]])

In [42]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.06451613, 0.93087558, 0.00460829],
       [0.0127551 , 0.9872449 , 0.        ],
       [0.00925926, 0.98148148, 0.00925926]])

### POS tags

#### bigrams, sentences

In [10]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=2,POS=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [11]:
print("POS tags bigram features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags bigram features, sentences
------
SVM accuracy:  0.4918112397991642
SVM precision:  0.5567885822580932
SVM recall:  0.5871687587168759
SVM F1:  0.5413706363927735
SVM kappa:  0.21569844789356984
------
Logistic regression accuracy:  0.5646911645163857
Logistic regression precision:  0.5868039811361114
Logistic regression recall:  0.5746164574616457
Logistic regression F1:  0.5798612740992576
Logistic regression kappa:  0.2897462463096425


In [12]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.21658986, 0.640553  , 0.14285714],
       [0.09183673, 0.875     , 0.03316327],
       [0.09259259, 0.62037037, 0.28703704]])

In [13]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.49769585, 0.31797235, 0.1843318 ],
       [0.20153061, 0.66836735, 0.13010204],
       [0.23148148, 0.37962963, 0.38888889]])

#### bigrams + TF-IDF, sentences

In [14]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=2,POS=True,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [15]:
print("POS tags bigram + tf-idf features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags bigram + tf-idf features, sentences
------
SVM accuracy:  0.5768032555925146
SVM precision:  0.5940463808646673
SVM recall:  0.599721059972106
SVM F1:  0.5787066168571915
SVM kappa:  0.2795735816689132
------
Logistic regression accuracy:  0.5881709316935205
Logistic regression precision:  0.5842434709920424
Logistic regression recall:  0.5885634588563459
Logistic regression F1:  0.5854775859036804
Logistic regression kappa:  0.29015618812379596


In [None]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

In [None]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

#### trigrams, sentences

In [18]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=3,POS=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [19]:
print("POS tags trigram features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags trigram features, sentences
------
SVM accuracy:  0.5200328537047991
SVM precision:  0.572754116843014
SVM recall:  0.5202231520223152
SVM F1:  0.5216348081438825
SVM kappa:  0.2085178482036274
------
Logistic regression accuracy:  0.5468353586753036
Logistic regression precision:  0.5878432977619156
Logistic regression recall:  0.5690376569037657
Logistic regression F1:  0.5766860728120499
Logistic regression kappa:  0.28519300011614845


In [20]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.26267281, 0.41935484, 0.31797235],
       [0.07653061, 0.67602041, 0.24744898],
       [0.09259259, 0.43518519, 0.47222222]])

In [22]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.48387097, 0.31336406, 0.20276498],
       [0.17857143, 0.66581633, 0.15561224],
       [0.24074074, 0.37037037, 0.38888889]])

#### trigrams + TF-IDF, sentences

In [23]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=3,POS=True,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [24]:
print("POS tags trigram + tf-idf features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags trigram + tf-idf features, sentences
------
SVM accuracy:  0.5750419087000835
SVM precision:  0.5969169279864831
SVM recall:  0.599721059972106
SVM F1:  0.5737304372667352
SVM kappa:  0.2680915085682579
------
Logistic regression accuracy:  0.5867202929302394
Logistic regression precision:  0.5935779700244853
Logistic regression recall:  0.603905160390516
Logistic regression F1:  0.5930221984059234
Logistic regression kappa:  0.2971291882544208


In [25]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.29953917, 0.55760369, 0.14285714],
       [0.07142857, 0.83418367, 0.09438776],
       [0.09259259, 0.55555556, 0.35185185]])

In [26]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.41935484, 0.47004608, 0.11059908],
       [0.13010204, 0.77295918, 0.09693878],
       [0.16666667, 0.47222222, 0.36111111]])

#### 4-grams, sentences

In [27]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=4,POS=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [28]:
print("POS tags 4-gram features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags 4-gram features, sentences
------
SVM accuracy:  0.47472673881444605
SVM precision:  0.5260280955568072
SVM recall:  0.5285913528591353
SVM F1:  0.5221137187238404
SVM kappa:  0.18725748952824273
------
Logistic regression accuracy:  0.5339051064807115
Logistic regression precision:  0.5697842166927188
Logistic regression recall:  0.5564853556485355
Logistic regression F1:  0.5623266576158675
Logistic regression kappa:  0.2593143640504042


In [29]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.30875576, 0.48847926, 0.20276498],
       [0.14030612, 0.70918367, 0.1505102 ],
       [0.25925926, 0.42592593, 0.31481481]])

In [30]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.4516129 , 0.33179724, 0.21658986],
       [0.19897959, 0.67346939, 0.12755102],
       [0.31481481, 0.34259259, 0.34259259]])

#### 4-grams + TF-IDF, sentences

In [31]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_LP_ngram_train_test_features(language="cat",k=i,n=4,POS=True,tf_idf=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))

    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2191 instances, test on 731 instances


  


train on 2191 instances, test on 731 instances
train on 2191 instances, test on 731 instances
train on 2193 instances, test on 729 instances


In [32]:
print("POS tags 4-gram + tf-idf features, sentences")
print("------")
print("SVM accuracy: ", np.mean(results_svm))
print("SVM precision: ", precision_score(y_test, predicted_svm, average='weighted'))
print("SVM recall: ", recall_score(y_test, predicted_svm, average='weighted'))
print("SVM F1: ", f1_score(y_test, predicted_svm, average='weighted'))
print("SVM kappa: ", cohen_kappa_score(y_test, predicted_svm))
print("------")
print("Logistic regression accuracy: ", np.mean(results_lr))
print("Logistic regression precision: ", precision_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression recall: ", recall_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression F1: ", f1_score(y_test, predicted_lr, average='weighted'))
print("Logistic regression kappa: ", cohen_kappa_score(y_test, predicted_lr))

POS tags 4-gram + tf-idf features, sentences
------
SVM accuracy:  0.534770891355818
SVM precision:  0.5458337534275337
SVM recall:  0.5746164574616457
SVM F1:  0.532788494388241
SVM kappa:  0.20323754768332813
------
Logistic regression accuracy:  0.5507197637911054
Logistic regression precision:  0.5367832898908882
Logistic regression recall:  0.5509065550906556
Logistic regression F1:  0.5416243956883869
Logistic regression kappa:  0.21394159565014592


In [33]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.22119816, 0.6359447 , 0.14285714],
       [0.06887755, 0.85714286, 0.07397959],
       [0.17592593, 0.56481481, 0.25925926]])

In [34]:
C = confusion_matrix(y_test, predicted_lr,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.359447  , 0.46543779, 0.17511521],
       [0.17602041, 0.73214286, 0.09183673],
       [0.28703704, 0.43518519, 0.27777778]])