# *word2vec* feature extraction and models

In this nodebook are results from *word2vec* feature models while working with the original and lemmatized sentences, and comments. Catalan language.

In [1]:
import os,nltk,re,numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

Here we use pre-trained *word2vec* that was trained on Wikipedia corpus with dimension 100. The process of pre-training these models is described in notebook 

In [4]:
es_word2vec = Word2Vec.load('w2v_models/es_word2vec_model')

In [5]:
ca_word2vec = Word2Vec.load('w2v_models/ca_word2vec_model')

In [6]:
import get_train_test

Imported file *get_train_test.py*. This file contains 3 functions:

* **get_train_test(k=1,lemmatize=False,POS=False)**

Reads data from *txt* files. Returns two arrays of tuples (train and test), with sentences and their labels ('pos','neg' or 'neu').

*lemmatize* - if *True* returns word lemmas

*POS* - if *True* reurns words in form "*lemma_POStag*"

* **get_train_test_comments(k=1)**

Reads data from *MongoDB* (as sentence order in comments is saved there). Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').

* **get_english_train_test(k=1)**

Reads data from pre-created *txt* files with sentences translated to English. Returns two arrays of tuples (train and test), with comments and their labels ('pos','neg' or 'neu').


For all the functions train-test split is 3/4 to 1/4, selection order depending on parameter *k*.

*k* - takes values 1 to 4 - changes the selection of train-test split (used for cross-validation).

### Create feature vectors

In [7]:
def get_word_list(text,remove_stopwords=False,repeat=False):
    if remove_stopwords == True:
        #with open("es_stopwords.txt") as f:
        #    es_stopwords = f.readlines()
        #es_stopwords = [x.strip() for x in es_stopwords] 
        es_stopwords = set(stopwords.words("spanish"))
        for stopword in es_stopwords:
            text = re.sub(stopword, '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    punctuation = ['.',',','%','&','\'','/','+','!']
    rx = '[' + re.escape(''.join(punctuation)) + ']'
    text = re.sub(rx, '', text)
    if repeat:
        tokens = nltk.wordpunct_tokenize(text)
    else:
        tokens = sorted(set(nltk.wordpunct_tokenize(text)))
    remove_from_vocabulary = ['-',')','(','(?)','1','=','[',']','][',':','<','>',';',':)','?']
    for i in remove_from_vocabulary:
        if i in tokens:
            tokens.remove(i)
    return tokens

In [8]:
def tokenize(text):
    tokens = get_word_list(text,remove_stopwords=False,repeat=True)
    return tokens

This function converts each word to word2vec vector and then averages all vectors to get averaged word2vec representation of text.

In [9]:
def w2v_es(sentence):
    sentence_words = tokenize(sentence)
    words = []
    for word in sentence_words:
        if word in es_word2vec.wv.vocab:
            v = es_word2vec.wv[word]
            words.append(v)
    vect = np.mean(words,axis=0)
    return vect

def w2v_ca(sentence):
    sentence_words = tokenize(sentence)
    words = []
    for word in sentence_words:
        if word in ca_word2vec.wv.vocab:
            v = ca_word2vec.wv[word]
            words.append(v)
    vect = np.mean(words,axis=0)
    return vect

This function created averaged word2vec feature vectors.

tf_idf 

In [10]:
def get_w2v_train_test_features(language="cat",tf_idf=False,full_comments=False,k=1):
    #if full_comments:
    #    train,test = get_train_test.get_train_test_comments(k=k,language=language)
    #else:
    #    train,test = get_train_test.get_train_test(k=k,language=language)
    train,test = get_train_test.get_train_test(k,lemmatize=True,language=language)
    X_train = []
    y_train = []
    for sentence in train:
        vect = w2v_ca(sentence[0])
        if not np.isnan(np.sum(vect)): ## check that it is not nan
            X_train.append(vect)
            y_train.append(sentence[1])
    X_test = []
    y_test = []
    for sentence in test:
        vect = w2v_ca(sentence[0])
        if not np.isnan(np.sum(vect)): ## check that it is not nan
            X_test.append(vect)
            y_test.append(sentence[1])
    if tf_idf == True:
        transformer = TfidfTransformer(smooth_idf=False)
        X_train = transformer.fit_transform(X_train).toarray()
        X_test = transformer.transform(X_test).toarray()
    return X_train,y_train,np.array(X_test),y_test

### Train models, Catalan, sentence level

In [13]:
k = [4]
random_states = [0,1,2,4,5,42,50,60,70,100]

#### *word2vec* features, sentences

In [17]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_w2v_train_test_features(k=i,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5,class_weight="balanced")
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5,class_weight="balanced").fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2193 instances, test on 729 instances


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [18]:
print("word2vec features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

word2vec features, sentences
SVM accuracy:  0.3901960784313726
Logistic regression accuracy:  0.5490196078431373


In [19]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.        , 0.05069124, 0.94930876],
       [0.        , 0.25255102, 0.74744898],
       [0.        , 0.20952381, 0.79047619]])

#### *word2vec* + tf-idf features, sentences

In [20]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_w2v_train_test_features(k=i,tf_idf=True,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2193 instances, test on 729 instances


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [21]:
print("word2vec + tf-idf features, sentences")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

word2vec + tf-idf features, sentences
SVM accuracy:  0.6651260504201681
Logistic regression accuracy:  0.6680672268907563


In [22]:
C = confusion_matrix(y_test, predicted_svm,['pos', 'neg','neu'])
np.transpose( np.transpose(C) / C.astype(np.float).sum(axis=1) )

array([[0.47004608, 0.52534562, 0.00460829],
       [0.05612245, 0.94132653, 0.00255102],
       [0.1047619 , 0.8       , 0.0952381 ]])

### Train models, comment level


#### *word2vec* features, comments

In [30]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_w2v_train_test_features(k=i,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances


In [31]:
print("word2vec features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

word2vec features, comments
SVM accuracy:  0.6648873497333014
Logistic regression accuracy:  0.7066117347344956


#### *word2vec* + tf-idf features, comments

In [32]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_w2v_train_test_features(k=i,tf_idf=True,full_comments=True)
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 472 instances, test on 159 instances
train on 473 instances, test on 158 instances


In [33]:
print("word2vec + tf-idf features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

word2vec + tf-idf features, comments
SVM accuracy:  0.6961866093463897
Logistic regression accuracy:  0.7097762916965209


In [11]:
def get_w2v_train_test_features(language="cat",tf_idf=False,full_comments=False,k=1):
    if full_comments:
        train,test = get_train_test.get_train_test_comments(k=k,language=language)
    else:
        train,test = get_train_test.get_train_test(k=k,language=language)
    #train,test = get_train_test.get_train_test(k,lemmatize=True,language=language)
    X_train = []
    y_train = []
    for sentence in train:
        vect = w2v_ca(sentence[0])
        if not np.isnan(np.sum(vect)): ## check that it is not nan
            X_train.append(vect)
            y_train.append(sentence[1])
    X_test = []
    y_test = []
    for sentence in test:
        vect = w2v_ca(sentence[0])
        if not np.isnan(np.sum(vect)): ## check that it is not nan
            X_test.append(vect)
            y_test.append(sentence[1])
    if tf_idf == True:
        transformer = TfidfTransformer(smooth_idf=False)
        X_train = transformer.fit_transform(X_train).toarray()
        X_test = transformer.transform(X_test).toarray()
    return X_train,y_train,np.array(X_test),y_test

In [14]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_w2v_train_test_features(k=i,tf_idf=True,full_comments=True,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 1200 instances, test on 398 instances


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [15]:
print("word2vec + tf-idf features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

word2vec + tf-idf features, comments
SVM accuracy:  0.6326633165829145
Logistic regression accuracy:  0.6432160804020101


In [20]:
results_svm = []
results_lr = []
for i in k:
    X_train,y_train,X_test,y_test = get_w2v_train_test_features(k=i,tf_idf=True,language="cat")
    accuracies = []
    for s in random_states:
        clf_svm = SGDClassifier(loss='hinge', alpha=1e-3, penalty='l2', random_state=s, max_iter=5)
        clf_svm.fit(X_train, y_train) 
        predicted_svm = clf_svm.predict(X_test)
        accuracies.append(np.mean(predicted_svm == y_test))
    results_svm.append(np.mean(accuracies))
    
    clf_lr = LogisticRegression(max_iter=5).fit(X_train, y_train)
    predicted_lr = clf_lr.predict(X_test)
    results_lr.append(np.mean(predicted_lr == y_test))

train on 2193 instances, test on 729 instances


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [21]:
print("word2vec features, comments")
print("SVM accuracy: ", np.mean(results_svm))
print("Logistic regression accuracy: ", np.mean(results_lr))

word2vec features, comments
SVM accuracy:  0.6509615384615385
Logistic regression accuracy:  0.6552197802197802
