## 4.1.3 Logistic Regression Example with Word2Vec

### Word2Vec Feature Example

In [None]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Directory the input TSV files (training and test data) are read from.
DEFAULT_PATH = './data_in/'
# Directory the prediction CSV is written to.
DATA_OUT_PATH = './data_out/'

# Seed handed to train_test_split so the train/validation split is reproducible.
RANDOM_SEED = 42
# Fraction of the labeled data held out for evaluation.
TEST_SPLIT = 0.2

In [None]:
# Load the labeled training set: tab-separated, column names on the first row.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside the text are kept literally.
train = pd.read_csv(DEFAULT_PATH + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [None]:
# Cache for the stopword set so the NLTK corpus is read once, not once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopword set, loading it only on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def preprocess(review):
    """Clean one raw review into a space-separated string of lowercase words.

    Steps, in order: strip HTML markup, replace every non-ASCII-letter
    character with a space, lowercase, split on whitespace, drop English
    stopwords, and re-join the remaining words with single spaces.

    Args:
        review: Raw review text, possibly containing HTML.

    Returns:
        The cleaned review as a single string (empty if no word survives).
    """
    html_filtered_review = BeautifulSoup(review, "html5lib").get_text()
    letters_only = re.sub(r"[^a-zA-Z]", " ", html_filtered_review)
    stops = _english_stopwords()
    kept_words = [w for w in letters_only.lower().split() if w not in stops]

    return ' '.join(kept_words)

In [None]:
# Raw review texts (a pandas Series) and their labels as a plain Python list.
reviews = train['review']
sentiments = list(train['sentiment'])

In [None]:
# Run every training review through preprocess(), keeping the original order.
clean_train_reviews = [preprocess(review) for review in reviews]

In [None]:
# Word2Vec consumes token lists, so split each cleaned review on whitespace.
sentences = [cleaned.split() for cleaned in clean_train_reviews]

In [None]:
# Hyperparameters forwarded to word2vec.Word2Vec when the embedding is trained.
num_features = 300     # Word2Vec `size`; also the length of each averaged review vector
min_word_count = 40    # Word2Vec `min_count`
num_workers = 4        # Word2Vec `workers`
context = 10           # Word2Vec `window`
downsampling = 1e-3    # Word2Vec `sample`

In [None]:
import logging

# Show INFO-level records, each prefixed with its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [None]:
from gensim.models import word2vec

# Train the embedding on the tokenized training reviews using the
# hyperparameters defined in the notebook.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

In [None]:
def get_features(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one review.

    Args:
        words: Iterable of tokens for a single review.
        model: Trained Word2Vec model. Only ``model.wv`` is used, for
            membership tests (``w in model.wv``) and lookups (``model.wv[w]``).
        num_features: Dimensionality of the word vectors.

    Returns:
        A 1-D array of length ``num_features`` holding the mean vector of the
        tokens found in the vocabulary. If no token is in the vocabulary
        (including empty input), an all-zero float32 vector is returned
        instead of a NaN vector from a division by zero.
    """
    # Query the keyed vectors directly: an O(1) membership test without
    # rebuilding a set of the whole vocabulary on every call.
    keyed_vectors = model.wv
    feature_vector = np.zeros((num_features,), dtype=np.float32)

    num_words = 0
    for w in words:
        if w in keyed_vectors:
            num_words += 1
            feature_vector = np.add(feature_vector, keyed_vectors[w])

    # Guard the average: a review with no known word would otherwise become
    # NaN and break the downstream classifier.
    if num_words == 0:
        return feature_vector

    return np.divide(feature_vector, num_words)

In [None]:
def get_dataset(reviews, model, num_features):
    """Turn tokenized reviews into a 2-D feature matrix.

    Each review is converted with get_features(), and the resulting vectors
    are stacked in input order, one row per review.

    Args:
        reviews: Iterable of token lists, one per review.
        model: Word2Vec model forwarded to get_features().
        num_features: Vector dimensionality forwarded to get_features().

    Returns:
        Array produced by np.stack over the per-review feature vectors.
    """
    per_review_vectors = [
        get_features(tokens, model, num_features) for tokens in reviews
    ]
    return np.stack(per_review_vectors)

In [None]:
# Feature matrix for the training reviews: one averaged word vector per review.
trainDataVecs = get_dataset(sentences, model, num_features)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Features are the averaged review vectors; targets are the sentiment labels.
X = trainDataVecs
y = np.array(sentiments)

# Hold out TEST_SPLIT of the labeled data for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with class_weight='balanced', fitted on the training split only.
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Hard class predictions for the held-out split, plus the probability column
# at index 1 (presumably the positive class), which feeds the ROC curve.
predicted = lgs.predict(X_test)
positive_scores = lgs.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
auc = metrics.auc(fpr, tpr)

# Report the held-out metrics.
print("------------")
print("Accuracy: %f" % lgs.score(X_test, y_test))
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
print("AUC: %f" % auc)

In [None]:
# Load the test set with the same TSV settings as the training data
# (quoting=3 is csv.QUOTE_NONE).
test = pd.read_csv(DEFAULT_PATH + 'testData.tsv', header=0, delimiter='\t', quoting=3)

In [None]:
# Preview the first five rows of the test set.
test.head(5)

In [None]:
# Apply the same cleaning to the test reviews as was applied to the training reviews.
clean_test_reviews = [preprocess(raw_review) for raw_review in test['review']]

In [None]:
# Split each cleaned test review into its token list.
test_sentences = [cleaned.split() for cleaned in clean_test_reviews]

In [None]:
# Embed the test reviews with the Word2Vec model trained on the training reviews.
test_data_vecs = get_dataset(test_sentences, model, num_features)

In [None]:
# Predict a sentiment label for every test review with the fitted classifier.
test_predicted = lgs.predict(test_data_vecs)

In [None]:
# Pair each test review id with its predicted sentiment.
answer_dataset = pd.DataFrame({'id': test['id'], 'sentiment': test_predicted})

In [None]:
# Create the output directory if it is missing. exist_ok=True replaces the
# separate existence check, which could race with another process creating
# the directory in between.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Write the predictions without the DataFrame index; quoting=3 is csv.QUOTE_NONE.
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv', index=False, quoting=3)