In [1]:
import numpy as np
import pandas as pd

import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import roc_auc_score


from tqdm import tqdm
from zipfile import *

import re

import time
from time import perf_counter

In [2]:
# Embedding hyper-parameters, kept in one place so they are easy to tune.
VECTOR_SIZE = 200  # dimensionality of each word vector
WINDOW_SIZE = 10   # intended context-window size (in tokens)

In [3]:
# Unpack the zipped training set into the Kaggle working directory.
with ZipFile('../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip', mode='r') as archive:
    archive.extractall(path='/kaggle/working')

In [4]:
# Load the extracted tab-separated training file into a DataFrame.
data = pd.read_csv(
    "/kaggle/working/labeledTrainData.tsv",
    sep='\t',
)

In [5]:
# Strip HTML markup and lower-case every review before tokenisation.
#
# Each tag is replaced with a SPACE rather than an empty string: deleting a tag
# outright glues the words on either side of it together
# (e.g. "good.<br />The" -> "good.The"). Such a fused string is not purely
# alphabetic, so if the tokenizer keeps it as one token both words are thrown
# away by the later `isalpha()` filter.
data['review'] = (
    data['review']
    .str.replace(r'<[^<]+?>', ' ', regex=True)
    .str.lower()
)

In [6]:
# Preview the first rows after cleaning.
data.head(n=10)

In [7]:
# Tokenise every review with NLTK and pool the results into one flat list
# (review boundaries are not preserved).
tokens = []
for review_text in tqdm(data['review']):
    tokens.extend(word_tokenize(review_text))

In [8]:
# Stop-word filtering is currently switched off: the list is left empty.
# To enable it, build the collection from NLTK plus a few extra terms:
#   stop_words = set(stopwords.words('english'))
#   stop_words.update(["film", "movie", "imdb"])
stop_words = list()

In [9]:
# Single shared WordNet lemmatizer instance, reused by the later cells.
lemmatizer = WordNetLemmatizer()

In [10]:
# One pass over the token stream: keep purely alphabetic tokens that are not
# stop words, lower-case them, and reduce each to its lemma.
words = [
    lemmatizer.lemmatize(token.lower())
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]

In [11]:
# Report how many tokens survived filtering and lemmatisation.
word_count = len(words)
print("Number of words: %s" % word_count)

In [12]:
# Peek at the first 20 processed tokens as a sanity check.
words[:20]

In [13]:
# Train the Word2Vec embedding on the cleaned token stream.
#
# gensim's optimised training routines consume at most 10,000 words of any
# single sentence. Passing the whole corpus as one giant sentence (`[words]`)
# therefore trains on only a small prefix of the data. The flat stream is cut
# into consecutive chunks of at most that many words so every token is used.
MAX_SENTENCE_WORDS = 10_000
sentences = [
    words[start:start + MAX_SENTENCE_WORDS]
    for start in range(0, len(words), MAX_SENTENCE_WORDS)
]

# NOTE: training now covers the full corpus, so 100 epochs takes considerably
# longer than when only a truncated prefix was used; lower `epochs` if needed.
w2v = gensim.models.Word2Vec(
    sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,  # use the configured constant instead of a literal
    min_count=50,
    workers=3,
    epochs=100,
)

In [14]:
# Number of entries in the trained model's keyed vectors.
len(w2v.wv)

In [15]:
# Sanity-check the embedding: the ten nearest neighbours of "bad".
w2v.wv.most_similar(positive=['bad'], topn=10)

In [16]:
def review_to_vector(review):
    """Return the mean Word2Vec vector of the in-vocabulary words of `review`.

    The review is tokenised the same way the training corpus was
    (`word_tokenize`, alphabetic tokens only, lower-cased, lemmatised).
    A plain whitespace split leaves punctuation attached to words
    ("great." / "bad,"), and those never match a vocabulary entry.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    numpy.ndarray
        1-D vector of length `w2v.wv.vector_size`. An all-zero vector is
        returned when no token of the review is in the vocabulary, because
        looking up an empty key list in the keyed vectors would fail.
    """
    candidates = [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(review)
        if token.isalpha()
    ]
    known = [token for token in candidates if token in w2v.wv.key_to_index]
    if not known:
        return np.zeros(w2v.wv.vector_size, dtype=np.float32)
    return np.mean(w2v.wv[known], axis=0)


data['doc_vector'] = data['review'].apply(review_to_vector)

In [17]:
# Preview the frame now that the document-vector column has been added.
data.head(n=10)

In [18]:
# Target labels for the classifiers: the `sentiment` column of the frame.
y = data['sentiment']

In [19]:
# Stack the per-review vectors into a single feature array, one row per review.
doc_vectors = data['doc_vector'].to_numpy()
X = np.stack(doc_vectors)

In [20]:
# Repeated stratified 3-fold cross-validation (3 repeats) of a logistic
# regression on the document vectors; prints the accuracy of every split.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=43524)

for train_index, test_index in rskf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields POSITIONAL indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only matches by accident while
    # the Series still carries its default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # C = inverse of regularization strength (smaller values specify stronger
    # regularization). Doesn't seem to change much.
    clf = LogisticRegression(C=10, solver='liblinear')
    clf.fit(X_train, y_train)

    # Evaluate on the held-out fold.
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(acc)

In [21]:
# Longer evaluation on different splits using AUC: repeated stratified 5-fold
# cross-validation (3 repeats), printing ROC AUC on the training and the test
# part of every split.

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=43524)

print("  AUC ROC training      AUC ROC test")

for train_index, test_index in rskf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields POSITIONAL indices, so the labels are selected with
    # .iloc; plain y[...] is label-based and only matches by accident while
    # the Series still carries its default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # C = inverse of regularization strength (smaller values specify stronger
    # regularization). Doesn't seem to change much.
    clf = LogisticRegression(C=100, solver='liblinear')
    clf.fit(X_train, y_train)

    # Column 1 of predict_proba is the score of the second class in
    # clf.classes_, which is used as the positive class for the AUC.
    y_trainscore = clf.predict_proba(X_train)[:, 1]
    y_testscore = clf.predict_proba(X_test)[:, 1]

    roctrain = roc_auc_score(y_train, y_trainscore)
    roctest = roc_auc_score(y_test, y_testscore)

    print(roctrain, "          ", roctest)

In [22]:
# Iterative evaluation for Decision Tree: fit one tree per maximum depth on a
# single 90/10 hold-out split and report the confusion matrix, accuracy,
# recall, precision and elapsed time for each depth.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=241124)

for i in range(1, 15):
    # Only fitting, prediction and the confusion matrix are timed; the arrays
    # are passed as-is, so no per-iteration list conversion skews the timing.
    start = perf_counter()
    classifier2 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0, max_depth=i)
    classifier2.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = classifier2.predict(X_test)

    # Making the Confusion Matrix
    cm2 = confusion_matrix(y_test, y_pred)

    end = perf_counter()
    execution_time = (end - start)
    print(cm2)

    print("Accuracy Decision Tree of depth", i,":", metrics.accuracy_score(y_test,y_pred))
    print("Recall Decision Tree of depth", i, ":", metrics.recall_score(y_test,y_pred))
    print("Precision Decision Tree of depth", i, ":", metrics.precision_score(y_test,y_pred))
    print("Elapsed time in seconds:", execution_time, "\n")