# Importing Modules

In [1]:
import os
import wget
import gzip
import shutil
import warnings

from time                    import time
from string                  import punctuation

import nltk
from nltk.tokenize           import word_tokenize
from nltk.corpus             import stopwords

from gensim.models           import Word2Vec, KeyedVectors

import numpy                 as     np
import pandas                as     pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model    import LogisticRegression
from sklearn.metrics         import classification_report

In [2]:
warnings.filterwarnings('ignore')

# Dataset

- We’ll use the sentiment-labeled sentences dataset from the UCI repository, consisting of 1,500 positive-sentiment and 1,500 negative- sentiment sentences from Amazon, Yelp, and IMDB.
- http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

- Let us first combine all the three separate data files into one using the following unix command:

`cat amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt > sentiment_sentences.txt`

In [3]:
#Read text data, categories.
texts = []
cats = []

fh = open("data/sentiment labelled sentences/sentiment_sentences.txt")

for line in fh:
    text, sentiment = line.split("\t")
    texts.append(text)
    cats.append(sentiment)

In [4]:
df = pd.DataFrame(texts, columns = ["Text"])

In [5]:
df['Sentiment'] = cats

In [6]:
df.head()

Unnamed: 0,Text,Sentiment
0,So there is no way for me to plug it in here i...,0\n
1,"Good case, Excellent value.",1\n
2,Great for the jawbone.,1\n
3,Tied to charger for conversations lasting more...,0\n
4,The mic is great.,1\n


# Cleaning Data

## Normalizing Labels

In [7]:
df['Sentiment'] = [int(label.replace("\n", "")) for label in list(df['Sentiment'])]

In [8]:
df.shape

(3000, 2)

In [9]:
df.head()

Unnamed: 0,Text,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [10]:
df.Sentiment.value_counts()

0    1500
1    1500
Name: Sentiment, dtype: int64

## Text Preprocessing

In [14]:
def preprocess_corpus(texts):
    
    StopWords = set(stopwords.words('english'))
    
    #Nested function that lowercases, removes stopwords and digits from a list of tokens
    def remove_stops_digits(tokens):
        return [token.lower() for token in tokens if token.lower() not in StopWords and not token.isdigit() and token not in punctuation]
    
    #This return statement below uses the above function to process twitter tokenizer output further
    return [remove_stops_digits(word_tokenize(text)) for text in texts]

In [15]:
processed_texts = preprocess_corpus(texts)

In [17]:
df['Text'] = processed_texts

## Final View

In [19]:
df.shape

(3000, 2)

In [18]:
df.head()

Unnamed: 0,Text,Sentiment
0,"[way, plug, us, unless, go, converter]",0
1,"[good, case, excellent, value]",1
2,"[great, jawbone]",1
3,"[tied, charger, conversations, lasting, minute...",0
4,"[mic, great]",1


# Loading Word2Vec Model

In [11]:
#Load W2V model. This will take some time. 
%time 
w2v_model = KeyedVectors.load_word2vec_format("data/GoogleNews-vectors-negative300.bin", binary = True)
print('done loading Word2Vec')

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.82 µs
done loading Word2Vec


## Inspecting Model

In [12]:
#Inspect the model
word2vec_vocab = w2v_model.vocab.keys()
word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]

In [13]:
len(word2vec_vocab)

3000000

## Doc2Vec Feature Vector

In [22]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists):
    
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    feats = []
    
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSION)
        count_for_this = 0 + 1e-5 # to avoid divide-by-zero 
        
        for token in tokens:
            if token in w2v_model:
                feat_for_this += w2v_model[token]
                count_for_this += 1
                
        if count_for_this != 0:
            feats.append(feat_for_this / count_for_this)
        else:
            feats.append(zero_vector)
            
    return feats

In [23]:
train_vectors = embedding_feats(processed_texts)

In [24]:
len(train_vectors)

3000

# Training The Model

In [26]:
train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)

In [25]:
classifier = LogisticRegression(random_state = 1234)

In [27]:
classifier.fit(train_data, train_cats)

LogisticRegression(random_state=1234)

# Evaluation

In [28]:
classifier.score(test_data, test_cats)

0.8026666666666666

In [29]:
preds = classifier.predict(test_data)

In [31]:
print(classification_report(test_cats, preds))

              precision    recall  f1-score   support

          0
       0.81      0.79      0.80       373
          1
       0.79      0.82      0.81       377

    accuracy                           0.80       750
   macro avg       0.80      0.80      0.80       750
weighted avg       0.80      0.80      0.80       750

