In [12]:
import numpy as np
import json
import operator
import pickle
import re
import math

In [15]:
# Parse the prepared corpus directly from the open file handle.
with open("data_prepared.json") as corpus_file:
    data = json.load(corpus_file)

In [14]:
# Parse the held-out test set directly from the open file handle.
with open("testing.json") as testing_file:
    testData = json.load(testing_file)

# Bag of words vectorization

In [3]:
# Number of most-frequent tokens kept as bag-of-words columns.
VOCAB_SIZE = 7000

# Corpus-wide occurrence count of every token.
word_index = dict()
for sentence in data:
    for word in sentence["text"]:
        word_index[word] = word_index.get(word, 0) + 1

# (token, count) pairs, most frequent first. sorted() is stable, so tokens
# with equal counts keep their first-seen order.
sortedDictionary = sorted(word_index.items(), key=operator.itemgetter(1), reverse=True)

# Column index of each retained token. Slicing instead of indexing
# 0..VOCAB_SIZE-1 avoids an IndexError when the corpus contains fewer than
# VOCAB_SIZE distinct tokens; the unused trailing columns simply stay zero.
vocabulary = dict()
for i, (word, _count) in enumerate(sortedDictionary[:VOCAB_SIZE]):
    vocabulary[word] = i

# One row per entry of `data`, one column per vocabulary slot.
# NOTE: this is a dense float64 array (np.zeros default dtype), so memory
# grows as len(data) * VOCAB_SIZE * 8 bytes.
wordMatrix = np.zeros(shape=(len(data), VOCAB_SIZE))

# Fill in per-text token counts; tokens outside the vocabulary are ignored.
row = 0
for sentence in data:
    for word in sentence["text"]:
        column = vocabulary.get(word)
        if column is not None:
            wordMatrix[row, column] += 1
    row += 1

In [4]:
# Pair every bag-of-words row with a numeric label:
# "positive" -> 1, "neutral" -> 0, any other value -> -1.
wordVectors = []
sentiment = 0

for i, vector in enumerate(wordMatrix):
    label = data[i]["manual_sentiment"]
    if label == "positive":
        sentiment = 1
    else:
        sentiment = 0 if label == "neutral" else -1
    wordVectors.append((vector, sentiment))

In [None]:
# Persist the (vector, label) pairs so later runs can skip the vectorization.
with open('BagWords.pkl', 'wb') as bag_file:
    pickle.dump(wordVectors, bag_file)

# Word2Vec vectorization

In [21]:
# Build a class-capped training subset: at most 30000 neutral and 25000
# positive texts, every negative text. Entries with any other label, or
# beyond a cap, are skipped.
text, sentiment = [], []
neutral = positive = 0

for item in data:
    label = item["manual_sentiment"]
    if label == "negative":
        code = -1
    elif label == "neutral" and neutral < 30000:
        code = 0
        neutral += 1
    elif label == "positive" and positive < 25000:
        code = 1
        positive += 1
    else:
        continue
    text.append(item["text"])
    sentiment.append(code)

In [22]:
# Word2Vec hyper-parameters (names on the right are the keyword each one
# is passed as below).
num_features = 120     # size
min_word_count = 40    # min_count
num_workers = 4        # workers
context = 10           # window
downsampling = 1e-3    # sample

from gensim.models import word2vec

# Train on the class-capped `text` corpus.
model = word2vec.Word2Vec(
    text,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
# normalized vectors, so the model cannot be trained further - confirm this
# is intended before reusing the saved model for more training.
model.init_sims(replace=True)

model_name = "word2vec"
model.save(model_name)

In [23]:
def makeFeatureVec(words, model, num_features, index2word_set=None):
    """Average the word vectors of the in-vocabulary tokens of one text.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single text.
    model : object
        Trained model exposing ``model.wv.index2word`` (vocabulary list) and
        ``model.wv[word]`` (vector lookup).
    num_features : int
        Length of each word vector.
    index2word_set : set of str, optional
        Pre-built vocabulary set. When omitted it is built from
        ``model.wv.index2word``; pass it in when calling this function many
        times so the set is not rebuilt on every call.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_features``: the mean of the vectors of all
        tokens found in the vocabulary, or all zeros when none is found.
    """
    if index2word_set is None:
        index2word_set = set(model.wv.index2word)

    featureVec = np.zeros((num_features,), dtype='float32')
    n = 0
    for word in words:
        if word in index2word_set:
            n += 1
            # Look up through model.wv, consistent with the vocabulary
            # access above (model[word] is the deprecated spelling).
            featureVec = np.add(featureVec, model.wv[word])

    # Without any known token the mean would be 0/0 = NaN (plus a
    # RuntimeWarning); return the zero vector instead.
    if n == 0:
        return featureVec
    return np.divide(featureVec, n)


def getAverageVec(posts, model, num_features):
    """Compute one averaged word vector per text.

    Parameters
    ----------
    posts : sequence of iterables of str
        Tokenized texts; ``len(posts)`` must be defined.
    model : object
        Trained model, see ``makeFeatureVec``.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(posts), num_features)``; row ``k`` is
        the averaged vector of ``posts[k]``.
    """
    # Build the vocabulary set once instead of once per text.
    index2word_set = set(model.wv.index2word)
    reviewFeatureVecs = np.zeros((len(posts), num_features), dtype='float32')

    for cnt, sentence in enumerate(posts):
        reviewFeatureVecs[cnt] = makeFeatureVec(
            sentence, model, num_features, index2word_set
        )
    return reviewFeatureVecs

In [6]:
# One averaged word vector per entry of `text` (rows align with `sentiment`).
DataVecs = getAverageVec(text,model,num_features)

  # Remove the CWD from sys.path while we load stuff.


In [7]:
# Diagnostics on NaN entries before the cleanup. Only the last expression of
# a cell is echoed, so these two values are computed but not displayed.
nan_mask = np.isnan(DataVecs)
nan_mask.any()
np.count_nonzero(nan_mask)

# Replace NaN with 0 (nan_to_num also maps +/-inf to large finite values).
DataVecs = np.nan_to_num(DataVecs)

# Displayed result: should be False after the cleanup.
np.isnan(DataVecs).any()

False

In [10]:
# Persist the feature matrix and its labels, in that order.
for pkl_path, payload in (("text.pkl", DataVecs), ("sentiment.pkl", sentiment)):
    with open(pkl_path, "wb") as pkl_file:
        pickle.dump(payload, pkl_file)

In [9]:
# Number of rows in DataVecs (one per entry of `text`).
len(DataVecs)

65573

In [11]:
# Inspect the first averaged feature vector as a sanity check.
DataVecs[0]

array([ -1.34336267e-04,   1.33666266e-02,   1.04589080e-02,
         2.11514346e-02,   2.51224004e-02,  -2.01841034e-02,
         1.48397684e-02,  -1.06298262e-02,   1.10759595e-02,
         1.57386921e-02,   9.19556245e-03,   1.40464082e-02,
         5.60426340e-03,   1.60362758e-02,   1.33384243e-02,
        -8.08512967e-04,  -5.03373472e-03,   1.22603471e-03,
         6.05679210e-03,  -3.51574528e-03,  -3.25760469e-02,
         4.43344153e-02,   2.04750933e-02,  -2.62582605e-03,
        -1.47328863e-03,  -4.02599180e-05,   1.03632752e-02,
        -1.95809081e-03,   7.53942586e-04,   1.10852607e-02,
        -1.68742966e-02,  -4.42791777e-03,   3.64656821e-02,
        -2.04444062e-02,  -1.35190170e-02,   2.40673907e-02,
         1.21841412e-02,  -9.18254536e-03,  -2.42143292e-02,
        -5.45758009e-03,  -3.45048420e-02,  -1.62998196e-02,
        -1.19103435e-02,  -1.64612511e-03,  -1.71572808e-02,
        -2.95967311e-02,   1.03858467e-02,   8.99517071e-03,
         5.06902812e-03,