In [None]:
from sklearn.externals import joblib
import os
import json
import numpy as np
import math
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from collections import OrderedDict
from collections import defaultdict
from collections import Counter
from gensim.models import Word2Vec

In [None]:
# Predictions for the tablets domain, keyed by comment id.
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted source.
# The context manager closes the file handle once loading is done.
with open("labeled_data/predictions_tablets_EN.p", "rb") as predictions_file:
    predictiondict_1 = joblib.load(predictions_file)

In [None]:
# Predictions for the automobiles domain, keyed by comment id.
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted source.
# The context manager closes the file handle once loading is done.
with open("labeled_data/predictions_automobiles_EN.p", "rb") as predictions_file:
    predictiondict_2 = joblib.load(predictions_file)

In [None]:
# Directories that are scanned for comment JSON files, one per product domain.
tablets_automobiles = [
    "tablets_EN_extra_data",
    "automobiles_EN_extra_data",
]

In [None]:
# Collect every comment of every JSON file as [comment id, video id, lowercased text].
raw_data_list = []

for dirfile in tablets_automobiles:
    print("collecting comments from: ",dirfile)
    for filename in os.listdir(dirfile):
        # Only files whose name ends in "json" are read.
        if not filename.endswith("json"):
            continue
        json_path = os.path.join(dirfile, filename)
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(json_path, encoding="utf-8") as json_data:
            video = json.load(json_data)
        for comment in video["comments"]:
            lower = comment["text"].lower().strip()
            raw_data_list.append([comment["id"], video["video_id"], lower])

In [None]:
# Number of entries loaded from the automobiles predictions file.
print(len(predictiondict_2))

In [None]:
# Combined prediction lookup for both domains; filled by the update() calls that follow.
predictiondict = {}

In [None]:
# Add the automobiles predictions.
predictiondict.update(predictiondict_2)

In [None]:
# Current size of the combined prediction lookup.
print(len(predictiondict))

In [None]:
# Add the tablets predictions; for a key already present, the tablets entry replaces it.
predictiondict.update(predictiondict_1)

In [None]:
# Current size of the combined prediction lookup (duplicate keys are counted once).
print(len(predictiondict))

In [None]:
# Split the tokenised comments into positive, negative and neutral lists,
# based on the label stored in predictiondict for each comment id.

labellist_pos_test = []
commentlist_pos_test = []
labellist_neg_test = []
commentlist_neg_test = []
labellist_neu_test = []
commentlist_neu_test = []

# label -> (token lists for that label, parallel list of labels)
buckets_by_label = {
    "positive": (commentlist_pos_test, labellist_pos_test),
    "negative": (commentlist_neg_test, labellist_neg_test),
    "neutral": (commentlist_neu_test, labellist_neu_test),
}

for commentid, videoid_raw, comment_text in raw_data_list:
    # Comments without a prediction are only reported.
    if commentid not in predictiondict:
        print(commentid)
        continue

    label, videoid, comment_output = predictiondict[commentid]

    # The prediction has to belong to the same video as the raw comment.
    if videoid != videoid_raw:
        print(commentid, videoid_raw, comment_text )
        continue

    words = comment_text.lower().strip().split()

    # Labels other than the three known ones are ignored.
    if label in buckets_by_label:
        comments_for_label, labels_for_label = buckets_by_label[label]
        comments_for_label.append(words)
        labels_for_label.append(label)

In [None]:
# Size of the positive, negative and neutral comment lists, followed by their total.
bucket_sizes = [
    len(commentlist_pos_test),
    len(commentlist_neg_test),
    len(commentlist_neu_test),
]
for bucket_size in bucket_sizes:
    print(bucket_size)

print(sum(bucket_sizes))

In [None]:
# Load the SentiWordNet lexicon as a tab-separated table.
# NOTE(review): later cells read the 'PosScore', 'NegScore' and 'SynsetTerms' columns, so the
# file's first line has to supply those headers — confirm the file was prepared that way.
lexiconfile = pd.read_csv('lexicons/SentiWordNet_3.0.0.txt', sep="\t")

In [None]:
# Show the first two rows to check how the lexicon was parsed.
lexiconfile.head(2)

In [None]:
# Build the raw word lists from the lexicon: a row contributes to `pos` when its
# PosScore is above 0.6 and its NegScore below 0.2, and to `neg` in the mirrored case.

def _single_word_terms(synset_terms):
    """Return the terms of one SynsetTerms cell that contain no underscore.

    Trailing digits and '#' characters are stripped from every term first.
    """
    stripped_terms = (term.rstrip('0123456789#') for term in synset_terms.split())
    return [term for term in stripped_terms if '_' not in term]


pos = []
neg = []

lexicon_columns = zip(
    lexiconfile['PosScore'],
    lexiconfile['NegScore'],
    lexiconfile['SynsetTerms'],
)
for pos_score, neg_score, synset_terms in lexicon_columns:
    if pos_score > 0.6 and neg_score < 0.2:
        pos.extend(_single_word_terms(synset_terms))
    elif neg_score > 0.6 and pos_score < 0.2:
        neg.extend(_single_word_terms(synset_terms))

print(len(pos), len(neg))

In [None]:
# Resolve words that occur in both raw lists and drop duplicates:
# a word goes to the list in which it occurs more often; on a tie it is dropped from both.
# Counting once up front replaces the repeated list.count()/`in list` scans.

pos_counts = Counter(pos)
neg_counts = Counter(neg)

new_pos = []
new_neg = []

# OrderedDict.fromkeys yields each distinct word once, in order of first occurrence.
for word in OrderedDict.fromkeys(pos):
    pos_count = pos_counts[word]
    neg_count = neg_counts[word]  # 0 when the word never occurs in `neg`
    if pos_count > neg_count:
        new_pos.append(word)
    elif neg_count > pos_count:
        new_neg.append(word)

# Words that only occur in the negative list follow the contested ones.
for word in OrderedDict.fromkeys(neg):
    if word not in pos_counts:
        new_neg.append(word)

In [None]:
# Size of the de-duplicated positive and negative word lists.
print(len(new_pos),len(new_neg))

In [None]:
# Add _pos to positive lexicon words inside the positive comments

# Set lookup instead of scanning the whole word list for every token.
positive_lexicon = set(new_pos)

new_pos_comments = []
changed_words_pos = []  # every tagged occurrence, untagged form, duplicates kept

for line in commentlist_pos_test:
    new_line = []
    for word in line:
        if word in positive_lexicon:
            new_line.append(word + "_pos")
            changed_words_pos.append(word)
        else:
            new_line.append(word)
    new_pos_comments.append(new_line)

print(new_pos_comments[:100])
print(len(changed_words_pos))

In [None]:
# How often each word was tagged with "_pos" in the positive comments.
count_changes_pos = Counter(changed_words_pos)

In [None]:
# Add _neg to negative lexicon words inside the negative comments

# Set lookup instead of scanning the whole word list for every token.
negative_lexicon = set(new_neg)

new_neg_comments = []
changed_words_neg = []  # every tagged occurrence, untagged form, duplicates kept

for line in commentlist_neg_test:
    new_line = []
    for word in line:
        if word in negative_lexicon:
            new_line.append(word + "_neg")
            changed_words_neg.append(word)
        else:
            new_line.append(word)
    new_neg_comments.append(new_line)

print(new_neg_comments[:10])
print(len(changed_words_neg))

In [None]:
# How often each word was tagged with "_neg" in the negative comments.
count_changes_neg = Counter(changed_words_neg)

In [None]:
# Remove duplicate words, keeping the order of first occurrence
words_neg = list(OrderedDict.fromkeys(changed_words_neg))
words_pos = list(OrderedDict.fromkeys(changed_words_pos))

In [None]:
# Number of distinct words that were tagged with "_pos".
print(len(words_pos))

In [None]:
# pickle.dump(words_neg, open("changed_words_neg_ALL.p", "wb" ))
# pickle.dump(words_pos, open("changed_words_pos_ALL.p", "wb" ))

In [None]:
# Training corpus: tagged negative comments, tagged positive comments, then the neutral comments.
new_data = [
    *new_neg_comments,
    *new_pos_comments,
    *commentlist_neu_test,
]

In [None]:
# Count how often each changed word still occurs untagged (i.e. in another context);
# these counts are used to weight the embeddings.

# Set lookups instead of scanning both word lists for every token of the corpus.
negative_changed = set(words_neg)
positive_changed = set(words_pos)

changed_word_neg_other_context = []
changed_word_pos_other_context = []

for line in new_data:
    for word in line:
        if word in negative_changed:
            changed_word_neg_other_context.append(word)
        elif word in positive_changed:
            changed_word_pos_other_context.append(word)

In [None]:
# Per-word number of untagged occurrences in the corpus, for the negative and positive words.
count_changes_neg_other = Counter(changed_word_neg_other_context)
count_changes_pos_other = Counter(changed_word_pos_other_context)

In [None]:
def getWeightdict(count_changes_pos,count_changes_pos_other):
    """Turn two occurrence counts per word into a pair of relative weights.

    Parameters:
        count_changes_pos: mapping word -> number of tagged occurrences.
        count_changes_pos_other: mapping word -> number of other (untagged)
            occurrences; a word missing from this mapping counts as 0, so a
            plain dict works as well as a Counter.

    Returns:
        dict mapping each word of the first mapping to
        [tagged share, other share]; both are fractions of the combined count.

    Raises:
        ZeroDivisionError: if both counts of a word are 0.
    """
    word_weights = {}
    for word, tagged_count in count_changes_pos.items():
        other_count = count_changes_pos_other.get(word, 0)
        total_count = tagged_count + other_count
        word_weights[word] = [tagged_count / total_count, other_count / total_count]
    return word_weights

In [None]:
# Weights per positive word: [share of tagged occurrences, share of untagged occurrences].
pos_word_weights = getWeightdict(count_changes_pos,count_changes_pos_other)

In [None]:
# Weights per negative word: [share of tagged occurrences, share of untagged occurrences].
neg_word_weights = getWeightdict(count_changes_neg,count_changes_neg_other)

In [None]:
# Train the skip-gram model (sg=1) with 300-dimensional vectors and 10 negative samples.
# Only this variant is trained; the one with default negative sampling is disabled:

# model_SKIP = Word2Vec(new_data, size=300, sg=1)
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`) — confirm the installed version.
model_SKIP_n10 = Word2Vec(new_data, size=300, sg=1, negative=10)

In [None]:
# Sanity check: nearest neighbours of the tagged word "great_pos".
# Query through the KeyedVectors (`wv`); the model-level most_similar() is deprecated.
model_SKIP_n10.wv.most_similar("great_pos")

In [None]:
# Number of vectors (rows) in the trained embedding matrix.
model_SKIP_n10.wv.vectors.shape[0]

In [None]:
# Save the trained model (tagged and untagged words both still present) to disk.
model_SKIP_n10.save("SKIP_negative10_EN_Change_inputSentiment_no_delete_test.model")

In [None]:
# Delete other context words:
# every tagged word ("x_pos" / "x_neg") is stored under its untagged form "x", and the
# vector of the untagged occurrences of a changed word is dropped.

model_dict_delete = {}

count_word_del = 0
count_all = 0
count_else = 0
amount_posneg = 0
indict = 0

# Set lookup instead of scanning both word lists for every vocabulary entry.
changed_words = set(words_neg) | set(words_pos)

# Iterate the vocabulary of model_SKIP_n10, the model whose vectors are read below.
for word in model_SKIP_n10.wv.vocab:
    count_all += 1
    if word.endswith(("_neg", "_pos")):
        amount_posneg += 1
        non_word = word[:-4]
        if non_word not in model_dict_delete:
            model_dict_delete[non_word] = model_SKIP_n10.wv[word]
        else:
            # Untagged form already stored: report instead of overwriting.
            print(word)
    elif word not in changed_words:
        # Ordinary word that was never tagged: keep its vector as is.
        count_else += 1
        if word not in model_dict_delete:
            model_dict_delete[word] = model_SKIP_n10.wv[word]
        else:
            print(word)
            indict += 1
    else:
        # Untagged occurrence of a changed word: dropped.
        count_word_del += 1

print(count_all)
print("")
print(count_word_del)
print(amount_posneg)
print(count_else)
print("")
print(count_word_del + amount_posneg + count_else)

In [None]:
# Number of words kept in the "delete" embedding dictionary.
print(len(model_dict_delete))

In [None]:
# Spot check: the vector stored under the untagged word "great".
print(model_dict_delete["great"])

In [None]:
# Save the "delete" embedding dictionary; the context manager flushes and closes the file.
with open("SKIP_EN_Change_inputSentiment_deletedoubles.p", "wb") as output_file:
    pickle.dump(model_dict_delete, output_file)

In [None]:
# Average the embeddings:
# every tagged word ("x_pos" / "x_neg") is stored under its untagged form "x" as the mean of
# the tagged vector and the vector of the untagged word, when the latter is in the vocabulary.

model_dict_average = {}

count_word_del = 0
count_all = 0
count_else = 0
amount_posneg = 0
indict = 0

# Set lookup instead of scanning both word lists for every vocabulary entry.
changed_words = set(words_neg) | set(words_pos)

# Iterate the vocabulary of model_SKIP_n10, the model whose vectors are read below.
vocabulary = model_SKIP_n10.wv.vocab

for word in vocabulary:
    count_all += 1
    if word.endswith(("_neg", "_pos")):
        amount_posneg += 1
        non_word = word[:-4]
        posneg_vector = model_SKIP_n10.wv[word]

        if non_word in vocabulary:
            other_vector = model_SKIP_n10.wv[non_word]
            # Store the averaged vector itself; it must not be used as a lookup key.
            model_dict_average[non_word] = np.average(
                np.array([posneg_vector, other_vector]), axis=0
            )
        else:
            # No untagged vector available: use the tagged vector on its own.
            model_dict_average[non_word] = posneg_vector
    elif word not in changed_words:
        # Ordinary word that was never tagged: keep its vector as is.
        count_else += 1
        if word not in model_dict_average:
            model_dict_average[word] = model_SKIP_n10.wv[word]
        else:
            print(word)
            indict += 1
    else:
        # Untagged occurrence of a changed word: handled through its tagged form.
        count_word_del += 1

print(count_all)
print("")
print(count_word_del)
print(amount_posneg)
print(count_else)
print("")
print(count_word_del + amount_posneg + count_else)

In [None]:
# Save the "average" embedding dictionary; the context manager flushes and closes the file.
with open("SKIP_EN_Change_inputSentiment_average.p", "wb") as output_file:
    pickle.dump(model_dict_average, output_file)

In [None]:
# Weight and sum embeddings:
# every tagged word ("x_pos" / "x_neg") is stored under its untagged form "x" as
# sentiment_weight * tagged vector + other_weight * untagged vector, when the untagged
# word is in the vocabulary.

model_dict_weightsum = {}

count_word_del = 0
count_all = 0
count_else = 0
amount_posneg = 0
indict = 0

# Set lookup instead of scanning both word lists for every vocabulary entry.
changed_words = set(words_neg) | set(words_pos)

# Iterate the vocabulary of model_SKIP_n10, the model whose vectors are read below.
vocabulary = model_SKIP_n10.wv.vocab

for word in vocabulary:
    count_all += 1
    if word.endswith(("_neg", "_pos")):
        amount_posneg += 1
        non_word = word[:-4]

        word_weights = neg_word_weights if word.endswith("_neg") else pos_word_weights
        # A token that only looks tagged has no recorded weights; it keeps its own vector.
        sentiment_weight, other_weight = word_weights.get(non_word, (1.0, 0.0))

        posneg_vector = model_SKIP_n10.wv[word]

        if non_word in vocabulary:
            other_vector = model_SKIP_n10.wv[non_word]
            # Store the combined vector itself; it must not be used as a lookup key.
            model_dict_weightsum[non_word] = np.sum(
                np.array([posneg_vector * sentiment_weight, other_vector * other_weight]),
                axis=0,
            )
        else:
            # No untagged vector available: use the (unweighted) tagged vector on its own.
            model_dict_weightsum[non_word] = posneg_vector
    elif word not in changed_words:
        # Ordinary word that was never tagged: keep its vector as is.
        count_else += 1
        if word not in model_dict_weightsum:
            model_dict_weightsum[word] = model_SKIP_n10.wv[word]
        else:
            print(word)
            indict += 1
    else:
        # Untagged occurrence of a changed word: handled through its tagged form.
        count_word_del += 1

print(count_all)
print("")
print(count_word_del)
print(amount_posneg)
print(count_else)
print("")
print(count_word_del + amount_posneg + count_else)

In [None]:
# Save the "weighted sum" embedding dictionary; the context manager flushes and closes the file.
with open("SKIP_EN_Change_inputSentiment_weightsum.p", "wb") as output_file:
    pickle.dump(model_dict_weightsum, output_file)