In [None]:
import numpy as np
import os
import gensim
import pickle
import json
from keras.preprocessing import sequence as sq
from sklearn.model_selection import train_test_split
from glove import Glove

In [None]:
# Vocabulary state shared by every convertToIds() call.
word2id = {}   # lower-cased token -> integer id
id2word = {}   # integer id -> lower-cased token

MAXLEN = 0     # not read by convertToIds; kept in case other code refers to it
index = 1      # next id to hand out; ids start at 1, 0 is only used for a missing sentence

def convertToIds(sent):
    """Map a whitespace-separated sentence to an array of vocabulary ids.

    Tokens are lower-cased before lookup. A token that is not yet in
    ``word2id`` is registered on the fly: it receives the current value of the
    module-level ``index`` counter, is stored in both ``word2id`` and
    ``id2word``, and the counter is advanced by one.

    Args:
        sent: the sentence as a string, or ``None``.

    Returns:
        A 1-D ``int32`` numpy array with one id per token. ``None`` yields
        ``[0]``; a string without any token yields an empty array.
    """
    global index

    if sent is None:
        return np.array([0], dtype='int32')

    # Collect ids in a plain list: np.append copies the whole array on every
    # call, which made the original loop quadratic in the sentence length.
    ids = []
    for word in sent.split():  # str.split() never yields empty tokens
        word = word.lower()
        if word not in word2id:
            word2id[word] = index
            id2word[index] = word
            index += 1
        ids.append(word2id[word])
    return np.array(ids, dtype='int32')

def parse(path):
    """Read the JSON document stored at ``path`` and return the decoded object.

    Args:
        path: path of the JSON file to read.

    Returns:
        The Python object produced by decoding the file's JSON content.
    """
    # The context manager closes the file even if decoding fails; the original
    # left the handle open.
    with open(path, 'r') as handle:
        return json.load(handle)

def get_annotation(data_folder,category,files,set_index):
    """Collect the labelled comments of ``files`` into ``data_type[set_index]``.

    Every file ``data_folder/category/<file>`` is read with ``parse``. For each
    comment that carries an "annotation" entry a row
    ``[video_id, video_description, title, comment text, one-hot label]`` is
    appended to the global ``data_type[set_index]``.

    Comments are skipped when they are tagged both "product-related" and
    "video-related", or when they carry a positive and a negative tag at the
    same time. Comments matching none of the rules below are not stored.

    One-hot positions: 0 positive product, 1 neutral product, 2 negative
    product, 3 positive video, 4 neutral video, 5 negative video,
    6 uninformative (spam / off-topic).

    Finally the seven per-class counts are printed on one line, in that order.
    """
    global data_type

    # (tags that trigger the class, position of the 1 in the one-hot label).
    # The rules are tried top to bottom and the first match wins.
    label_rules = [
        (("spam", "off-topic-or-undecidable"), 6),  # uninformative
        (("positive-product",), 0),
        (("negative-product",), 2),
        (("product-related",), 1),                  # neutral product
        (("positive-video",), 3),
        (("negative-video",), 5),
        (("video-related",), 4),                    # neutral video
    ]
    n_classes = 7
    counts = [0] * n_classes

    for file in files:
        json_text = parse(data_folder+"/"+category+"/"+file)
        for comment in json_text['comments']:
            if "annotation" not in comment:
                continue
            tags = comment["annotation"]

            # Ambiguous target: tagged as being about the product and the video.
            if "product-related" in tags and "video-related" in tags:
                continue

            # Ambiguous polarity: positive and negative at the same time.
            has_positive = "positive-product" in tags or "positive-video" in tags
            has_negative = "negative-product" in tags or "negative-video" in tags
            if has_positive and has_negative:
                continue

            row = [json_text['video_id'],json_text['video_description'],json_text['title'],comment['text']]
            for triggers, position in label_rules:
                if any(tag in tags for tag in triggers):
                    one_hot = [0] * n_classes
                    one_hot[position] = 1
                    row.append(one_hot)
                    data_type[set_index].append(row)
                    counts[position] += 1
                    break
    print(*counts)

In [None]:
# Dataset selection: files are read from data_folder/category/.
data_folder = "../../SenTube"
category = "automobiles_IT"

In [None]:
# os.listdir() returns the entries in arbitrary order, so sort them: without a
# stable input order the fixed random_state below would not give the same
# split on every machine / file system.
files = sorted(os.listdir(data_folder+"/"+category+"/"))

# The split is done at file level: half of the files form the test set, and
# 20% of the remaining files are held out as validation set.
# Indexes used for the splits in the cells below:
# 0: train set, 1: test set, 2: validation set

train_all, test = train_test_split(files,test_size=0.5,random_state=12,shuffle=True)
train, val = train_test_split(train_all,test_size=0.2,random_state=12,shuffle=True)

In [None]:
# One list of labelled rows per split, addressed by index through data_type
# (0: train, 1: test, 2: val).
data_type_train, data_type_test, data_type_val = [], [], []
data_type = [data_type_train, data_type_test, data_type_val]

In [None]:
# Fill each split's list in data_type and print its label distribution.
# The last element of each tuple is the index of the split in data_type.
for heading, split_files, split_index in (
    ("Labeldistribution train:", train, 0),
    ("Labeldistribution test:", test, 1),
    ("Labeldistribution val:", val, 2),
):
    print(heading)
    get_annotation(data_folder,category,split_files,split_index)

In [None]:
# Print the number of stored rows for each split, in data_type order
# (train, test, val).
for data in data_type:
    print (len(data))

In [None]:
# Per-split containers for the model inputs (X) and the labels (y); the slots
# of X_data / y_data are addressed by split index (0: train, 1: test, 2: val).
X_train, X_test, X_val = [], [], []
X_data = [X_train, X_test, X_val]

y_train, y_test, y_val = [], [], []
y_data = [y_train, y_test, y_val]

In [None]:
# Sequence lengths used for padding below:
#   max description length : 2446
#   max title length       : 80
#   max comment length     : 507

# Convert every stored row into id sequences and pad them per input type.
# Note that the slots of X_data / y_data are re-assigned here, so the
# X_train / y_train ... lists created earlier stay empty.
for i in range(3):
    descriptions, titles, comment_texts, targets = [], [], [], []
    for row in data_type[i]:
        # Keep this per-row order (description, title, comment): convertToIds
        # hands out new ids in call order, so reordering the calls would
        # change the resulting vocabulary.
        descriptions.append(convertToIds(row[1]))
        titles.append(convertToIds(row[2]))
        comment_texts.append(convertToIds(row[3]))
        targets.append(row[4])
    X_data[i] = [
        sq.pad_sequences(descriptions,maxlen=2446),
        sq.pad_sequences(titles,maxlen=80),
        sq.pad_sequences(comment_texts,maxlen=507),
    ]
    y_data[i] = targets

In [None]:
# Vocabulary size: number of distinct lower-cased tokens registered in word2id so far.
print(len(word2id))

In [None]:
# pickle.dump([X_data,y_data],open("corpus_automobiles_IT_full.p", "wb"))

In [None]:
# pickle.dump([word2id,id2word],open("word2id_id2word_automobiles_IT_full.p", "wb"))

In [None]:
def loadGloveModel(gloveFile):
    """Load a GloVe model and return it as a plain ``{word: vector}`` dict.

    Args:
        gloveFile: path handed to ``Glove.load``.

    Returns:
        A dict with one entry per word of the loaded model's ``dictionary``;
        each value is the row of ``word_vectors`` that the dictionary points to.
    """
    print("Loading Glove Model")
    glove_model = Glove.load(gloveFile)
    model = {
        token: glove_model.word_vectors[glove_model.dictionary[token]]
        for token in glove_model.dictionary
    }
    print("Done.",len(model)," words loaded!")
    return model

In [None]:
def loadRetrofitModel(open_model, dim=300):
    """Read retrofitted embeddings from an iterable of text lines.

    Expected line layout: ``word<TAB>v1;v2;...;vN`` (a trailing ``;`` is
    allowed). Only the first two tab-separated fields are used.

    Args:
        open_model: iterable of lines, e.g. an open text file.
        dim: number of values a vector must have to be accepted (default 300).

    Returns:
        A dict mapping each accepted word to its list of values. The values are
        kept as the strings read from the file; they are not converted to
        floats. Lines without a vector column are skipped silently; vectors of
        the wrong length are printed and skipped.
    """
    model_org = {}
    for line in open_model:
        fields = line.rstrip().split("\t")
        if len(fields) < 2:
            # Blank line or no tab-separated vector column. This is the case the
            # original bare `except: pass` was covering; checking it explicitly
            # no longer hides unrelated errors.
            continue
        word = fields[0]
        values = fields[1].rstrip(";").split(";")
        if len(values) == dim:
            model_org[word] = values
        else:
            print(word, values, len(values))

    return model_org

In [None]:
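# Load the embedding file: uncomment exactly ONE of the options below so that
# `model_org` is defined before the following cells are run.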

# glove =  "../IT_Embeddings/Italian_GloVe_embeddings.model"
# model_org = loadGloveModel(glove)

# model_org = gensim.models.Word2Vec.load('../IT_Embeddings/home/berardi/glove_WIKI') # glove model
# model_org = gensim.models.Word2Vec.load('../IT_Embeddings/home/common/word2vec/models/wiki_iter=5_algorithm=skipgram_window=10_size=300_neg-samples=10.m') # SKIP model

# model_org = gensim.models.FastText.load("../IT_Embeddings/FastText_default_IT.model")
# model_org = gensim.models.Word2Vec.load("../IT_Embeddings/SKIP_default_IT.model")
# model_org = gensim.models.Word2Vec.load("../IT_Embeddings/SKIP_negative10_IT.model")
# model_org = gensim.models.Word2Vec.load("../IT_Embeddings/CBOW_default_IT.model")

# model_org = pickle.load(open("../IT_Embeddings/model_org.p", "rb"),encoding='latin1')

# open_model = open('../IT_Embeddings/SKIP_default_IT_Retro_sentix.txt','r', encoding="utf-8")
# open_model = open('../IT_Embeddings/SKIP_negative10_IT_Retro_sentix.txt','r', encoding="utf-8")
# model_org = loadRetrofitModel(open_model)

# model_org = pickle.load(open("../IT_Embeddings/SKIP_IT_Change_inputSentiment_average.p", "rb"))
# model_org = pickle.load(open("../IT_Embeddings/SKIP_IT_Change_inputSentiment_deletedoubles.p", "rb"))
# model_org = pickle.load(open("../IT_Embeddings/SKIP_IT_Change_inputSentiment_weightsum.p", "rb"))

# model_org = pickle.load(open("../IT_Embeddings/SKIP_negative10_IT_Change_inputSentiment_average.p", "rb"))
# model_org = pickle.load(open("../IT_Embeddings/SKIP_negative10_IT_Change_inputSentiment_deletedoubles.p", "rb"))
# model_org = pickle.load(open("../IT_Embeddings/SKIP_negative10_IT_Change_inputSentiment_weightsum.p", "rb"))


In [None]:
# All-zero vector with the same length as the embedding stored for the word 'tu'.
# NOTE(review): needs model_org to be loaded and to contain 'tu'; unknowVec is
# not referenced again in the cells visible here.
unknowVec = np.zeros(len(model_org['tu']))

In [None]:
dim = 300

# Build the weight matrix W for the embedding layer: row i holds the vector of
# the word whose id is i. np.zeros already leaves row 0 (ids start at 1) and
# the 2 extra rows at the end all-zero.
# NOTE(review): the purpose of the 2 extra rows is not visible in this notebook.
W = np.zeros(shape=(len(word2id)+1+2, dim), dtype='float32')

# Dedicated seeded generator: the random vectors of out-of-vocabulary words are
# the same on every run, and numpy's global random state is left untouched.
oov_rng = np.random.RandomState(12)

count_in = 0   # words found in the pre-trained model
count_out = 0  # words initialised randomly

for word, i in word2id.items():
    if word in model_org:
        W[i] = model_org[word]
        count_in += 1
    else:
        W[i] = oov_rng.uniform(-0.25,0.25,dim)
        count_out += 1

print (count_in, count_out)
# Percentage of the vocabulary covered by the pre-trained model; guarded so an
# empty vocabulary does not raise ZeroDivisionError.
count_total = count_in + count_out
print(count_in/count_total*100 if count_total else 0.0)

In [None]:
# pickle.dump([W,word2id,id2word], open("automobiles_IT_full_skip_negative10_sentimentchange_weight.p", "wb"))