# Data processing

In [25]:
import os
import logging
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
# InteractiveShell.ast_node_interactivity = 'all'


In [26]:
# plt.style.library['ggplot']
# 'axes.prop_cycle': cycler('color', ['#E24A33', '#348ABD', '#988ED5', '#777777', '#FBC15E', '#8EBA42', '#FFB5B8']),

In [27]:
# Set up a logger object to print info about the training run to stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [logging.StreamHandler()]

In [28]:
# read data
dataset_sentence = pd.read_csv('data/datasetSentences.txt', delimiter='\t')  # sentence_index - sentence
dictionary = pd.read_csv('data/dictionary.txt', delimiter='|', header=None)  # phrase - phrase ids
sentiment_labels= pd.read_csv('data/sentiment_labels.txt', delimiter='|')  # phrase ids - sentiment values
dataset_split= pd.read_csv('data/datasetSplit.txt', delimiter = ',')  # sentence_index - splitset_label

dictionary=dictionary.rename(columns={0:'phrase',1:'phrase ids'})
sentiment_labels['sentiment']=sentiment_labels['sentiment values'].apply(lambda x: 5 if x>0.8 
                                     else 4 if x>0.6
                                     else 3 if x>0.4
                                     else 2 if x>0.2
                                     else 1)

In [29]:
# merge data
dataset_label = pd.merge(dataset_sentence, dataset_split)  # sentence_index - sentence - splitset_label
phrase_label = pd.merge(dictionary, sentiment_labels)  # phrase - phrase ids - sentiment values - sentiment
dataset = pd.merge(dataset_label, phrase_label, how='left', left_on='sentence', right_on='phrase')
dataset = dataset.drop(columns=['sentence_index','sentence','phrase ids'])
dataset = dataset.dropna(axis=0, how='any')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11286 entries, 0 to 11854
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   splitset_label    11286 non-null  int64  
 1   phrase            11286 non-null  object 
 2   sentiment values  11286 non-null  float64
 3   sentiment         11286 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 440.9+ KB


In [30]:
# split dataset
train_set=dataset[dataset['splitset_label']==1]
valid_set=dataset[dataset['splitset_label']==3]
test_set=dataset[dataset['splitset_label']==2]
print('train_set: {}, valid_set: {}, test_set: {}'.format(train_set.shape[0], valid_set.shape[0], test_set.shape[0]))


train_set: 8117, valid_set: 1044, test_set: 2125


In [31]:
# extract phrases
SOStr = []
STree = []
with open('data/SOStr.txt', 'r') as f:
    for line in f.readlines():
        SOStr.append(line.strip().split('|'))
with open('data/STree.txt', 'r') as f:
    for line in f.readlines():
        STree.append(list(map(int, line.strip().split('|'))))

def get_indices(_list, _value):
    indices = []
    for idx, value in enumerate(_list):
        if value == _value:
            indices.append(idx+1)  # +1 beacause the n0 is the root
    return indices


def get_parse_tree(tokens, parents, cur_tree, cur_key, cur_value):
    if cur_value <= len(tokens):  # leaf node
        cur_tree[cur_key] = tokens[cur_value - 1]
    else:  # sub_tree
        [left_value, right_value] = get_indices(parents, cur_value)
        cur_tree[cur_key] = {'left':left_value, 'right':right_value}
        get_parse_tree(tokens, parents, cur_tree[cur_key], 'left', left_value)
        get_parse_tree(tokens, parents, cur_tree[cur_key], 'right', right_value)



# parse_tree = {'root': max(STree[2])}
# get_parse_tree(SOStr[2], STree[2], parse_tree, 'root', max(STree[2]))
# parse_tree

Offers|that|rare|combination|of|entertainment|and|education|.
16|14|13|13|12|10|10|11|17|11|12|15|14|15|16|17|0

Effective|but|too-tepid|biopic
6|6|5|5|7|7|0

# Word2vec

In [88]:
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.word2vec import Word2Vec

In [35]:
train_label = train_set['sentiment']
valid_label = valid_set['sentiment']
test_label = test_set['sentiment']

In [93]:
def sentence_vectorizing(sentence, dim, model):
    vec = np.zeros((1, dim))
    count = 0
    for word in sentence:
        try:  # some words are ignored
            vec += model.wv[word].reshape((1, dim))
            count += 1
        except:
            continue
    return vec / count


In [99]:
vectorizing = 'word2vec'
if vectorizing == 'word2vec':
    dim = 500
    w2v = Word2Vec(vector_size=dim, min_count=10)  #  Ignores all words with total frequency lower than 10.
    w2v.build_vocab(dataset['phrase'])
    w2v.train(dataset['phrase'], total_examples=w2v.corpus_count, epochs=10)
    train_data = np.concatenate([sentence_vectorizing(sentence, dim, w2v) for sentence in train_set['phrase']])
    valid_data = np.concatenate([sentence_vectorizing(sentence, dim, w2v) for sentence in valid_set['phrase']])
    test_data = np.concatenate([sentence_vectorizing(sentence, dim, w2v) for sentence in test_set['phrase']])
else:
    if vectorizing == 'top n': 
        features_selected = []
        for i in range(1, 6):
            vectorizer = CountVectorizer(max_features=500)
            vectorizer.fit(dataset[dataset['sentiment'] == i]['phrase'])
            features_selected.extend(vectorizer.vocabulary_.keys())
        features_selected = set(features_selected)
        vectorizer = CountVectorizer()
        vectorizer.fit(features_selected)
    else:
        if vectorizing == 'one hot':
            vectorizer = OneHotEncoder()
        elif vectorizing == 'word count':
            vectorizer = CountVectorizer()  # including stopwords
        elif vectorizing == 'tf-idf':
            vectorizer = TfidfVectorizer()
        elif vectorizing == 'n-gram':
            vectorizer = TfidfVectorizer(ngram_range=(3,3))

        vectorizer.fit(dataset['phrase'])

    train_data = vectorizer.transform(train_set['phrase']).toarray()
    valid_data = vectorizer.transform(valid_set['phrase']).toarray()
    test_data = vectorizer.transform(test_set['phrase']).toarray()
 
train_data.shape

Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=500, alpha=0.025>', 'datetime': '2022-11-16T01:14:34.081481', 'gensim': '4.2.0', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
collecting all words and their counts
Each 'sentences' item should be a list of words (usually unicode strings). First item here is instead plain <class 'str'>.
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 1030807 words, keeping 81 word types
collected 81 word types from a corpus of 1139899 raw words and 11286 sentences
Creating a fresh vocabulary
Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 77 unique words (95.06% of original 81, drops 4)', 'datetime': '2022-11-16T01:14:34.164483', 'gensim': '4.2.0', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 

(2290770, 11398990)

(8117, 500)