In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.ticker as ticker
import warnings
import scipy.stats
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from time import sleep
import sys
from sklearn.metrics import confusion_matrix
from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
import sklearn.feature_selection as fs
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

warnings.filterwarnings('ignore')

In [166]:
headers = ['Anonymized Message', 'Valence', 'Arousal']
dtypes = {'Anonymized Message':'str', 'Valence':'float', 'Arousal':'float'}
data = pd.read_csv("small_data.csv", encoding='utf8', skiprows=[0], names=headers, dtype=dtypes, na_filter=False)
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,Anonymized Message,Valence,Arousal
0,My birthday is tomorrow!!!! so excited,7.0,7
1,"JUST GOT LAID OFF AND NEED TO FIND SOME WORK, ...",4.0,5
2,Thanks everone for wishing me happy bday. Well...,6.0,4
3,Bazinga!,5.0,5
4,getting ready to go up snowboarding @ <PERSON>...,6.0,6


In [167]:
# set all words to lowercase and remove anything that isn't a letter or space
# data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: x.lower()) #transform text to lowercase
# data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: re.sub('[^a-zA-z0-9\s]', '', x))
# data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: re.sub('[_\\/(){}\[\]\|@,;]', '', x))

# find and remove all stop words
stop = stopwords.words('english')
data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [168]:
# Dictionary of all words from train corpus with their counts.
words_counts = {}
for comments in data['Anonymized Message']:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
DICT_SIZE = 5000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()
POPULAR_WORDS[:10]

['to', 'the', 'a', 'I', 'and', 'is', 'of', 'my', 'for', 'you']

In [169]:

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

BoW = (my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE) for text in data['Anonymized Message'])
BoW = pd.DataFrame(BoW)
print(BoW.shape)

(2895, 5000)


In [171]:
k = 10
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(penalty='l2', C=4, max_iter=10000)


for train_index, test_index in kf.split(data['Anonymized Message']):
    X_train, X_test = BoW.iloc[train_index,:], BoW.iloc[test_index,:]
    y_train, y_test = data['Arousal'].iloc[train_index], data['Arousal'].iloc[test_index]
    
    print(X_train.shape)
    print(y_train.shape)
    print(y_test.shape)
    model.fit(X_train, y_train)
    
    pred_values = model.predict(X_test)
    print('R: ', scipy.stats.pearsonr(pred_values, y_test)[1])

(2605, 5000)
(2605,)
(290,)
R:  3.6362623789692916e-05
(2605, 5000)
(2605,)
(290,)
R:  1.0117963262974067e-08
(2605, 5000)
(2605,)
(290,)
R:  1.7725303382023525e-06
(2605, 5000)
(2605,)
(290,)
R:  2.6727647648947384e-06
(2605, 5000)
(2605,)
(290,)


KeyboardInterrupt: 

In [90]:
def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    print('R2-score: ', r2_score(y_test, predicted))
#     print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
#     print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
#     print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))
    
print('Bag-of-words\n')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)

Bag-of-words

Accuracy:  470
F1-score macro:  0.15212009608041793
F1-score micro:  0.40587219343696024
F1-score weighted:  0.3453862461894369
R2-score:  0.01443525036042681
