In [61]:
import re
import numpy as np
import pandas as pd
import matplotlib.ticker as ticker
import warnings
import scipy.stats
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from time import sleep
import sys
from sklearn.metrics import confusion_matrix
from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import r2_score

warnings.filterwarnings('ignore')

In [5]:
headers = ['Anonymized Message', 'Valence', 'Arousal']
dtypes = {'Anonymized Message':'str', 'Valence':'float', 'Arousal':'float'}
data = pd.read_csv("dataset-fb-valence-arousal-anon.csv", encoding='utf8', skiprows=[0], names=headers, dtype=dtypes, na_filter=False)
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,Anonymized Message,Valence,Arousal
0,Strolling down memory lane in the name of new ...,6.0,4.0
1,Hey yall. I'm going to tennis camp tommorow.,5.0,3.0
2,,6.0,2.0
3,"... chocolate peanutbutter ice cream, yes plea...",6.0,5.0
4,NOTHING leave me alone!!!!!!!!!!!!!!!!!,4.0,9.0


In [25]:
# set all words to lowercase and remove anything that isn't a letter or space
data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: x.lower()) #transform text to lowercase
data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: re.sub('[^a-zA-z0-9\s]', '', x))
data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: re.sub('[_\\/(){}\[\]\|@,;]', '', x))

# find and remove all stop words
stop = stopwords.words('english')
data['Anonymized Message'] = data['Anonymized Message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [37]:
    # split X, y into training and testing 
    X, X_test, y, y_test = train_test_split(data['Anonymized Message'], data['Valence'], test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25)
    X_train[:6]

2389    3rd overall 50k fanball fantasy football sure ...
1480                   7 months im jobless homeless happy
5752                                                     
4758    wow havent thank everyone happy bday congrates...
2357    may 18th better day get move new house sex cra...
3974                                                bored
Name: Anonymized Message, dtype: object

In [38]:
# Dictionary of all words from train corpus with their counts.
words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
DICT_SIZE = 10000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()
POPULAR_WORDS[:10]

['im', 'person', 'day', 'like', 'love', 'today', 'time', 'go', 'one', 'get']

In [53]:

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape)

X_train shape  (3474, 10000) 
X_val shape  (1158, 10000)


In [54]:
def train_classifier(X_train, y_train, C, regularisation):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.

    model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
    return model

classifier_mybag = train_classifier(X_train_mybag, y_train, C = 4, regularisation = 'l2')

y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)

In [62]:
def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    print('R2-score: ', r2_score(y_test, predicted))
#     print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
#     print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
#     print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))
    
print('Bag-of-words\n')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)

Bag-of-words

Accuracy:  470
F1-score macro:  0.15212009608041793
F1-score micro:  0.40587219343696024
F1-score weighted:  0.3453862461894369
R2-score:  0.01443525036042681
