In [4]:
import re
import numpy as np
import pandas as pd
import matplotlib.ticker as ticker
import warnings
import scipy.stats
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from time import sleep
import sys
from sklearn.metrics import confusion_matrix

# Suppress every warning category for the rest of the session so library
# chatter does not clutter cell output.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above; narrow the filter if one of them starts misbehaving.
warnings.filterwarnings('ignore')

In [5]:
# Column labels and dtypes for the valence/arousal dataset: the message text
# plus its two numeric ratings.
headers = ['Anonymized Message', 'Valence', 'Arousal']
dtypes = {'Anonymized Message':'str', 'Valence':'float', 'Arousal':'float'}

# skiprows=[0] discards the file's own header row so that `names` supplies the
# column labels. na_filter=False keeps empty cells as '' instead of NaN, so the
# string operations applied to the messages later never receive a float.
data = pd.read_csv("dataset-fb-valence-arousal-anon.csv", encoding='utf8', skiprows=[0], names=headers, dtype=dtypes, na_filter=False)

# Shuffle the rows and renumber them 0..n-1. The shuffle is seeded: without a
# random_state every kernel restart produces a different row order, which in
# turn changes the train/test split and the vocabulary built from it, even
# though the split itself is seeded.
data = data.sample(frac=1, random_state=0).reset_index(drop=True)
data.head()

Unnamed: 0,Anonymized Message,Valence,Arousal
0,Strolling down memory lane in the name of new ...,6.0,4.0
1,Hey yall. I'm going to tennis camp tommorow.,5.0,3.0
2,,6.0,2.0
3,"... chocolate peanutbutter ice cream, yes plea...",6.0,5.0
4,NOTHING leave me alone!!!!!!!!!!!!!!!!!,4.0,9.0


In [25]:
# Normalise the message text: lowercase it, then remove every character that
# is not a lowercase letter, a digit or whitespace.
# The class must be written `a-z`, not `A-z`: the range `A-z` also spans the
# ASCII characters [ \ ] ^ _ and `, which would let backslashes, carets and
# backticks survive the cleanup. The pattern is a raw string so that `\s`
# reaches the regex engine instead of being treated as a string escape.
NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')


def clean_message(text):
    """Return `text` lowercased with everything except a-z, 0-9 and whitespace removed."""
    return NON_ALNUM_RE.sub('', text.lower())


data['Anonymized Message'] = data['Anonymized Message'].apply(clean_message)

# find and remove all stop words
stop = stopwords.words('english')
# The messages no longer contain apostrophes at this point, so stop words such
# as "i'm" or "don't" can only match once they are cleaned the same way
# ("im", "dont"). A set keeps the per-word membership test constant-time.
STOP_WORDS = {clean_message(word) for word in stop}
data['Anonymized Message'] = data['Anonymized Message'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in STOP_WORDS)
)

In [26]:
    # Hold out 20% of the rows for testing: the cleaned message text is the
    # feature and the Valence rating is the target. random_state=0 pins the split.
    X_train, X_test, y_train, y_test = train_test_split(
        data['Anonymized Message'],
        data['Valence'],
        test_size=0.2,
        random_state=0,
    )
    # Preview the first six training messages.
    X_train.head(6)

4266                                                     
1573    saw paranormal activity yesterday awsome good ...
5397                                                     
2265                                                     
3905                                                     
4265    still need help horse stable need 6 nail 4 hor...
Name: Anonymized Message, dtype: object

In [28]:
# Count how many times each word occurs in the training corpus.
words_counts = {}
for comments in X_train:
    for word in comments.split():
        # .get(word, 0) makes the first sighting count as 1. Initialising a new
        # word to 1 and then incrementing it would start every word at 2.
        words_counts[word] = words_counts.get(word, 0) + 1

# Vocabulary size: only the DICT_SIZE most frequent training words are kept.
DICT_SIZE = 10000
# Words ordered from most to least frequent; ties keep first-seen order
# because sorted() is stable.
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
# Two-way lookup between a word and its frequency rank (0 = most frequent).
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
# Live view of the vocabulary keys, for membership tests.
ALL_WORDS = WORDS_TO_INDEX.keys()
POPULAR_WORDS[:10]

['im', 'person', 'day', 'love', 'like', 'got', 'one', 'time', 'today', 'go']