# AGDML-Lab Final
Task: Sentiment Analysis of Twitter messages

## Preprocessing
For preprocessing I used NLTK library. I removed non-alphabetic characters, made words lowercase, removed mentions of other users, removed stopwords and lemmatized each word to its lemma. This should make the data more consistent and easier to work with. My assumption is: most of the spelling mistakes and special characters are unnecessary for sentiment analysis.

The regex will most likely match stuff that we do not want removed, but that is a tradeoff we accept.

In [136]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch

N = 1000

tqdm.pandas()
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1060 6GB'

The dataset is evenly balanced. There is no bias towards negative or positive messages.

In [137]:
# preprocessing text messages
import re
import nltk
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer

# download stopwords and wordnet
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

# create object of WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# this allows very
stopwords = set(stopwords.words('english'))
stopwords.remove('very')
print(stopwords)

correct_words = [str.lower(w) for w in words.words()]

# BEGIN SOURCE http://norvig.com/spell-correct.html
from collections import Counter

WORDS = Counter(correct_words)

def P(word, N=sum(WORDS.values())):
    """Probability of `word`."""
    return WORDS[word] / N

def correction(word):
    """Most probable spelling correction for word."""
    return max(candidates(word), key=P)

def candidates(word):
    """Generate possible spelling corrections for word."""
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words):
    """The subset of `words` that appear in the dictionary of WORDS."""
    return set(w for w in words if w in words)

def edits1(word):
    """All edits that are one edit away from `word`."""
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """All edits that are two edits away from `word`."""
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
# END SOURCE

# function to clean sentences
def clean_text(sentence):
    # remove mentions of other users
    sentence = re.sub('\B@[._a-zA-Z0-9]{3,24}', '', sentence)
    
    # rewrite words in all caps to "very" followed by word
    # text = re.sub('([A-Z]+)', lambda x: 'very ' + x.group(0).lower(), text)
    
    # make words lowercase, because Go and go will be considered as two words
    sentence = sentence.lower()
    
    # detect laughing
    sentence = re.sub(r'\b(?:a*(?:ha)+h?|(?:l+o+)+l+)\b', ' laughing ', sentence)
    
    # remove multiple dots
    sentence = re.sub(r'(\.)\1{2,}', '\1', sentence)
    
    # remove URLs from text (prefer safely!)
    sentence = re.sub('https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', ' ', sentence)
    
    # remove everything but letters
    sentence = re.sub('[^a-z]', ' ', sentence)
    
    # split the sentences into words
    _words = sentence.split() 
    
    for i in range(len(_words)):
        # remove words with length 1
        if len(_words[i]) == 1:
            _words[i] = ''
    
        # remove repetition of letters
        _words[i] = re.sub(r'([a-z])\1{3,}', r'\1', _words[i])
    
        # prepend very if there were repeating letters
        # if _tmp[1] > 0:
        #     words[i] = "very " + _tmp[0] // TODO if this is added also add back _tmp variable and rewrite sub to subn
    
    # remove stopwords like to, and, or etc.
    # _words = [word for word in _words if word not in stopwords]
    
    # spell-check
    _words = [correction(word) for word in _words]
    
    # remove words if they are unknown
    # _words = ['' if word not in WORDS else word for word in _words]
    
    # lemmatize each word
    _words = [wordnet_lemmatizer.lemmatize(word) for word in _words]
    
    # join words to make sentence
    sentence = ' '.join(_words)
    # remove multiple spaces
    sentence = re.sub('\s+', ' ', sentence)
    sentence = sentence.strip()
    
    return sentence

{'no', 'further', 'from', 'they', 'or', 'herself', 'there', 'too', 'the', 'can', 'hadn', 'won', "shouldn't", 'an', 'him', 'did', 'my', 'where', "hasn't", 'all', 'you', 'up', 'then', "isn't", 'ain', 'only', 'own', 'through', 'below', 'hasn', 's', 'because', 've', "should've", 'both', "mustn't", "you'd", 'a', 'ma', 'what', 'other', 'should', 'doing', "doesn't", 'at', "needn't", 'any', "mightn't", 'yourself', 'why', 'theirs', 'not', 'this', 'which', 'in', 'over', 'been', 'does', 'as', 'few', 'here', 'most', 'me', 'whom', 'm', 't', 'isn', 'were', "you're", 'was', 'when', 'yourselves', 'those', 'himself', 'once', "couldn't", 'o', 'themselves', 'of', 'her', 'haven', 'wasn', 'your', 'nor', 'between', "aren't", 'our', 'wouldn', "won't", "that'll", 'do', 'aren', 'each', 'some', 'such', 'same', 'ours', "you've", 'them', "shan't", 'had', 'its', 'd', 'out', 'we', 'is', 'so', 're', 'll', 'he', "weren't", 'having', 'than', 'just', 'itself', 'during', 'and', 'have', 'am', 'before', "you'll", 'shan', 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\frand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\frand\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\frand\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [138]:
to_be_cleaned_examples = [
    "sooooooooooo full .... BBQ was great ..... lovely day ! ",
    "@symphnysldr lets do it",
    ":3 Up and ready for a full day of doing noithing. Apart from finishing new picture, animation, more guitar, tiding my rooms. And homework ",
    "@carswani yeh i need to do another,now that im like u and have sum white face paint..but um..im ok..just tired",
    "SCOTUS decides that having convicted someone removes their rights to bring evidence that could prove their innocence.  http://tr.im/oXqj"
]
cleaned_examples = [clean_text(c) for c in to_be_cleaned_examples]

pd.DataFrame({'raw': to_be_cleaned_examples, 'cleaned': cleaned_examples})

Unnamed: 0,raw,cleaned
0,sooooooooooo full .... BBQ was great ..... lov...,so full bbq wa great lovely day
1,@symphnysldr lets do it,let do it
2,:3 Up and ready for a full day of doing noithi...,up and ready for full day of doing noithing ap...
3,"@carswani yeh i need to do another,now that im...",yeh need to do another now that im like and ha...
4,SCOTUS decides that having convicted someone r...,scotus decides that having convicted someone r...


In [139]:
# read training data
df = pd.read_csv('data.csv')

# read validation data
df_test = pd.read_csv('data_valid.csv')

ct = len(df.loc[df['target'] == 1]) / len(df)
"positive messages", ct, "negative messages", 1-ct 

('positive messages', 0.499076, 'negative messages', 0.5009239999999999)

In [140]:
# drop rows with missing values
df = df.dropna()
df_test = df_test.dropna()

# clean text data
df['text'] = df['text'].progress_apply(clean_text)
df_test['text'] = df_test['text'].progress_apply(clean_text)

# drop rows with missing values
df = df.dropna()
df_test = df_test.dropna()

df.to_csv('data_cleaned.csv', index=False)    
df_test.to_csv('data_valid_cleaned.csv', index=False)

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [141]:
# remove words which occur less than O times
O = 100
    
training_text = (' '.join(df['text'])).split()
test_text = (' '.join(df_test['text'])).split()
dataset_counter = Counter(training_text + test_text)

dataset_counter.most_common()[-1000:]

[('statisitic', 1),
 ('quuens', 1),
 ('spcy', 1),
 ('talis', 1),
 ('razzing', 1),
 ('wrost', 1),
 ('daddyduty', 1),
 ('horroble', 1),
 ('exasperation', 1),
 ('imnotjoanna', 1),
 ('rtb', 1),
 ('balum', 1),
 ('unforshenetlee', 1),
 ('afterwardst', 1),
 ('deceptive', 1),
 ('muthafucker', 1),
 ('hovadoposrate', 1),
 ('thehhee', 1),
 ('palapa', 1),
 ('pice', 1),
 ('mahoney', 1),
 ('xkame', 1),
 ('munab', 1),
 ('milkshakefail', 1),
 ('tomfelton', 1),
 ('klinsmann', 1),
 ('navin', 1),
 ('matthartley', 1),
 ('extendable', 1),
 ('caran', 1),
 ('castlebar', 1),
 ('inpound', 1),
 ('ybjf', 1),
 ('phocking', 1),
 ('hasing', 1),
 ('meck', 1),
 ('remeniscing', 1),
 ('kxhz', 1),
 ('showr', 1),
 ('shael', 1),
 ('codechef', 1),
 ('kleineee', 1),
 ('hppnd', 1),
 ('hurrrayyy', 1),
 ('pharoh', 1),
 ('pae', 1),
 ('crochetting', 1),
 ('couuld', 1),
 ('jbc', 1),
 ('btlgy', 1),
 ('yeeew', 1),
 ('frwends', 1),
 ('prospecting', 1),
 ('cheekies', 1),
 ('ugss', 1),
 ('haahahahah', 1),
 ('oatbran', 1),
 ('awakke', 

In [142]:
def remove_infrequent_words(sentence):
    _words = sentence.split()
    _words = [word if dataset_counter[word] >= O else ' ' for word in _words]
    sentence = ' '.join(_words)
    sentence = re.sub('\s+', ' ', sentence)
    sentence = sentence.strip()
    return sentence
    
df['text'] = df['text'].progress_apply(remove_infrequent_words)
df_test['text'] = df_test['text'].progress_apply(remove_infrequent_words)

df

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

Unnamed: 0,target,text
0,0,this is my last tweet of the day so goodnight ...
1,1,laughing okay yeah ll vote for her right now d...
2,0,gonna feel like shit at uni today still up on
3,0,can find my phone charger so my service over t...
4,0,it wa just to black though
...,...,...
499995,1,watching twilight text me
499996,1,for me it is usual day for me fighting with my...
499997,0,question for you can stress re yesterday wa an...
499998,0,okay then made me sad


In [143]:
# read the csv file
# df = pd.read_csv('data_cleaned.csv')
# df_test = pd.read_csv('data_valid_cleaned.csv')
# 
# df

In [144]:
# def drop_not_string(_df, column):
#     return _df.drop(_df[_df[column].apply(lambda x: isinstance(x, str)) == False].index)
# 
# drop_not_string(df, 'text')
# drop_not_string(df_test, 'text')

X = df['text']
y = df['target']

X_test = df_test['text']

# make them all strings
X = X.astype(str)
X_test = X_test.astype(str)

"sizes:", X.shape, y.shape, X_test.shape

('sizes:', (500000,), (500000,), (100000,))

## Word2Vec
Using the Gensim implementation of Google's Word2Vec.

In [145]:
from gensim.models import Word2Vec
sentences = [sentence.split() for sentence in X]
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4, hs=1 , negative=0)

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_w2v = np.array([vectorize(sentence) for sentence in tqdm(X)])
X_test_w2v = np.array([vectorize(sentence) for sentence in tqdm(X_test)])

X_w2v.shape

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

(500000, 100)

## Skip-Gram

## Mapping Visualisation
The 100-dimensional Word2Vec mapping of our features is reduced to a 2-dimensional space. The colouring is according to the label.

# Classifiers
This compares different classifiers. 

In [146]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB

embeddings = [{"name": "w2v", "data": X_w2v}, ]
classifiers = [LogisticRegression(), RidgeClassifier(), GaussianNB()]

for e in embeddings:
    _X_train, _X_test, _y_train, _y_test = train_test_split(e["data"], y, test_size=0.33)
    train_acc = []
    test_acc = []
    
    for c in tqdm(classifiers):
        c.fit(_X_train, _y_train)
        _y_hat_train = c.predict(_X_train)
        _y_hat_test = c.predict(_X_test)
        
        train_acc.append(accuracy_score(_y_hat_train, _y_train))
        test_acc.append(accuracy_score(_y_hat_test, _y_test))
        
pd.DataFrame({'embedding': list(map(lambda e: e["name"], embeddings))*len(classifiers), 'classifier': classifiers, 'train accuracy': train_acc, 'test accuracy': test_acc})

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,embedding,classifier,train accuracy,test accuracy
0,w2v,LogisticRegression(),0.751313,0.749612
1,w2v,RidgeClassifier(),0.747785,0.747115
2,w2v,GaussianNB(),0.649322,0.650255


Save prediction with Logistic Regression to file.

In [147]:
lr = LogisticRegression()
lr.fit(X_w2v, y)
y_pred = lr.predict(X_w2v[:N])
accuracy_score(y[:N], y_pred)

y_pred_test = lr.predict(X_test_w2v)
np.save('y_pred.npy', y_pred_test)

## Transformer
Using a pre-trained transformer for sentiment analysis from Huggingface.
Score on the validation set: 0.71

This method is therefore worse than what I used before...