# AGDML-Lab Final
Task: Sentiment Analysis of Twitter messages

## Preprocessing
For preprocessing I used the NLTK library. I removed non-alphabetic characters, lowercased all words, removed mentions of other users, removed stopwords and reduced each word to its lemma (note: in the current version of `clean_text` below, stopword removal and lemmatization are commented out and a spell-correction step is applied instead). This should make the data more consistent and easier to work with. My assumption is that most spelling mistakes and special characters are unnecessary for sentiment analysis.

The regular expressions will most likely also match some text that we would prefer to keep, but that is a trade-off we accept.

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch

# Number of rows used for the UMAP visualisations further down.
N = 1000

# Register `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()
# Show the active GPU; fall back to a plain label instead of raising when
# no CUDA device is available.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu (no CUDA device available)"

The dataset is evenly balanced. There is no bias towards negative or positive messages.

In [None]:
# preprocessing text messages
import re
import nltk
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer, PorterStemmer

# fetch the NLTK resources this notebook relies on
for _resource in ('stopwords', 'wordnet', 'words', 'sentiwordnet'):
    nltk.download(_resource)

# normalisers for the optional lemmatize / stem steps in clean_text
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()

# English stopwords as a set, so membership tests are cheap.
# NOTE: this rebinds the imported `stopwords` corpus name to a plain set.
stopwords = set(stopwords.words('english'))

# lowercased NLTK word list, used as the spell-correction vocabulary
correct_words = [w.lower() for w in words.words()]

# BEGIN SOURCE http://norvig.com/spell-correct.html
from collections import Counter

# Word -> count table queried by the spell-correction functions below.
# NOTE(review): `correct_words` comes from a word list rather than running
# text, so counts are presumably ~1 each and P() is close to uniform — confirm.
WORDS = Counter(correct_words)

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word` in WORDS.

    `N` defaults to the total of all counts in WORDS, evaluated once when
    the function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate for `word` that has the highest P()."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible spelling corrections for `word`.

    The first non-empty result wins, tried in this order: the word itself,
    known words one edit away, known words two edits away. If none of them
    yields anything, `[word]` is returned unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    # only generate the (much larger) two-edit neighbourhood when needed
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words, vocabulary=None):
    """Return the subset of `words` that appear in the dictionary.

    Args:
        words: iterable of candidate strings (may be a generator).
        vocabulary: container used for the membership test; defaults to the
            module-level WORDS counter.

    Returns:
        A set with the candidates that are present in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = WORDS
    # Membership must be checked against the vocabulary, not against `words`
    # itself: testing `w in words` accepts every candidate.
    return {w for w in words if w in vocabulary}

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is a single-character deletion, an adjacent transposition, a
    replacement by a lowercase ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    neighbours = set()

    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # insertion is possible at every position, including the end
        for letter in alphabet:
            neighbours.add(head + letter + tail)

        if not tail:
            continue

        # deletion of the character right after the cut
        neighbours.add(head + tail[1:])

        # replacement of that character
        for letter in alphabet:
            neighbours.add(head + letter + tail[1:])

        # swap of the two characters right after the cut
        if len(tail) > 1:
            neighbours.add(head + tail[1] + tail[0] + tail[2:])

    return neighbours

def edits2(word):
    """Return a lazy iterator over all strings two edits away from `word`.

    Duplicates are not removed; every result of applying edits1 to each
    one-edit neighbour is produced.
    """
    one_away = edits1(word)
    return (two_away for intermediate in one_away for two_away in edits1(intermediate))
# END SOURCE

# function to clean sentences
def clean_text(sentence, spell_corrector=None):
    """Normalise a raw tweet into a lowercase, letters-only string.

    Steps, in order: strip @mentions, lowercase, replace laughter ("haha",
    "lol", ...) with the token "laughing", collapse runs of dots, strip URLs,
    replace every non-letter with a space, collapse letters repeated four or
    more times, drop one-letter words and spell-correct what is left.

    Args:
        sentence: raw message text.
        spell_corrector: callable mapping a word to its corrected form.
            Defaults to the module-level `correction`.

    Returns:
        The cleaned words joined by single spaces ("" if nothing survives).
    """
    if spell_corrector is None:
        spell_corrector = correction

    # Every pattern is a raw string: in a plain literal '\b' is a backspace
    # character and '\1' is chr(1), which silently breaks the regex.

    # remove mentions of other users
    sentence = re.sub(r'\B@[._a-zA-Z0-9]{3,24}', '', sentence)

    # disabled: rewrite words in all caps to "very" followed by the word
    # sentence = re.sub('([A-Z]+)', lambda x: 'very ' + x.group(0).lower(), sentence)

    # make words lowercase, because Go and go will be considered as two words
    sentence = sentence.lower()

    # detect laughing
    sentence = re.sub(r'\b(?:a*(?:ha)+h?|(?:l+o+)+l+)\b', ' laughing ', sentence)

    # collapse three or more consecutive dots into a single dot
    sentence = re.sub(r'(\.)\1{2,}', r'\1', sentence)

    # remove URLs from text (the text is already lowercase at this point)
    sentence = re.sub(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',
        ' ',
        sentence,
    )

    # remove everything but letters
    sentence = re.sub(r'[^a-z]', ' ', sentence)

    # split into words and collapse letters repeated four or more times
    _words = [re.sub(r'([a-z])\1{3,}', r'\1', word) for word in sentence.split()]

    # drop one-letter words *before* spell-checking, so that an empty
    # placeholder is never handed to the corrector
    _words = [word for word in _words if len(word) > 1]

    # disabled: remove stopwords like to, and, or etc.
    # _words = [word for word in _words if word not in stopwords]

    # spell-check
    _words = [spell_corrector(word) for word in _words]

    # disabled: remove words if they are unknown
    # _words = ['' if word not in WORDS else word for word in _words]

    # disabled: lemmatize each word
    # _words = [wordnet_lemmatizer.lemmatize(word) for word in _words]

    # disabled: porter stem word
    # _words = [porter_stemmer.stem(word) for word in _words]

    # join the words with single spaces; split/join also removes any empty
    # entries or stray whitespace a corrector may have produced
    return ' '.join(' '.join(_words).split())

In [None]:
# A handful of raw tweets to eyeball what `clean_text` produces.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python and would silently merge two examples into one.
to_be_cleaned_examples = [
    "sooooooooooo full .... BBQ was great ..... lovely day ! ",
    "@symphnysldr lets do it",
    ":3 Up and ready for a full day of doing noithing. Apart from finishing new picture, animation, more guitar, tiding my rooms. And homework ",
    "@carswani yeh i need to do another,now that im like u and have sum white face paint..but um..im ok..just tired",
    "SCOTUS decides that having convicted someone removes their rights to bring evidence that could prove their innocence.  http://tr.im/oXqj",
    "Finally got my watch fixed... Its only been 7 months.. As me what time it is  http://myloc.me/2Vti",
]
cleaned_examples = [clean_text(c) for c in to_be_cleaned_examples]

# side-by-side view of raw and cleaned text
pd.DataFrame({'raw': to_be_cleaned_examples, 'cleaned': cleaned_examples})

In [None]:
# load the training split
df_clean = pd.read_csv('data.csv')

# load the validation split
df_test_clean = pd.read_csv('data_valid.csv')

# share of training rows labelled positive (target == 1)
ct = len(df_clean[df_clean['target'] == 1]) / len(df_clean)
"positive messages", ct, "negative messages", 1 - ct

## Clean the data

In [None]:
# drop rows with missing values
df_clean = df_clean.dropna()
df_test_clean = df_test_clean.dropna()

# Keep the raw text for later comparisons. The explicit copy makes these
# snapshots independent of the column overwrite below, regardless of
# pandas' view/copy behaviour.
X_unprocessed = df_clean['text'].copy()
X_test_unprocessed = df_test_clean['text'].copy()

# clean text data
df_clean['text'] = df_clean['text'].progress_apply(clean_text)
df_test_clean['text'] = df_test_clean['text'].progress_apply(clean_text)

# drop rows with missing values
df_clean = df_clean.dropna()
df_test_clean = df_test_clean.dropna()

# NOTE(review): the file names say "no_spell" although clean_text currently
# runs a spell-check step — confirm the names still describe the content.
df_clean.to_csv('data_cleaned_no_spell.csv', index=False)
df_test_clean.to_csv('data_valid_cleaned_no_spell.csv', index=False)

In [None]:
# words occurring fewer than O times get removed later on
O = 100

# flatten each split into one list of tokens
training_text = [token for text in df_clean['text'] for token in text.split()]
test_text = [token for text in df_test_clean['text'] for token in text.split()]

# token frequencies over both splits together
dataset_counter = Counter(training_text)
dataset_counter.update(test_text)

# peek at the 1000 least frequent tokens
dataset_counter.most_common()[-1000:]

In [None]:
def remove_infrequent_words(sentence, counter=None, min_count=None):
    """Drop every word of `sentence` that is rarer than `min_count`.

    Args:
        sentence: whitespace-separated words.
        counter: mapping from word to frequency that returns 0 for unseen
            words (e.g. a `Counter`); defaults to the module-level
            `dataset_counter`.
        min_count: minimum frequency a word needs to be kept; defaults to
            the module-level `O`.

    Returns:
        The surviving words joined by single spaces ("" if none survive).
    """
    if counter is None:
        counter = dataset_counter
    if min_count is None:
        min_count = O
    # Filtering directly gives the same result as blanking words and then
    # collapsing whitespace, without needing a regex.
    frequent = [word for word in sentence.split() if counter[word] >= min_count]
    return ' '.join(frequent)
    
# apply the frequency filter to the text column of both splits
for _frame in (df_clean, df_test_clean):
    _frame['text'] = _frame['text'].progress_apply(remove_infrequent_words)

df_clean

In [None]:
# def drop_not_string(_df, column):
#     return _df.drop(_df[_df[column].apply(lambda x: isinstance(x, str)) == False].index)
# 
# drop_not_string(df, 'text')
# drop_not_string(df_test, 'text')

# training features (forced to str) and labels
X_clean = df_clean['text'].astype(str)
y = df_clean['target']

# validation features, also forced to str
X_test_clean = df_test_clean['text'].astype(str)

"sizes:", X_clean.shape, y.shape, X_test_clean.shape

## VADER Sentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def vader(sentence):
    """Score `sentence` with the shared VADER analyzer.

    Returns a length-4 array ordered as (neg, neu, pos, compound).
    """
    scores = analyzer.polarity_scores(sentence)
    return np.array([scores[key] for key in ('neg', 'neu', 'pos', 'compound')])

# VADER scores for every message in the training frame, one row per message
_temp = df_clean['text']
vader_X = np.array(list(map(vader, _temp)))

# split off the compound score; keep (neg, neu, pos) as the feature matrix
vader_X_compound, vader_X = vader_X[:, 3], vader_X[:, [0, 1, 2]]

from sklearn.preprocessing import PolynomialFeatures
# polynomial expansion of the three VADER features, using the default settings
vader_X_poly = PolynomialFeatures().fit_transform(vader_X, y)

In [None]:
from sklearn.model_selection import train_test_split

# hold out a third of the raw messages; a message counts as positive when
# its VADER compound score is >= 0
_, _X_test, _, _y_test = train_test_split(X_unprocessed, y, test_size=0.33)
_y_hat_test = np.array([vader(x)[3] for x in _X_test]) >= 0
"accuracy vader on unprocessed", np.count_nonzero(_y_hat_test == _y_test) / len(_y_test)

In [None]:
# Same evaluation as for the raw text, but on the cleaned messages.
# A message counts as positive when its VADER compound score is >= 0.
_, _X_test, _, _y_test = train_test_split(X_clean, y, test_size=0.33)
_y_hat_test = np.array([vader(x)[3] >= 0 for x in _X_test])
# the label names the data actually evaluated here (X_clean)
"accuracy vader on cleaned", len(np.where(_y_hat_test == _y_test)[0])/len(_y_test)

In [None]:
# Spot-check: (neg, neu, pos, compound) scores for one sample sentence.
vader("going to save up for new camera wish me luck")

### remove neutral words
We remove neutral words (below a certain threshold) to reduce complexity. 

In [None]:
# Inspect the raw VADER scores of a single word.
analyzer.polarity_scores("bias")

In [None]:
def remove_neutral(sentence, threshold=0.1):
    """Drop the words of `sentence` that VADER scores as neutral.

    Each word is scored on its own and kept only when its 'neu' score
    (index 1 of the `vader` result) is at most `threshold`.

    Args:
        sentence: whitespace-separated words.
        threshold: highest 'neu' score a word may have and still be kept.

    Returns:
        The kept words joined by single spaces ("" if none are kept).
    """
    # Filtering directly gives the same result as blanking words and then
    # collapsing whitespace, without needing a regex.
    kept = [w for w in sentence.split() if vader(w)[1] <= threshold]
    return ' '.join(kept)

# strip neutral words from the text column of both splits
# NOTE(review): X_clean / X_test_clean were extracted from these columns in
# an earlier cell, so cells that use them presumably do not see this
# filtering — confirm that is intended.
for _frame in (df_clean, df_test_clean):
    _frame['text'] = _frame['text'].progress_apply(remove_neutral)

## Word2Vec
Using the Gensim implementation of Google's Word2Vec.
First, we train it on the cleaned data.

In [None]:
from gensim.models import Word2Vec

def w2v_map(_X, _X_test):
    """Train Word2Vec on `_X` and embed both collections as mean word vectors.

    Args:
        _X: iterable of whitespace-tokenisable training sentences; the model
            is trained on these only.
        _X_test: iterable of sentences to embed with the same model.

    Returns:
        A pair `(train_matrix, test_matrix)` with one row per sentence. A
        sentence without any word in the model vocabulary maps to zeros.
    """
    sentences = [sentence.split() for sentence in _X]
    w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4, hs=1 , negative=0)
    word_vectors = w2v_model.wv
    # take the dimensionality from the model instead of hard-coding it
    dim = word_vectors.vector_size

    def vectorize(sentence):
        """Return the mean vector of the in-vocabulary words of `sentence`."""
        token_vecs = [word_vectors[token] for token in sentence.split() if token in word_vectors]
        if not token_vecs:
            return np.zeros(dim)
        return np.array(token_vecs).mean(axis=0)

    X_clean_w2v = np.array([vectorize(sentence) for sentence in tqdm(_X)])
    # the second matrix must be built from `_X_test`, not from `_X` again
    X_clean_test_w2v = np.array([vectorize(sentence) for sentence in tqdm(_X_test)])

    return X_clean_w2v, X_clean_test_w2v

# Embed the cleaned training and validation text with a model trained on
# the cleaned training text, then display the training matrix.
X_clean_w2v, X_clean_test_w2v = w2v_map(X_clean, X_test_clean)
X_clean_w2v

Then we also train it on the uncleaned data.

In [None]:
# Same embedding procedure on the raw (uncleaned) text, with a separate
# model trained on the raw training text.
X_unprocessed_w2v, X_test_unprocessed_w2v = w2v_map(X_unprocessed, X_test_unprocessed)
X_unprocessed_w2v

## combine w2v and VADER

In [None]:
# Weight of the VADER compound score relative to the embedding values.
aggression_factor = 100

# Shift every embedding dimension of a message by its scaled compound score.
# Broadcasting over a column vector replaces the per-row Python loop; the
# result keeps the dtype of the embedding matrix, as the in-place row
# assignment into a copy did.
X_w2v_vader = (
    X_clean_w2v + aggression_factor * vader_X_compound[:, np.newaxis]
).astype(X_clean_w2v.dtype)
X_w2v_vader

## Mapping Visualisation
The 100-dimensional Word2Vec mapping of our features is reduced to a 2-dimensional space. The colouring is according to the label.

In [None]:
import umap
from sklearn.preprocessing import StandardScaler

reducer = umap.UMAP()

# standardisation statistics are taken from the unprocessed embeddings and
# reused for all three feature sets
scaled_we = StandardScaler().fit(X_unprocessed_w2v)

# only the first N rows of each feature set are scaled and visualised
tf_X_w2v = scaled_we.transform(X_clean_w2v[:N])
tf_X_w2v_vader = scaled_we.transform(X_w2v_vader[:N])
tf_X_unprocessed_w2v = scaled_we.transform(X_unprocessed_w2v[:N])

# the 2-D projection is learned on the cleaned embeddings only
reducer.fit(tf_X_w2v)

In [None]:
# project all three feature sets with the reducer fitted on the cleaned
# embeddings: 1 = clean, 2 = clean + VADER, 3 = unprocessed
_embedding1, _embedding2, _embedding3 = (
    reducer.transform(_features)
    for _features in (tf_X_w2v, tf_X_w2v_vader, tf_X_unprocessed_w2v)
)

### unprocessed w2v

In [None]:
import matplotlib.pyplot as plt
# projection of the unprocessed embeddings: label 0 in red, any other label in blue
_point_colors = ['b' if _y != 0 else 'r' for _y in y[:N]]
plt.scatter(_embedding3[:, 0], _embedding3[:, 1], color=_point_colors, alpha=0.2)
plt.savefig('unprocessed_w2v.png')

### clean w2v

In [None]:
import matplotlib.pyplot as plt
# projection of the cleaned embeddings: label 0 in red, any other label in blue
_point_colors = ['b' if _y != 0 else 'r' for _y in y[:N]]
plt.scatter(_embedding1[:, 0], _embedding1[:, 1], color=_point_colors, alpha=0.2)
# NOTE(review): these limits are hard-coded; they presumably match one
# particular UMAP run — confirm they still frame the points after a re-fit.
plt.ylim(40, 50)
plt.xlim(-24, -13)
plt.savefig('clean_w2v.png')

### w2v with VADER

In [None]:
# NOTE(review): hard-coded limits, presumably tuned to one particular UMAP
# run — confirm they still frame the points after a re-fit.
plt.xlim(-23, -14)
plt.ylim(39, 50)
# projection of the w2v + VADER features: label 0 in green, any other label in blue
# NOTE(review): the two sibling plots draw label 0 in red — confirm the
# different colour here is intentional.
_point_colors = ['b' if _y != 0 else 'g' for _y in y[:N]]
plt.scatter(_embedding2[:, 0], _embedding2[:, 1], color=_point_colors, alpha=0.2)
plt.savefig('w2v_x_vader.png')

# Classifiers
This compares different classifiers. 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier


# Feature sets to compare; uncomment an entry to include it in the run.
embeddings = [
    {"name": "w2v clean", "data": X_clean_w2v},
    # {"name": "w2v unprocessed", "data": X_unprocessed_w2v},
    # {"name": "vader", "data": vader_X},
    # {"name": "vader x w2v", "data": X_w2v_vader},
    # {"name": "vader poly", "data": vader_X_poly}
]
# Classifiers to compare; uncomment an entry to include it in the run.
classifiers = [
    {"name": "LogReg", "cf": LogisticRegression(max_iter=10000)},
    # {"name": "Ridge", "cf": RidgeClassifier(max_iter=10000)},
    # {"name": "Gauss", "cf": GaussianNB()},
    # {"name": "DecisionTree depth=10", "cf": DecisionTreeClassifier(max_depth=10)}, # train=0.71, test=0.68 on w2v clean
    {"name": "RandomForest estimators=10", "cf": RandomForestClassifier(n_estimators=10, n_jobs=-1)},
    # {"name": "RandomForest estimators=100", "cf": RandomForestClassifier(n_estimators=100, n_jobs=-1)},
    # {"name": "RandomForest estimators=500", "cf": RandomForestClassifier(n_estimators=500, n_jobs=-1)},
    {"name": "Bagging max_samples=0.5, estimators=10", "cf": BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5, n_jobs=-1)},
    # {"name": "AdaBoostClassifier(n_estimators=10)", "cf": AdaBoostClassifier(n_estimators=10)},
    # {"name": "AdaBoostClassifier(n_estimators=50)", "cf": AdaBoostClassifier(n_estimators=50)},
    # {"name": "GradientBoosting estimators=100", "cf": GradientBoostingClassifier(n_estimators=100)},
    # {"name": "GradientBoosting estimators=10", "cf": GradientBoostingClassifier(n_estimators=10)},
    # {"name": "ExtraTrees estimators=10", "cf": ExtraTreesClassifier(n_estimators=10, n_jobs=-1)},
    {"name": "ExtraTrees estimators=500", "cf": ExtraTreesClassifier(n_estimators=500, n_jobs=-1)},
    # {"name": "ExtraTrees estimators=1000", "cf": ExtraTreesClassifier(n_estimators=1000, n_jobs=-1)},
]

# Parallel result columns: one entry per (embedding, classifier) pair.
train_acc = []
test_acc = []
e_o = []
c_o = []

for e in tqdm(embeddings, desc="embedding", position=0, leave=False):
    # one split per embedding, shared by all classifiers evaluated on it
    _X_train, _X_test, _y_train, _y_test = train_test_split(e["data"], y, test_size=0.33)

    for c in tqdm(classifiers, desc="classifier", position=1, leave=False):
        c["cf"].fit(_X_train, _y_train)
        _y_hat_train = c["cf"].predict(_X_train)
        _y_hat_test = c["cf"].predict(_X_test)

        e_o.append(e["name"])
        c_o.append(c["name"])
        # accuracy_score expects (y_true, y_pred); the value is symmetric,
        # but the conventional order avoids surprises if the metric changes
        train_acc.append(accuracy_score(_y_train, _y_hat_train))
        test_acc.append(accuracy_score(_y_test, _y_hat_test))

pd.DataFrame({'embedding': e_o, 'classifier': c_o, 'train accuracy': train_acc, 'test accuracy': test_acc})

In [None]:
# Re-display the results collected by the comparison loop in the previous cell.
pd.DataFrame({'embedding': e_o, 'classifier': c_o, 'train accuracy': train_acc, 'test accuracy': test_acc})

Save prediction with Logistic Regression to file.

In [None]:
def save_pred_to_file(cf, _X_train, _y_train, _X_validation):
    """Fit `cf`, report training accuracy and persist its predictions.

    Side effects: prints the training accuracy, writes the misclassified
    training rows of `df_clean` to "mismatched.csv" and saves the
    predictions for `_X_validation` to "y_pred.npy".

    Args:
        cf: estimator with `fit` and `predict`.
        _X_train: training feature matrix.
        _y_train: training labels.
        _X_validation: feature matrix to predict and save.
    """
    cf.fit(_X_train, _y_train)
    y_pred = cf.predict(_X_train)
    # score against the labels that were passed in, not the global `y`
    print(accuracy_score(_y_train, y_pred))

    # Positions of the misclassified training rows. They are positional, so
    # they have to be looked up with `.iloc`; `filter(items=...)` would treat
    # them as index labels, which no longer match positions after dropna().
    # Assumes the rows of `_X_train` are in the same order as `df_clean`.
    mismatched_positions = np.flatnonzero(y_pred != np.asarray(_y_train))
    df_clean.iloc[mismatched_positions].to_csv("mismatched.csv")

    y_pred_test = cf.predict(_X_validation)
    np.save('y_pred.npy', y_pred_test)

# Fit a default LogisticRegression on the cleaned w2v training features and
# save its predictions for the validation features.
cf = LogisticRegression()
save_pred_to_file(cf, X_clean_w2v, y, X_clean_test_w2v)

In [None]:
# Read the saved predictions back to check what was written.
np.load("y_pred.npy")

## Transformer
Using a pre-trained transformer for sentiment analysis from Huggingface.
Score on the validation set: 0.71 on _unprocessed_ text and 0.66 on cleaned text.

This method is therefore worse on unprocessed data than Logistic Regression with w2v.