# AGDML-Lab Final
Task: Sentiment Analysis of Twitter messages

## Preprocessing
For preprocessing I used the NLTK library together with a few regular expressions. I made all words lowercase, removed URLs and mentions of other users, replaced the HTML entity `&amp;` with the word "and", removed stopwords and lemmatized each word to its lemma. (A step that strips non-alphabetic characters is present in the code but currently commented out.) This should make the data more consistent and easier to work with. My assumption is that most spelling mistakes and special characters are unnecessary for sentiment analysis.

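The URL regex is deliberately loose: it will most likely also match text that we do not want removed (for example, two words joined by a period, together with the words that follow them), but that is a tradeoff we accept.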

In [137]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Register tqdm's `progress_apply` on pandas objects; it is used below to show a
# progress bar while the text-cleaning function runs over every message.
tqdm.pandas()

In [138]:
# read training data
# NOTE(review): the frame preview further down lists an "Unnamed: 0" column next to
# `target` and `text`, so the CSV presumably carries a saved index column — confirm.
df = pd.read_csv('data.csv')

# read validation data
df_test = pd.read_csv('data_valid.csv')

# preprocessing text messages
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download the stopword list and the WordNet data used by the lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# create a single WordNetLemmatizer instance; `clean_text` reuses it for every message
wordnet_lemmatizer = WordNetLemmatizer()

# function to clean text data
from functools import lru_cache

# Patterns are raw strings (no invalid string escapes) and compiled once.
# Both only contain lowercase classes because `clean_text` lowercases first.
#
# Deliberately loose URL matcher: the scheme is optional, so any "name.tld"-like
# token matches, and the trailing group (which allows spaces) also consumes the
# word characters that follow it.
_URL_RE = re.compile(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)')
# "@handle" of 3-24 characters that does not directly follow a word character
# (so the "@" inside something like an e-mail address is left alone).
_MENTION_RE = re.compile(r'\B@[._a-z0-9]{3,24}')


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text: str) -> str:
    """Normalise one message for the downstream embedding step.

    Steps, in order: lowercase, remove URL-like spans, remove @mentions,
    replace the HTML entity ``&amp;`` with the word ``and``, split on
    whitespace, drop English stopwords and lemmatize the remaining words.

    Args:
        text: Raw message text.

    Returns:
        The cleaned words joined by single spaces; an empty string if no
        word survives.
    """
    # make words lowercase, because Go and go would be considered two words
    text = text.lower()
    # remove URLs from text (replaced by a space so neighbours stay separated)
    text = _URL_RE.sub(' ', text)
    # remove mentions of other users
    text = _MENTION_RE.sub('', text)
    # replace the HTML-escaped ampersand with the word "and"
    text = text.replace('&amp;', 'and')
    # Stripping of non-alphabetic characters is currently disabled:
    # text = re.sub(r'[^A-Za-z!?\.\-]', ' ', text)

    # Fetch the stopword set once per call instead of rebuilding it for every
    # single word, which is what made the original comprehension so slow.
    stop_words = _english_stopwords()
    # drop stopwords like to, and, or etc., then lemmatize what is left
    words = [
        wordnet_lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    # join words to make sentence
    return ' '.join(words)

# drop rows with missing values (in any column); presumably so that `clean_text`
# is never handed a NaN instead of a string
df = df.dropna()
df_test = df_test.dropna()

# clean the training messages (with a progress bar) and write the frame to disk
df['text'] = df['text'].progress_apply(clean_text)
df.to_csv('data_cleaned.csv', index=False)

# same treatment for the validation messages
df_test['text'] = df_test['text'].progress_apply(clean_text)
df_test.to_csv('data_valid_cleaned.csv', index=False)

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [139]:
# Sanity check: list the rows whose `text` value is not a `str`.
df[df['text'].apply(lambda x: not isinstance(x, str))]

Unnamed: 0,target,text


In [140]:
# Training inputs (cleaned message text) and their labels.
X = df['text']
y = df['target']

# Cleaned validation messages to predict on.
X_test = df_test['text']

# Disabled fallback that would coerce every entry to `str`:
# # assure that it's all text
# X = [str(x) for x in X]
# X_test = [str(x) for x in X_test]

In [141]:
from gensim.models import Word2Vec
# Split each cleaned training message on whitespace into a list of tokens.
sentences = [sentence.split() for sentence in X]
# Train word embeddings on the training messages. `hs=1, negative=0` selects
# hierarchical softmax and switches negative sampling off; `min_count=5` ignores
# words that occur fewer than 5 times. The embedding size is not passed, so
# gensim's default applies.
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4, hs=1 , negative=0)

def vectorize(sentence):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Whitespace-separated text (already cleaned).

    Returns:
        A 1-D NumPy array with one entry per embedding dimension. If none of
        the words is in the Word2Vec vocabulary (or the sentence is empty),
        an all-zero vector of the same length is returned.
    """
    keyed_vectors = w2v_model.wv
    word_vecs = [
        keyed_vectors[word]
        for word in sentence.split()
        if word in keyed_vectors
    ]
    if not word_vecs:
        # Take the fallback length from the model instead of hard-coding 100,
        # so the zero vector still matches if the embedding size is changed.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(word_vecs).mean(axis=0)

# Embed every training and validation message (with a progress bar); stacking the
# per-sentence vectors gives one row per message.
X_w2v = np.array([vectorize(sentence) for sentence in tqdm(X)])
X_test_w2v = np.array([vectorize(sentence) for sentence in tqdm(X_test)])

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [150]:
# NOTE(review): this binds the `datetime.time` class, not the `time` module —
# confirm which one was intended.
from datetime import time
# NOTE(review): this binds the name `plt` to the `plot` function, not to the
# `matplotlib.pyplot` module that the `plt` alias conventionally refers to
# (`import matplotlib.pyplot as plt`) — confirm which one was intended.
from matplotlib.pyplot import plot as plt
from matplotlib.lines import Line2D
import seaborn as sns

from gensim.models import LdaModel
from gensim import corpora
from sklearn.manifold import TSNE

# presumably an RGB colour triple for a light grey (all three channels 0.9)
light_grey_tup=(0.9, 0.9, 0.9)



UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe0 in position 3: invalid continuation byte

In [144]:
# use sklearn to learn a logistic regression classifier
from sklearn.linear_model import LogisticRegression
# NOTE(review): GaussianNB is imported but not used in the cells shown here.
from sklearn.naive_bayes import GaussianNB

# `C` is the inverse regularisation strength, so C=1e5 means very weak
# regularisation; `max_iter=1000` caps the number of solver iterations.
classifier = LogisticRegression(max_iter=1000, C=1e5)
classifier.fit(X_w2v, y)

# predict labels for the validation embeddings
y_pred = classifier.predict(X_test_w2v)

# attach the predictions to the validation frame and save them as a .npy file
df_test['target'] = y_pred
np.save('y_pred.npy', y_pred)

In [145]:
# make k-fold cross validation: 10 folds over the training embeddings and labels;
# the mean and standard deviation of the per-fold scores are printed
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_w2v, y=y, cv=10)
print('mean accuracy: ', accuracies.mean())
print('standard deviation: ', accuracies.std())

mean accuracy:  0.726026
standard deviation:  0.002057300172556244


## Transformer
Using a pre-trained sentiment-analysis transformer from Hugging Face.
Score on the validation set: 0.71

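This is slightly below the ≈0.726 mean cross-validation accuracy of the Word2Vec + logistic regression approach above, so this method appears to be worse than what I used before (note that the two scores were measured on different data: the validation set here, cross-validation folds of the training set there).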