# AGDML-Lab Final
Task: Sentiment Analysis of Twitter messages

## Preprocessing
For preprocessing I used NLTK library. I removed non-alphabetic characters, made words lowercase, removed mentions of other users, removed stopwords and lemmatized each word to its lemma. This should make the data more consistent and easier to work with. My assumption is: most of the spelling mistakes and special characters are unnecessary for sentiment analysis.

The regex will most likely match stuff that we do not want removed, but that is a tradeoff we accept.

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch

tqdm.pandas()
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1060 6GB'

In [21]:
# read training data
df = pd.read_csv('data.csv')

# read validation data
df_test = pd.read_csv('data_valid.csv')

# preprocessing text messages
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download stopwords and wordnet
nltk.download('stopwords')
nltk.download('wordnet')

# create object of WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# this allows very
stopwords = set(stopwords.words('english'))
stopwords.remove('very')
print(stopwords)

# function to clean text data
def clean_text(text):
    # remove mentions of other users
    text = re.sub('\B@[._a-zA-Z0-9]{3,24}', '', text)
    # rewrite words in all caps to "very" followed by word
    text = re.sub('([A-Z]+)', lambda x: 'very ' + x.group(0).lower(), text)
    # make words lowercase, because Go and go will be considered as two words
    text = text.lower()
    # remove URLs from text
    text = re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', ' ', text)
    # replace ampersand html tag with &
    text = re.sub('\&amp;', 'and', text)
    # replace text surrounded by ** with very
    text = re.sub('\*([a-z])\*', r'very \1', text)
    # remove everything but letters
    text = re.sub('[^a-z]', ' ', text)    
    # split the sentences into words
    words = text.split() 
    for i in range(len(words)):
        # remove words with length 1
        if len(words[i]) == 1:
            words[i] = ''
        # remove repetition of letters
        _tmp = re.subn(r'([a-z])\1{3,}', r'\1', words[i])
        if _tmp[1] > 0:
            words[i] = "very " + _tmp[0]
    # remove stopwords like to, and, or etc.
    words = [word for word in words if word not in stopwords]
    # lemmatize each word
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    # join words to make sentence
    text = ' '.join(words)
    # remove multiple spaces
    text = re.sub('\s+', ' ', text)
    
    return text

# drop rows with missing values
df = df.dropna()
df_test = df_test.dropna()

# clean text data
df['text'] = df['text'].progress_apply(clean_text)
df.dropna()
df.to_csv('data_cleaned.csv', index=False)    

df_test['text'] = df_test['text'].progress_apply(clean_text)
df_test.dropna()
df_test.to_csv('data_valid_cleaned.csv', index=False)

{'wouldn', 'any', 'off', "isn't", "mightn't", 's', 'until', "it's", 'only', 'yourselves', 'ourselves', "shouldn't", 'why', 'between', 'before', 'not', 'while', 'at', 'below', 'isn', 'yourself', "needn't", 'most', "weren't", 'some', 'were', 'too', 'other', 'their', 'theirs', 'during', 'had', 'hers', 'here', 'shouldn', "wouldn't", 'my', 'once', 'against', 'needn', 'mustn', "aren't", "should've", 'then', 'own', 'she', 'they', "hadn't", 'further', 'did', "that'll", "you're", 'what', 'about', 'weren', 'than', 'by', 'because', 'out', 'will', 'a', 'if', 'can', 't', 'in', 'as', 'just', 'doing', 'same', "you'll", 'hasn', 'no', "couldn't", 'few', 'll', 'but', 'should', "haven't", 'have', 'there', 'shan', 'having', 'above', 'and', "won't", 'ain', 'into', 'you', 'under', "she's", 'be', 'won', 'from', 'these', 'wasn', 're', 'o', 'd', 'couldn', 'herself', "you've", 'for', 'your', 'doesn', 'mightn', 'after', 'whom', 'myself', 'itself', 'aren', 'ours', 'so', 'over', 'was', 'its', 'been', 'has', 'the',

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\frand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\frand\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [5]:
# test cleaning
assert clean_text("LOOOOOOOOOOVE") == "very very love"
assert clean_text("@herrmotz") == ""
assert clean_text("https://youtube.com/test") == ""

clean_text("can't find my phone charger.. So I'm switching my service over to my old phone")

' find phone charger very very switching service old phone'

In [22]:
# read the csv file
df = pd.read_csv('data_cleaned.csv')
df_test = pd.read_csv('data_valid_cleaned.csv')

df

Unnamed: 0,target,text
0,0,last tweet day goodnight twitter world hope be...
1,1,ahaha okay yeah fer suree vote right thee dire...
2,0,very gonna feel like shit uni today very still...
3,0,find phone charger very very switching servic...
4,0,very fade black
...,...,...
499995,1,very watching twilight very text
499996,1,usual day fighting little inner outer daemon
499997,0,very question very yesterday emotional day tod...
499998,0,okay made sad


In [55]:
# def drop_not_string(_df, column):
#     return _df.drop(_df[_df[column].apply(lambda x: isinstance(x, str)) == False].index)
# 
# drop_not_string(df, 'text')
# drop_not_string(df_test, 'text')

X = df['text']
y = df['target']

X_test = df_test['text']

# make them all strings
X = X.astype(str)
X_test = X_test.astype(str)

In [56]:
from gensim.models import Word2Vec
sentences = [sentence.split() for sentence in X]
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4, hs=1 , negative=0)

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_w2v = np.array([vectorize(sentence) for sentence in tqdm(X)])
X_test_w2v = np.array([vectorize(sentence) for sentence in tqdm(X_test)])

X_w2v.shape

  0%|          | 0/99988 [00:00<?, ?it/s]

(491883, 100)

In [59]:
# use sklearn to learn a logistic regression classifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_w2v, y)

# predict the test set results
y_pred = classifier.predict(X_test_w2v)

# save the results to npy
df_test['target'] = y_pred
np.save('y_pred.npy', y_pred)

In [60]:
# make k-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_w2v, y=y, cv=10)
print('mean accuracy: ', accuracies.mean())
print('standard deviation: ', accuracies.std())

mean accuracy:  0.7310803417328062
standard deviation:  0.0031705444646507553


## Transformer
Using a pre-trained transformer for sentiment analysis from Huggingface.
Score on the validation set: 0.71

This method is therefore worse than what I used before...