# AGDML-Lab Final
Task: Sentiment Analysis of Twitter messages

## Preprocessing
For preprocessing I used NLTK library. I removed non-alphabetic characters, made words lowercase, removed mentions of other users, removed stopwords and lemmatized each word to its lemma. This should make the data more consistent and easier to work with. My assumption is: most of the spelling mistakes and special characters are unnecessary for sentiment analysis.

The regex will most likely match stuff that we do not want removed, but that is a tradeoff we accept.

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

tqdm.pandas()

In [87]:
# read training data
df = pd.read_csv('data.csv')

# read validation data
df_test = pd.read_csv('data_valid.csv')

# preprocessing text messages
import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# 
# # download stopwords and wordnet
# nltk.download('stopwords')
# nltk.download('wordnet')
# 
# # create object of WordNetLemmatizer
# wordnet_lemmatizer = WordNetLemmatizer()

# function to clean text data
def clean_text(text):
    # make words lowercase, because Go and go will be considered as two words
    text = text.lower()
    # remove URLs from text
    text = re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', ' ', text)
    # remove mentions of other users
    text = re.sub('\B@[._a-z0-9]{3,24}', '', text)
    # replace ampersand html tag with &
    text = re.sub('\&amp;', 'and', text)
    # remove very special characters
    text = re.sub('[^A-Za-z!?\.\-]', ' ', text)
    # remove double white spaces
    text = re.sub('\s+', ' ', text)
    return text

# drop rows with missing values
df = df.dropna()
df_test = df_test.dropna()

# clean text data
df['text'] = df['text'].progress_apply(clean_text)
df.to_csv('data_cleaned.csv', index=False)

df_test['text'] = df_test['text'].progress_apply(clean_text)
df_test.to_csv('data_valid_cleaned.csv', index=False)

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [86]:
# read cleaned data
df = pd.read_csv('data_cleaned.csv')
df_test = pd.read_csv('data_valid_cleaned.csv')
df

Unnamed: 0,target,text
0,0,this is my last tweet of the day so goodnight...
1,1,ahaha okay yeah fer suree! i ll vote for her...
2,0,i m gonna feel like shit at uni today i m sti...
3,0,can t find my phone charger.. so i m switching...
4,0,it was just fade-to-black
...,...,...
499995,1,watching twilight teeeext me
499996,1,for me it is a usual day for me fighting with...
499997,0,question for ? yesterday was an emotional da...
499998,0,okay then u made me sad


In [None]:
# read raw data
df = pd.read_csv('data.csv')
df_test = pd.read_csv('data_valid.csv')
df

In [41]:
X = df['text']
y = df['target']

X_test = df_test['text']

# assure that it's all text
X = [str(x) for x in X]
X_test = [str(x) for x in X_test]

In [42]:
from gensim.models import Word2Vec
sentences = [sentence.split() for sentence in X]
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4, hs=1 , negative=0)

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_w2v = np.array([vectorize(sentence) for sentence in X])
X_test_w2v = np.array([vectorize(sentence) for sentence in X_test])

In [43]:
X_w2v

array([[-0.12257263,  0.28916484, -0.1190823 , ...,  0.24229461,
         0.3195329 , -0.13502343],
       [ 0.04496434, -0.21986333,  0.13902381, ..., -0.01016132,
         0.02926087, -0.14712319],
       [ 0.10466962,  0.20594099, -0.21641675, ..., -0.10679854,
        -0.06894603, -0.31013888],
       ...,
       [ 0.05546274,  0.09733719,  0.15488492, ...,  0.3099077 ,
        -0.08956569,  0.2731683 ],
       [-0.33402756, -0.26418677, -0.20447092, ..., -0.45653096,
        -0.03356897, -0.30286217],
       [ 0.32251152, -0.34498209,  0.21685825, ..., -0.34435648,
        -0.35758331, -0.15616232]])

In [44]:
# use sklearn to learn a logistic regression classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_w2v, y)

# predict the test set results
y_pred = classifier.predict(X_test_w2v)

# save the results to npy
df_test['target'] = y_pred
np.save('y_pred.npy', y_pred)

In [45]:
# make k-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_w2v, y=y, cv=10)
print('mean accuracy: ', accuracies.mean())
print('standard deviation: ', accuracies.std())

mean accuracy:  0.736964
standard deviation:  0.00252191673137714


In [53]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
y_pred_transformer = sentiment_pipeline(df_test['text'].tolist())

y_pred_transformer

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,


In [77]:
# np.array([1 if x['label'] == 'POSITIVE' else 0 for x in y_pred_transformer]).tofile('y_pred_transformer.npy')
y_pred_transformer_array = np.array(y_pred_transformer)
np.save('y_pred_transformer.npy', y_pred_transformer_array)
np.array(y_pred_transformer).shape[0] == len(df_test)

True

In [81]:
# type(df['text'].tolist())
_x = np.load("y_pred_transformer.npy", allow_pickle=True)
np.all(_x == y_pred_transformer_array)

True