In [3]:

"""
This script preprocesses the tweet data similarily to preprocess_tf_binary.py
It uses the same script for processing hashtags, emojis and usernames
(found at https://gist.github.com/tokestermw/cb87a97113da12acb388)
It then uses sklearn libraries to perform multilabel classification on
the Bag of Words Model algorithms contained below
"""

import pandas as pd
import re
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, learning_curve, ShuffleSplit
from sklearn.metrics import accuracy_score, jaccard_similarity_score, \
    classification_report, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np


In [4]:

FLAGS = re.MULTILINE | re.DOTALL


def fix_split(pattern, string):
    splits = list((m.start(), m.end()) for m in re.finditer(pattern, string))
    starts = [0] + [i[1] for i in splits]
    ends = [i[0] for i in splits] + [len(string)]
    return [string[start:end] for start, end in zip(starts, ends)]


def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + fix_split(r"(?=[A-Z])", hashtag_body))  # , flags=FLAGS))
    return result


def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/", " / ")
    text = re_sub(r"<3", "<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()



In [5]:
labeling_data = pd.read_csv("C:/Users/Muhammad/Deep Learning/Final Project/To Label Data/Pre-processed_English_Tweets.csv")
labeling_data['text'] = labeling_data['text'].fillna("")

In [6]:

print ('Data loading')
train_df = pd.read_csv('C:/Users/Muhammad/Deep Learning/Final Project/SemEval2018-Task1-all-data/English/E-c/train.txt', sep = '\t')
test_df = pd.read_csv('C:/Users/Muhammad/Deep Learning/Final Project/SemEval2018-Task1-all-data/English/E-c/test.txt', sep = '\t')
val_df = pd.read_csv('C:/Users/Muhammad/Deep Learning/Final Project/SemEval2018-Task1-all-data/English/E-c/dev.txt', sep = '\t')



print ('Data loaded...')

df = train_df.append(val_df, ignore_index=True)
df = df.append(test_df, ignore_index = True)

# Clean up training data
# df.dropna(axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
df['text'] = df['text'].apply(tokenize)
emotions = df.columns[2:]


Data loading
Data loaded...


In [7]:
tweets = df['text']
# tweets = tweets.append(labeling_data['text'])

In [8]:
# len(tweets), df['text'].shape, labeling_data['text'].shape

In [9]:
tweets= tweets.values
cv = CountVectorizer()
tweets_transformed = cv.fit_transform(tweets)


In [10]:
# to_predict_tokens = tweets_transformed[:232070]
# tweets = tweets_transformed[232070:]
tweets = tweets_transformed

In [11]:
tweets.shape, to_predict_tokens.shape

NameError: name 'to_predict_tokens' is not defined

In [12]:

# separate into data and labels
# tweets = df['text'].values
labels = df[emotions].values

# map tweets to vector representation of the unique words it contains
# cv = CountVectorizer()
# x_tokens = cv.fit_transform(tweets)

x_train, x_val, y_train, y_val = train_test_split(tweets, labels,
                                                  test_size=0.4)
print ('Data Splitted...')

# suitable classifier models
clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200,)
# clf = KNeighborsClassifier()
# clf = DecisionTreeClassifier()
# clf = RandomForestClassifier()

print ('Model fitting...')
# fit on the training data
clf.fit(x_train, y_train)

print ("predicting...")
# make predictions on testing data
predicted = clf.predict(x_val)
print("calculating matrices...")
# calculate a variety of metrics for comparison
jaccard_sim = jaccard_similarity_score(y_val, predicted)
prec_score_micro = precision_score(y_val, predicted, average='micro')
prec_score_macro = precision_score(y_val, predicted, average='macro')
rec_score_micro = recall_score(y_val, predicted, average='micro')
rec_score_macro = recall_score(y_val, predicted, average='macro')
f1_micro = f1_score(y_val, predicted, average='micro')
f1_macro = f1_score(y_val, predicted, average='macro')
class_report = classification_report(y_val, predicted, target_names=emotions)

# print metrics to terminal
print(f"Jaccard Similarity (accuracy): {jaccard_sim}")
print(f"Classification Report: \n{class_report}")
print(f"Precision Score (micro): {prec_score_micro}")
print(f"Precision Score (macro): {prec_score_macro}")
print(f"Recall Score (micro): {rec_score_micro}")
print(f"Recall Score (macro): {rec_score_macro}")
print(f"f1 Score (micro): {f1_micro}")
print(f"f1 Score (macro): {f1_macro}")


Data Splitted...
Model fitting...
predicting...
calculating matrices...
Jaccard Similarity (accuracy): 0.430938292476754
Classification Report: 
              precision    recall  f1-score   support

       anger       0.69      0.66      0.68      1604
anticipation       0.28      0.21      0.24       622
     disgust       0.62      0.57      0.59      1628
        fear       0.68      0.63      0.65       749
         joy       0.74      0.70      0.72      1703
        love       0.45      0.40      0.42       500
    optimism       0.57      0.55      0.56      1338
   pessimism       0.25      0.21      0.23       508
     sadness       0.52      0.54      0.53      1300
    surprise       0.37      0.19      0.26       222
       trust       0.11      0.07      0.09       190

   micro avg       0.58      0.54      0.56     10364
   macro avg       0.48      0.43      0.45     10364
weighted avg       0.57      0.54      0.56     10364
 samples avg       0.58      0.55      0.53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
print ("Prediction Accuracy:", accuracy_score(y_val, predicted))

Prediction Accuracy: 0.1406463359126081


In [None]:
label_list = ['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']

In [None]:

labels = clf.predict(to_predict_tokens)

In [None]:
results = pd.merge(pd.DataFrame(labeling_data), pd.DataFrame(labels, columns=label_list), left_index=True, right_index=True)

In [None]:
results.keys()

In [None]:
results.to_csv("C:/Users/Muhammad/Deep Learning/Final Project/To Label Data/BOG_labeled.csv", index =False)

In [21]:
results

Unnamed: 0,status_id,user_id,date,time,text,is_quote,display_text_width,favorite_count,retweet_count,hashtags,...,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,1240849521011113984,840438614,2020-03-20,03:55:54,corona time,False,43,0,0,,...,0,0,0,0,0,0,0,0,0,0
1,1240849520381976576,1143505456121307136,2020-03-20,03:55:54,flexing king u<number>f<number>,True,16,0,0,,...,0,0,0,0,0,0,0,0,0,0
2,1240849519727857664,3449353153,2020-03-20,03:55:54,everyone looks sick,False,67,0,51508,,...,0,0,1,0,0,1,0,0,0,0
3,1240849521866952704,1511537016,2020-03-20,03:55:54,corona day <number> feels like sunday,False,73,0,89818,,...,0,0,0,0,0,0,0,0,0,0
4,1240849528414244864,334488003,2020-03-20,03:55:56,everyone stay home please corona fucks quickly...,False,140,0,25397,,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232065,1245982290003591168,787704370527866880,2020-04-03,07:51:41,please please please people stay fuck home got...,False,259,10,0,,...,1,0,0,0,0,0,0,0,0,0
232066,1245982289902940160,106725571,2020-04-03,07:51:41,within means please help fund wildlifeorphan<n...,False,140,0,11,,...,0,0,0,0,0,0,0,0,0,0
232067,1245982281132584960,862838468136779776,2020-04-03,07:51:39,migrant amp refugee camps potential catalysts ...,False,271,0,3,"c(""migrants"", ""MigrantsOnTheRoad"", ""Refugees"",...",...,0,0,0,0,0,0,1,0,0,1
232068,1245982253731233792,4898066856,2020-04-03,07:51:33,think good thing pesach tell hashem corona cha...,False,138,2,0,Covid_19,...,1,0,0,0,0,0,0,0,0,0
