# Setup

In [1]:
%%capture
!pip install nltk

In [2]:
%%capture
import numpy as np
import sklearn
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [3]:
# Mount Google Drive under /content/drive; DATA_PATH (defined in the constants cell) points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# constants and global variables
# Path template for dataset files; the file name is filled in with DATA_PATH.format(<file name>)
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/CIL/Dataset/{}'

# If set to True, the preprocessing for the BERT model is performed; otherwise
# the preprocessing for w2v and Tf-idf is performed
is_bert_preprocessing_enabled = True
# If set to True, the "Read Data" cell loads additional_dataset.csv before the training files
use_additional_dataset = False

# Read Data

In [5]:
# Accumulators for the raw training tweets and their labels. They must stay plain
# Python lists here because load_tweets() appends to them.
tweets = []
labels = []

if use_additional_dataset:
  import re

  # Raw-string patterns: the original non-raw literals relied on invalid escape
  # sequences ("\:", "\/", "\w"), which emit warnings on recent Python versions.
  # The compiled expressions are character-for-character the same regexes.
  url_pattern = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*")
  user_pattern = re.compile(r"@\w+")

  # read the additional dataset (headerless CSV, so columns are addressed by position)
  df = pd.read_csv(DATA_PATH.format('additional_dataset.csv'), delimiter=',', encoding = 'latin',header=None)
  # df = df.drop(df.columns[[1, 2, 3, 4]], axis=1)
  nRow, nCol = df.shape
  print(f'There are {nRow} rows and {nCol} columns')

  # the additional tweets: column 5 holds the text.
  # Links are replaced with <url> first, then usernames with <user>; the order is
  # kept from the original because the URL character class also contains '@'.
  tweets = [
      user_pattern.sub("<user>", url_pattern.sub("<url>", tweet))
      for tweet in df[5].tolist()
  ]

  # the additional labels: column 0 holds the label; 4 is remapped to 1 so it
  # matches the label passed to load_tweets() for the positive training file
  labels = [1 if label == 4 else label for label in df[0].tolist()]

  # Preview up to 30 (tweet, label) pairs; slicing avoids an IndexError on short files
  for tweet, label in zip(tweets[:30], labels[:30]):
    print(tweet)
    print(label)
  # A bare `df.head()` inside an `if` body is not rendered by the notebook, so print it
  print(df.head())

In [6]:
def load_tweets(filename, label):
    """Read `filename` (UTF-8) and add its lines to the notebook-level accumulators.

    Each line is stripped of trailing whitespace and appended to the global
    `tweets` list; `label` is appended to the global `labels` list once per line.
    Nothing is returned.
    """
    with open(filename, 'r', encoding='utf-8') as tweet_file:
        stripped_lines = [raw_line.rstrip() for raw_line in tweet_file]
    tweets.extend(stripped_lines)
    labels.extend([label] * len(stripped_lines))

# load training tweets: the negative file is labelled 0, the positive file 1
load_tweets(DATA_PATH.format('train_neg_full.txt'), 0)
load_tweets(DATA_PATH.format('train_pos_full.txt'), 1)
print(f'{len(tweets)} training/dev tweets loaded')
# Convert to NumPy array to facilitate indexing
tweets = np.array(tweets)
labels = np.array(labels)

# load the test file
# Everything up to and including the first comma of each line is discarded
# (presumably an id column -- TODO confirm); the remainder is stripped of whitespace.
# The `with` block guarantees the handle is closed (the original never closed it).
with open(DATA_PATH.format('test_data.txt'), 'r', encoding='utf-8') as f:
  X_test = np.array([line.partition(',')[2].strip() for line in f])
print(f'{len(X_test)} test tweets loaded')


# Preview the first 10 training pairs and the first 10 test tweets.
# Slices are used so that a short file does not raise an IndexError.
for tweet, label in zip(tweets[:10], labels[:10]):
    print(tweet)
    print(label)

for test_tweet in X_test[:10]:
  print(test_tweet)

print(len(tweets))

2500000 training/dev tweets loaded
10000 test tweets loaded
vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>
0
glad i dot have taks tomorrow ! ! #thankful #startho
0
1-3 vs celtics in the regular season = were fucked if we play them in the playoffs
0
<user> i could actually kill that girl i'm so sorry ! ! !
0
<user> <user> <user> i find that very hard to believe im afraid
0
wish i could be out all night tonight ! <user>
0
<user> i got kicked out the wgm
0
rt <user> <user> <user> yes she is ! u tell it ! my lips are closed okay
0
why is she so perfect <url>
0
<user> hi harry ! did u havea good time in aus ? i didnt get 2 see u maybe next year ! follow me back if u can , would bea dreamcome truex
0
sea doo pro sea scooter ( sports with the portable sea-doo seascootersave air , stay longer in the water and ... <url>
<user> shucks well i work all week so now i can't come cheer you on ! oh and put those batteries in y

# Preprocess

In [7]:
# preprocess for bert: lowercase everything, then drop duplicated (tweet, label) rows
if is_bert_preprocessing_enabled:

  # Lowercase sentence
  tweets = [tweet.lower() for tweet in tweets]
  X_test = [tweet.lower() for tweet in X_test]

  # Steps that were tried at this point and are intentionally left disabled:
  #   - removing hashtags (regex "#\w+" replaced by "")
  #   - removing duplicates on the tweet text only, ignoring the label (decreases accuracy)
  #   - splitting into words and dropping "<url>" / "<user>" (decreases accuracy)
  #   - tokenizing with nltk's RegexpTokenizer(r'\w+')
  #   - removing English stopwords (decreases accuracy)
  #   - removing tokens matched by re.match("([0-9])+", w) (decreases accuracy)

  # remove duplicates from the training data: rows are duplicates only when both
  # the text and the label match; the last occurrence is kept.
  # The two prints show the number of tweets before and after.
  print(len(tweets))
  tweets_df = pd.DataFrame({'tweets':tweets, 'labels':labels})
  tweets_df = tweets_df.drop_duplicates(subset=['tweets', 'labels'], keep='last')
  tweets = tweets_df["tweets"].to_numpy()
  labels = tweets_df["labels"].to_numpy()
  print(len(tweets))


2500000
2270482


In [8]:
# preprocess for non bert (w2v and Tf-idf)
if not is_bert_preprocessing_enabled:
  import re
  from nltk.corpus import stopwords
  from nltk.stem import WordNetLemmatizer
  from nltk.tokenize import RegexpTokenizer

  # Tokenizer keeps runs of word characters only, so punctuation, '#', '<' and '>' are dropped
  tokenizer = RegexpTokenizer(r'\w+')
  # Built as a real set: membership tests on the original list were a linear scan
  # for every token of every tweet.
  stopwords_set = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  # After tokenization "<url>" / "<user>" have become the bare words "url" / "user"
  forbidden_words = ["url", "user"]
  # Compiled once; re.match anchors at the start, so this flags tokens beginning with a digit
  number_pattern = re.compile(r"([0-9])+")

  def _clean_tweet(tweet):
    """Return `tweet` lowercased, tokenized, filtered and re-joined with single spaces.

    The steps run in the same order as the original cell: lowercase, tokenize,
    hashtag filter, stopword removal, lemmatization, number removal, removal of
    the "url"/"user" placeholders, join.
    """
    words = tokenizer.tokenize(tweet.lower())
    # remove hashtags
    # NOTE(review): the tokenizer above already strips '#', so no token can start
    # with it and this filter never removes anything; hashtag words are kept.
    # Left in place so the produced files are unchanged -- confirm the intent.
    words = [w for w in words if not w.startswith("#")]
    # Remove stopwords
    words = [w for w in words if w not in stopwords_set]
    # Lemmatize
    words = [lemmatizer.lemmatize(w) for w in words]
    # remove numbers
    words = [w for w in words if number_pattern.match(w) is None]
    # remove user, and url
    words = [w for w in words if w not in forbidden_words]
    # join back the tweet into a phrase
    return " ".join(words)

  # One pass per collection instead of seven full-list passes.
  # Duplicate removal is intentionally not applied in this branch (it was tried on
  # the joined text and left disabled), so `labels` is not modified here.
  tweets = [_clean_tweet(tweet) for tweet in tweets]
  X_test = [_clean_tweet(tweet) for tweet in X_test]


In [9]:
# Preview the first 20 processed training tweets, a blank separator, then the
# first 20 processed test tweets. Slicing replaces fixed indices 0..19 so that
# fewer than 20 rows no longer raises an IndexError.
for tweet in tweets[:20]:
  print(tweet)

print("\n\n\n")

for test_tweet in X_test[:20]:
  print(test_tweet)

vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>
glad i dot have taks tomorrow ! ! #thankful #startho
1-3 vs celtics in the regular season = were fucked if we play them in the playoffs
<user> i could actually kill that girl i'm so sorry ! ! !
<user> <user> <user> i find that very hard to believe im afraid
wish i could be out all night tonight ! <user>
<user> i got kicked out the wgm
rt <user> <user> <user> yes she is ! u tell it ! my lips are closed okay
why is she so perfect <url>
<user> hi harry ! did u havea good time in aus ? i didnt get 2 see u maybe next year ! follow me back if u can , would bea dreamcome truex
introduction to programming with c + + ( 2nd edition this solid foundation in the basics of c + + programming will ... <url>
<user> i'm white . #aw
<user> dan i love and miss you ! don't be sad #wheresthegeneral
so many wonderful building in dc but still miss you <user>
<user> it's annoying because 

In [10]:
# shuffle the training data
# shuffle the training data
# tweets and labels are passed to the same shuffle() call so each tweet stays
# paired with its label; the fixed random_state makes the order repeatable.
from sklearn.utils import shuffle

X_train, y_train = shuffle(tweets, labels, random_state=84)

In [11]:
# save X data to a file
def save_X_data(file_name, X):
  """Write every element of `X` on its own line to DATA_PATH.format(file_name).

  Args:
    file_name: name inserted into the DATA_PATH template; an existing file is overwritten.
    X: iterable of strings (token lists must be joined into strings beforehand).

  The file is written as UTF-8, matching the encoding used to read the input
  files, and the `with` block closes it even if a write fails.
  """
  with open(DATA_PATH.format(file_name), "w", encoding="utf-8") as f:
    f.writelines(element + "\n" for element in X)

# The BERT and non-BERT runs write to different files so that neither overwrites the other.
if is_bert_preprocessing_enabled:
  save_X_data("X_train_processed_bert_full.txt", X_train)
  save_X_data("X_test_processed_bert_full.txt", X_test)
else:
  save_X_data("X_train_processed.txt", X_train)
  save_X_data("X_test_processed.txt", X_test)

In [12]:
# save labels to file, one label per line, in the same (shuffled) order as X_train
# NOTE(review): unlike the X files, "y_train.txt" is shared by the BERT and
# non-BERT runs, and only the BERT run removes duplicates -- the file therefore
# matches only the X_train file written by the most recent run. Confirm downstream use.
with open(DATA_PATH.format("y_train.txt"), "w", encoding="utf-8") as f:
  # the `with` block closes the file even if a write fails
  f.writelines(str(label) + "\n" for label in y_train)

# INITIAL EXPLORATION

In [13]:
# get number of positive and negative tweets (label 0 = negative, label 1 = positive)
is_negative = labels == 0
is_positive = labels == 1
print(f"In our dataset there are {is_negative.sum()} negative tweets")
print(f"In our dataset there are {is_positive.sum()} positive tweets")

In our dataset there are 1142838 negative tweets
In our dataset there are 1127644 positive tweets


In [14]:
# get the average length of positive and negative examples
def get_average_length(target_label):
  """Return the mean of len(tweet) over the global `tweets` whose entry in `labels` equals `target_label`.

  len() is applied to each tweet as it is currently stored, so the unit follows
  the element type (characters for strings).
  """
  matching_lengths = [
      len(tweet)
      for tweet, label in zip(tweets, labels)
      if label == target_label
  ]
  return np.array(matching_lengths).mean()

# NOTE: in the recorded output, negative tweets are longer on average than positive tweets (about 83 vs 68).
print(f"The average length of tweets with a negative sentiment is: {get_average_length(0)}")
print(f"The average length of tweets with a positive sentiment is: {get_average_length(1)}")

The average length of tweets with a negative sentiment is: 83.0450676298828
The average length of tweets with a positive sentiment is: 68.3103497203018


In [15]:
# get most frequent words in positive and negative examples
def count_words(target_label):
  """Count the items of every global tweet whose label equals `target_label`.

  Each tweet in `tweets` is iterated directly, so tweets are expected to be
  sequences of words by the time this is called. Returns a plain dict mapping
  each item to its number of occurrences, in first-seen order.
  """
  frequencies = {}
  selected_tweets = (
      tweet for tweet, label in zip(tweets, labels) if label == target_label
  )
  for tweet in selected_tweets:
    for token in tweet:
      frequencies[token] = frequencies.get(token, 0) + 1
  return frequencies


from collections import Counter

# Split every tweet on whitespace so count_words() iterates over words, not characters.
# `tweets` is overwritten in place, so the isinstance guard leaves already-split
# tweets untouched; without it, re-running this cell raised AttributeError
# because lists have no .split().
tweets = [tweet.split() if isinstance(tweet, str) else tweet for tweet in tweets]
# negative tweets: the 100 most frequent words with their counts
print("NEGATIVE TWEETS:")
negative = count_words(0)
print(dict(Counter(negative).most_common(100)))
# print the number of unique words in the negative tweets
print(len(negative))

# positive tweets: the 100 most frequent words with their counts
print("\n\n\n POSITIVE TWEETS:")
positive = count_words(1)
print(dict(Counter(positive).most_common(100)))
# print the number of unique words in the positive tweets
print(len(positive))

NEGATIVE TWEETS:
{'<user>': 557268, 'i': 514794, '(': 461668, 'the': 384629, '...': 361101, '<url>': 359419, ',': 357807, '!': 356189, '.': 337791, 'to': 330581, 'a': 230752, 'and': 220914, 'of': 193821, 'my': 190918, 'you': 183583, 'is': 161676, 'in': 152527, 'me': 152130, '-': 136086, 'for': 133763, '?': 133155, '"': 129374, 'this': 123870, 'it': 117566, ':': 109284, 'so': 97197, 'with': 95146, 'frame': 86014, 'on': 85768, 'that': 83819, 'but': 82429, "i'm": 81933, 'have': 79842, '/': 73215, 'be': 69675, ')': 68844, 'not': 68645, 'just': 63839, 'was': 63622, 'at': 55280, '..': 52972, 'rt': 52687, 'like': 51955, 'no': 51219, 'all': 49390, 'are': 48510, 'now': 47197, 'get': 46593, 'your': 46463, 'up': 46455, 'go': 45615, 'when': 45394, "don't": 44834, 'do': 41269, 'one': 40712, '&': 40295, 'want': 39987, 'know': 38845, '2': 38684, 'miss': 38611, 'from': 37613, 'u': 36790, 'out': 36603, 'paperback': 35618, 'really': 34951, 'too': 34408, "can't": 33935, 'what': 33443, 'we': 33327, 'why':