# Setup

In [58]:
!pip install nltk



In [59]:
import numpy as np
import sklearn
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [60]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [61]:
# constants and global variables
# Path template for the dataset files on the mounted Drive: the `{}`
# placeholder is filled with a file name via DATA_PATH.format(<file name>).
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/CIL/Dataset/{}'

# Read Data

In [62]:
# Module-level accumulators filled by load_tweets: tweets[k] is the text of
# the k-th line read and labels[k] is the label it was loaded with.
tweets, labels = [], []


def load_tweets(filename, label):
    """Read `filename` line by line into the global `tweets` / `labels` lists.

    Each line is stripped of trailing whitespace (including the newline) and
    appended to `tweets`; `label` is appended to `labels` once per line, so
    both lists stay aligned.

    Args:
        filename: path of a utf-8 text file containing one tweet per line.
        label: value recorded in `labels` for every tweet of this file.
    """
    with open(filename, mode='r', encoding='utf-8') as source:
        # map() is lazy, so lines are still consumed one at a time.
        for text in map(str.rstrip, source):
            tweets.append(text)
            labels.append(label)

# load training tweets: the negative file is loaded with label 0,
# the positive file with label 1
load_tweets(DATA_PATH.format('train_neg_full.txt'), 0)
load_tweets(DATA_PATH.format('train_pos_full.txt'), 1)
print(f'{len(tweets)} training/dev tweets loaded')
# Convert to NumPy array to facilitate indexing
tweets = np.array(tweets)
labels = np.array(labels)

# load the test file
# Each line is assumed to look like "<id>,<tweet text>" (TODO confirm against
# the dataset description). Split on the FIRST comma only: the tweet text may
# itself contain commas, and a plain split(',') would silently discard
# everything after the first comma inside the tweet.
# A line without any comma still raises IndexError, as before.
with open(DATA_PATH.format('test_data.txt'), 'r', encoding='utf-8') as f:
  X_test = [line.split(',', 1)[1].rstrip() for line in f]
X_test = np.array(X_test)
print(f'{len(X_test)} test tweets loaded')


# Sanity check: show the first 10 training tweets, each followed by its label.
for sample_idx in range(10):
    print(tweets[sample_idx], labels[sample_idx], sep='\n')

# Then show the first 10 test tweets.
for sample_idx in range(10):
    print(X_test[sample_idx])

2500000 training/dev tweets loaded
10000 test tweets loaded
vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>
0
glad i dot have taks tomorrow ! ! #thankful #startho
0
1-3 vs celtics in the regular season = were fucked if we play them in the playoffs
0
<user> i could actually kill that girl i'm so sorry ! ! !
0
<user> <user> <user> i find that very hard to believe im afraid
0
wish i could be out all night tonight ! <user>
0
<user> i got kicked out the wgm
0
rt <user> <user> <user> yes she is ! u tell it ! my lips are closed okay
0
why is she so perfect <url>
0
<user> hi harry ! did u havea good time in aus ? i didnt get 2 see u maybe next year ! follow me back if u can , would bea dreamcome truex
0
sea doo pro sea scooter ( sports with the portable sea-doo seascootersave air
<user> shucks well i work all week so now i can't come cheer you on ! oh and put those batteries in your calculator ! ! !
i cant stay away fro

# Preprocess

In [63]:
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Shared preprocessing resources, built once and reused for every tweet.

# Tokenizing sentence: keep runs of word characters only, so punctuation is
# dropped and markers such as "<user>" / "<url>" become "user" / "url".
tokenizer = RegexpTokenizer(r'\w+')

# Removing stopwords: a set gives constant-time membership tests; the list
# returned by stopwords.words() would be scanned linearly for every token.
stopwords_set = set(stopwords.words('english'))

# Lemmatizing
lemmatizer = WordNetLemmatizer()

# remove numbers: re.match anchors at the start of the string only, so this
# drops every token that BEGINS with a digit (e.g. "10", but also "2nd").
leading_digit = re.compile("([0-9])+")

# remove user, url and other common words
forbidden_words = ["url", "user"]


def preprocess_tweet(tweet):
    """Turn one raw tweet string into a list of cleaned tokens.

    Steps, applied in this order to keep the original pipeline's semantics:
    lowercase -> tokenize on word characters -> drop English stopwords ->
    lemmatize -> drop tokens starting with a digit -> drop forbidden words.
    The last two filters run on the lemmatized token.

    Args:
        tweet: raw tweet text (any object with a str-like `.lower()`).

    Returns:
        list of str: the surviving lemmatized tokens, in their original order.
    """
    cleaned = []
    for token in tokenizer.tokenize(tweet.lower()):
        if token in stopwords_set:
            continue
        lemma = lemmatizer.lemmatize(token)
        if leading_digit.match(lemma) is not None or lemma in forbidden_words:
            continue
        cleaned.append(lemma)
    return cleaned


# NOTE: this rebinds `tweets` and `X_test` from raw strings to token lists,
# so the cell must be run exactly once after the data-loading cell.
# A single pass per tweet avoids materialising six intermediate copies of
# the whole corpus.
tweets = [preprocess_tweet(tweet) for tweet in tweets]
X_test = [preprocess_tweet(tweet) for tweet in X_test]


In [64]:
# Eyeball the preprocessing result: the first 20 training tweets, a blank
# gap, then the first 20 test tweets.
for block_no, samples in enumerate((tweets, X_test)):
  if block_no > 0:
    print("\n\n\n")
  for row in range(20):
    print(samples[row])

['vinco', 'tresorpack', 'difficulty', 'object', 'disassemble', 'reassemble', 'wooden', 'piece', 'beautiful', 'wo']
['glad', 'dot', 'taks', 'tomorrow', 'thankful', 'startho']
['v', 'celtic', 'regular', 'season', 'fucked', 'play', 'playoff']
['could', 'actually', 'kill', 'girl', 'sorry']
['find', 'hard', 'believe', 'im', 'afraid']
['wish', 'could', 'night', 'tonight']
['got', 'kicked', 'wgm']
['rt', 'yes', 'u', 'tell', 'lip', 'closed', 'okay']
['perfect']
['hi', 'harry', 'u', 'havea', 'good', 'time', 'au', 'didnt', 'get', 'see', 'u', 'maybe', 'next', 'year', 'follow', 'back', 'u', 'would', 'bea', 'dreamcome', 'truex']
['introduction', 'programming', 'c', 'edition', 'solid', 'foundation', 'basic', 'c', 'programming']
['introduction', 'programming', 'c', 'edition', 'solid', 'foundation', 'basic', 'c', 'programming']
['introduction', 'programming', 'c', 'edition', 'solid', 'foundation', 'basic', 'c', 'programming']
['white', 'aw']
['dan', 'love', 'miss', 'sad', 'wheresthegeneral']
['many', 

In [65]:
# shuffle the training data
from sklearn.utils import shuffle

# Tweets and labels are shuffled together so each tweet keeps its label; the
# fixed random_state makes the order reproducible. Only X_train / y_train are
# shuffled copies: `tweets` and `labels` keep their original order.
X_train, y_train = shuffle(tweets, labels, random_state=84)

In [66]:
# save X data to a file
def save_X_data(file_name, X):
  """Write tokenized tweets to a file, one space-joined tweet per line.

  Args:
    file_name: file name substituted into the DATA_PATH template.
    X: iterable of token sequences; the tokens of each element are joined
      with single spaces and terminated by a newline.
  """
  # `with` closes the file even if a write fails. The explicit encoding
  # matches the utf-8 used when reading the raw files instead of relying on
  # the platform default.
  with open(DATA_PATH.format(file_name), "w", encoding="utf-8") as f:
    f.writelines(" ".join(element) + "\n" for element in X)

# Persist the shuffled, preprocessed training tweets and the preprocessed
# test tweets next to the raw dataset files.
save_X_data("X_train_processed.txt", X_train)
save_X_data("X_test_processed.txt", X_test)

In [67]:
# save labels to file
# One label per line, written in the order of y_train, i.e. aligned with the
# rows of X_train. `with` guarantees the file is closed even if a write fails.
with open(DATA_PATH.format("y_train.txt"), "w", encoding="utf-8") as f:
  f.writelines(str(label) + "\n" for label in y_train)

# INITIAL EXPLORATION

In [68]:
# Class balance: count the tweets carrying each sentiment label
# (0 = negative, 1 = positive).
negative_mask = labels == 0
positive_mask = labels == 1
print(f"In our dataset there are {negative_mask.sum()} negative tweets")
print(f"In our dataset there are {positive_mask.sum()} positive tweets")

In our dataset there are 1250000 negative tweets
In our dataset there are 1250000 positive tweets


In [69]:
# get the average length of positive and negative examples
def get_average_length(target_label):
  """Return the mean length of the tweets labelled `target_label`.

  Walks the module-level `tweets` and `labels` in parallel and averages
  len(tweet) over the matching entries (the number of tokens once the
  tweets have been preprocessed into token lists).

  Args:
    target_label: label value selecting which tweets are averaged.

  Returns:
    The mean of the selected lengths as a NumPy float.
  """
  matching_lengths = [
      len(tokens)
      for tokens, tweet_label in zip(tweets, labels)
      if tweet_label == target_label
  ]
  return np.array(matching_lengths).mean()

# NOTE: negative tweets seem to be longer on average than positive tweets
# (length = number of tokens left after preprocessing).
# TODO: check if this can be used to our advantage in training
print(f"The average length of tweets with a negative sentiment is: {get_average_length(0)}")
print(f"The average length of tweets with a positive sentiment is: {get_average_length(1)}")

The average length of tweets with a negative sentiment is: 8.3099912
The average length of tweets with a positive sentiment is: 6.3471352


In [70]:
# get most frequent words in positive and negative examples
def count_words(target_label):
  """Count word occurrences over all tweets labelled `target_label`.

  Reads the module-level `tweets` and `labels` in parallel.

  Args:
    target_label: label value selecting which tweets are counted.

  Returns:
    dict mapping each word to its number of occurrences; keys appear in
    order of first occurrence.
  """
  frequency = {}
  selected = (
      tokens
      for tokens, tweet_label in zip(tweets, labels)
      if tweet_label == target_label
  )
  for tokens in selected:
    for token in tokens:
      if token in frequency:
        frequency[token] += 1
      else:
        frequency[token] = 1
  return frequency


from collections import Counter


# negative tweets
print("NEGATIVE TWEETS:")
negative = count_words(0)
# show the 100 most frequent words together with their counts
print(dict(Counter(negative).most_common(100)))
# print the number of unique words in the negative tweets
print(len(negative))

# positive tweets
print("\n\n\n POSITIVE TWEETS:")
positive = count_words(1)
print(dict(Counter(positive).most_common(100)))
# print the number of unique words in the positive tweets
print(len(positive))

NEGATIVE TWEETS:
{'frame': 102464, 'rt': 56658, 'like': 55673, 'u': 52488, 'get': 52067, 'go': 50980, 'one': 48103, 'want': 45193, 'paperback': 43726, 'miss': 42609, 'know': 42561, 'day': 38829, 'really': 36500, 'time': 35358, 'love': 34868, 'pack': 32850, 'black': 32752, 'see': 32029, 'x': 31429, 'please': 31428, 'complete': 31283, 'back': 31109, 'picture': 30957, 'need': 30432, 'lol': 30146, 'wish': 29888, 'got': 28315, 'im': 28047, 'going': 27808, 'today': 27585, 'wide': 27324, 'feel': 26855, 'poster': 26620, 'new': 26317, 'custom': 26146, 'make': 24955, 'sad': 24903, 'think': 24530, 'work': 24335, 'good': 24202, 'come': 23695, 'never': 23377, 'much': 23338, 'still': 23055, 'edition': 22846, 'hate': 22774, 'follow': 21341, 'home': 20523, 'wanna': 20324, 'would': 20053, 'sorry': 19781, 'friend': 19595, 'right': 19427, 'life': 19338, 'year': 19174, 'book': 18919, 'school': 18628, 'bad': 18589, 'last': 18375, 'oh': 18193, 'even': 17355, 'night': 17016, 'could': 16928, 'hardcover': 1688