## Import statements

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import nltk
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/haran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Getting the Dataset

In [3]:
test_path = "../data/test.csv"
train_path = "../data/train.csv"

### Specifying the device  that's resposnible for loading a tensor into memory

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

## Importing the dataset

In [5]:
train_df = pd.read_csv(train_path, on_bad_lines='skip')
train_df

Unnamed: 0,Id,Category,Tweet
0,635769805279248384,negative,Not Available
1,635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
2,635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
3,636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
4,636100906224848896,positive,Not sure how to start your publication on iOS?...
...,...,...,...
5965,639016598477651968,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,640276909633486849,neutral,Rib injury for Zlatan against Russia is a big ...
5967,640296841725235200,neutral,Noooooo! I was hoping to see Zlatan being Zlat...
5968,641017384908779520,neutral,Not Available


In [6]:
test_df = pd.read_csv(test_path, on_bad_lines='skip')
test_df

Unnamed: 0,Id,Category
0,6.289494e+17,dear @Microsoft the newOoffice for Mac is grea...
1,6.289766e+17,@Microsoft how about you make a system that do...
2,6.290232e+17,Not Available
3,6.291792e+17,Not Available
4,6.291863e+17,If I make a game as a #windows10 Universal App...
...,...,...
9963,,
9964,,
9965,,
9966,,


# Data Pre-Processing

### First we clean up the training data.
-> We get rid of the "Id" column as its not useful for our model 


-> We then remove all the Null values in our data

In [7]:
train_df = train_df.drop(columns=["Id"])
train_df = train_df.dropna()
train_df = train_df[train_df["Tweet"] != "Not Available"]
train_df

Unnamed: 0,Category,Tweet
1,neutral,IOS 9 App Transport Security. Mm need to check...
2,neutral,"Mar if you have an iOS device, you should down..."
3,negative,@jimmie_vanagon my phone does not run on lates...
4,positive,Not sure how to start your publication on iOS?...
5,neutral,"Two Dollar Tuesday is here with Forklift 2, Qu..."
...,...,...
5963,positive,"Ok ed let's do this, Zlatan, greizmann and Lap..."
5964,neutral,Goal level: Zlatan 90k by Friday? = Posting e...
5965,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,neutral,Rib injury for Zlatan against Russia is a big ...


### Now we will look at the test data

-> We rename the category column to tweets

-> And then we apply the same steps as above

In [8]:
test_df = test_df.rename(columns={"Category":"Tweet"})

In [9]:
test_df = test_df.drop(columns=["Id"])
test_df = test_df.dropna()
test_df = test_df[test_df["Tweet"] != "Not Available"]
test_df

Unnamed: 0,Tweet
0,dear @Microsoft the newOoffice for Mac is grea...
1,@Microsoft how about you make a system that do...
4,If I make a game as a #windows10 Universal App...
5,"Microsoft, I may not prefer your gaming branch..."
6,@MikeWolf1980 @Microsoft I will be downgrading...
...,...
3994,Anybody with a Steak &amp; Shake or IHOP move ...
3995,I am assembling an epic Pancake Posse for an I...
3996,do you work at Ihop tomorrow @carlysunshine_
3997,23 Aug 00;30 #771NAS Rescue193 returned from T...


### Dealing with the Class Imbalance

In [10]:
train_df['Category'].value_counts()

positive    2599
neutral     1953
negative     869
Tweet          1
Name: Category, dtype: int64

When we have surpervised datasets, the number of instances of each class needs to be balanced. An imbalance in the training set could skew our model and reduce its accurate. 

There are many methods that can be used to deal with this such as oversampling,undersampling and augmentation. In this case undersampling is used.

In undersampling we decrease the number of instances of the majority class. In this case we remove tweets from the neutral and positive classes until we end up with 869 tweets

In [11]:
neg_df = train_df[train_df["Category"] == "negative"]
pos_df = train_df[train_df["Category"] == "positive"]
neu_df = train_df[train_df["Category"] == "neutral"]

pos_drop = np.random.choice(pos_df.index,2599-869,replace=False)
neu_drop = np.random.choice(neu_df.index,1953-869,replace=False)

pos_undersampled = pos_df.drop(pos_drop)
neu_undersampled = neu_df.drop(neu_drop)

balanced_train_df = pd.concat([neg_df,neu_undersampled,pos_undersampled])

In [12]:
balanced_train_df

Unnamed: 0,Category,Tweet
3,negative,@jimmie_vanagon my phone does not run on lates...
29,negative,@_tomcc @bartwerf please stop walking and work...
36,negative,"@DeltaAssist yes, I asked an attendant to prin..."
91,negative,ok inbox on iOS has a pretty animation for the...
100,negative,Lost my iPad for the 2nd time today
...,...,...
5908,positive,I have a sneaking feeling we may just see Zlat...
5917,positive,"Orun ni zlatan sun niyen smh""@ZIatanFacts: Zla..."
5921,positive,Zlatan says he may have played his last match ...
5945,positive,Reserving the right to put Milan 3rd if Zlatan...


In [13]:
balanced_train_df["Category"].value_counts()

negative    869
neutral     869
positive    869
Name: Category, dtype: int64

### Splitting the data into test and training sets

This has to be done as our test data is not labelled

In [14]:
train_b_df,test_b_df = train_test_split(balanced_train_df,test_size = 0.15)

In [15]:
train_b_df

Unnamed: 0,Category,Tweet
1956,neutral,It's the Atlanta Falcons (1-0) against the New...
1163,positive,@realDonaldTrump @DanScavino @MichaelCohen212 ...
1529,neutral,I wonder why Drake didn't release the 2nd vers...
2114,positive,@CollinceBey I admire Michelle Obama more. Can...
4934,positive,@LoriJulia what is that book that is supposedl...
...,...,...
2592,neutral,Nintendo of Europe decreased the size Target's...
4576,negative,@MelissaPonzio1 crying over the teen wolf fina...
623,neutral,Joe Biden or Donald Trump may be the next pres...
5840,negative,@KonamiSupport yo wtf why can't you fix the se...


In [16]:
test_b_df

Unnamed: 0,Category,Tweet
3617,negative,@politibunny @brakinggnus @dangoddu @jbryant71...
5586,positive,Oi Taco Tuesday's is the one! Best Burrito's o...
4574,positive,Im going to finish now the 2nd season of teen ...
4426,positive,Can Taylor Swift play her cover of Drops Of Ju...
5072,negative,"Fwiw, my own parental household may make me ba..."
...,...,...
4246,positive,"And @TripleH may have "" hand picked "" @WWERoll..."
2969,positive,"Oracle, memcached, Engage API all running in D..."
3703,negative,What's with this Twitter account? It's like th...
393,negative,"I'm crushed ""It's a hoax: Jay-Z and Beyonce ar..."


#### Conversion of both dataframes to lists for easier manipulation. This can be done as the data size is relatively small

In [17]:
train_set = list(train_b_df.to_records(index=False))
test_set = list(test_b_df.to_records(index=False))

In [18]:
train_set[:5]

[('neutral', "It's the Atlanta Falcons (1-0) against the New York Jets (0-1) at 7:30 p.m. on Friday in MetLife stadi... http://t.co/juvQA71td5 #RiseUp"),
 ('positive', '@realDonaldTrump @DanScavino @MichaelCohen212 -YOU R THE BEST NEGOTIATOR.Wish it was U instead of Kerry that went. https://t.co/5Rcn2UJ5d4'),
 ('neutral', "I wonder why Drake didn't release the 2nd verse on Madonna all at once. I think it's better than the first one."),
 ('positive', '@CollinceBey I admire Michelle Obama more. Cant imagine how she survived being in a remote African village for 1st time! She deserved WH!'),
 ('positive', "@LoriJulia what is that book that is supposedly about Katie Holmes and Tom Cruise? I'm going to Barnes&amp; Noble tomorrow! #help")]

In [19]:
test_set[:5]

[('negative', '@politibunny @brakinggnus @dangoddu @jbryant710 SCOTUS violated the 10th Amendment to the U.S. Constitution not allowing #KimDavis #1A right'),
 ('positive', "Oi Taco Tuesday's is the one! Best Burrito's on the west coast awards 1. San Diego, PB 2. San Francisco, Mission 3. Venice Beach, LA"),
 ('positive', 'Im going to finish now the 2nd season of teen wolf'),
 ('positive', 'Can Taylor Swift play her cover of Drops Of Jupiter tomorrow at the concert ok pls and thank you'),
 ('negative', 'Fwiw, my own parental household may make me batty if "hopeful" Trump stays legitimate into the primaries ... I\'ll snap.')]

#### As we can see the tweets have to be cleaned, special symbols and links have to be removed

In [20]:
# We create a function to do the cleaning, which can then be applied on the list
def tweet_clean(tweet):
    link_pattern = "https?:\/\/t.co/[\w]+"
    mention_pattern = "@\w+"
    tweet = re.sub(link_pattern, "", tweet)
    tweet = re.sub(mention_pattern, "", tweet)
    return tweet.lower()

### Tokenization


Here we use the `word_tokenize()` function from the NLTK library to split the sentence into tokens or words

In [21]:
train_set = [(label,word_tokenize(tweet_clean(tweet))) for label,tweet in train_set]
train_set[:1]

[('neutral',
  ['it',
   "'s",
   'the',
   'atlanta',
   'falcons',
   '(',
   '1-0',
   ')',
   'against',
   'the',
   'new',
   'york',
   'jets',
   '(',
   '0-1',
   ')',
   'at',
   '7:30',
   'p.m.',
   'on',
   'friday',
   'in',
   'metlife',
   'stadi',
   '...',
   '#',
   'riseup'])]

In [22]:
test_set = [(label,word_tokenize(tweet_clean(tweet))) for label,tweet in test_set]
test_set[:1]

[('negative',
  ['scotus',
   'violated',
   'the',
   '10th',
   'amendment',
   'to',
   'the',
   'u.s.',
   'constitution',
   'not',
   'allowing',
   '#',
   'kimdavis',
   '#',
   '1a',
   'right'])]