## Import statements

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import nltk
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt


In [2]:
# Download the NLTK "punkt" package; presumably required by word_tokenize (imported above) — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/haran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Getting the Dataset

In [3]:
# Relative paths to the test and train CSV files that are loaded with pd.read_csv further down.
test_path = "../data/test.csv"
train_path = "../data/train.csv"

### Specifying the device that's responsible for loading a tensor into memory

In [4]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cpu')

## Importing the dataset

In [5]:
# Load the training CSV; on_bad_lines='skip' makes pandas skip malformed rows instead of raising.
train_df = pd.read_csv(train_path, on_bad_lines='skip')
# Display the loaded frame.
train_df

Unnamed: 0,Id,Category,Tweet
0,635769805279248384,negative,Not Available
1,635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
2,635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
3,636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
4,636100906224848896,positive,Not sure how to start your publication on iOS?...
...,...,...,...
5965,639016598477651968,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,640276909633486849,neutral,Rib injury for Zlatan against Russia is a big ...
5967,640296841725235200,neutral,Noooooo! I was hoping to see Zlatan being Zlat...
5968,641017384908779520,neutral,Not Available


In [6]:
# Load the test CSV; on_bad_lines='skip' makes pandas skip malformed rows instead of raising.
# NOTE(review): in the output below the "Category" column holds the tweet text and the trailing rows are NaN.
test_df = pd.read_csv(test_path, on_bad_lines='skip')
# Display the loaded frame.
test_df

Unnamed: 0,Id,Category
0,6.289494e+17,dear @Microsoft the newOoffice for Mac is grea...
1,6.289766e+17,@Microsoft how about you make a system that do...
2,6.290232e+17,Not Available
3,6.291792e+17,Not Available
4,6.291863e+17,If I make a game as a #windows10 Universal App...
...,...,...
9963,,
9964,,
9965,,
9966,,


# Data Pre-Processing

### First we clean up the training data.
-> We get rid of the "Id" column as it's not useful for our model 


-> We then remove all the null values in our data, as well as the tweets whose text is just "Not Available"

In [7]:
# Clean the training frame: remove the "Id" column, rows with missing values,
# and tweets whose text is the placeholder "Not Available".
# errors="ignore" keeps this cell re-runnable: once "Id" has been dropped, a
# second run would otherwise raise KeyError.
train_df = train_df.drop(columns=["Id"], errors="ignore")
train_df = train_df.dropna()
train_df = train_df[train_df["Tweet"] != "Not Available"]
# Display the cleaned frame.
train_df

Unnamed: 0,Category,Tweet
1,neutral,IOS 9 App Transport Security. Mm need to check...
2,neutral,"Mar if you have an iOS device, you should down..."
3,negative,@jimmie_vanagon my phone does not run on lates...
4,positive,Not sure how to start your publication on iOS?...
5,neutral,"Two Dollar Tuesday is here with Forklift 2, Qu..."
...,...,...
5963,positive,"Ok ed let's do this, Zlatan, greizmann and Lap..."
5964,neutral,Goal level: Zlatan 90k by Friday? = Posting e...
5965,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,neutral,Rib injury for Zlatan against Russia is a big ...


### Now we will look at the test data

-> We rename the "Category" column to "Tweet", because in the test file that column holds the tweet text

-> And then we apply the same steps as above

In [8]:
# In the test file the tweet text is stored under "Category" (see the frame displayed above), so rename that column to "Tweet".
test_df = test_df.rename(columns={"Category":"Tweet"})

In [9]:
# Clean the test frame the same way as the training frame: remove the "Id"
# column, rows with missing values, and "Not Available" placeholder tweets.
# errors="ignore" keeps this cell re-runnable: once "Id" has been dropped, a
# second run would otherwise raise KeyError.
test_df = test_df.drop(columns=["Id"], errors="ignore")
test_df = test_df.dropna()
test_df = test_df[test_df["Tweet"] != "Not Available"]
# Display the cleaned frame.
test_df

Unnamed: 0,Tweet
0,dear @Microsoft the newOoffice for Mac is grea...
1,@Microsoft how about you make a system that do...
4,If I make a game as a #windows10 Universal App...
5,"Microsoft, I may not prefer your gaming branch..."
6,@MikeWolf1980 @Microsoft I will be downgrading...
...,...
3994,Anybody with a Steak &amp; Shake or IHOP move ...
3995,I am assembling an epic Pancake Posse for an I...
3996,do you work at Ihop tomorrow @carlysunshine_
3997,23 Aug 00;30 #771NAS Rescue193 returned from T...


### Dealing with the Class Imbalance

In [10]:
# Count how many tweets fall under each Category value.
train_df['Category'].value_counts()

positive    2599
neutral     1953
negative     869
Tweet          1
Name: Category, dtype: int64

When we have supervised datasets, the number of instances of each class should be balanced. An imbalance in the training set could skew our model and reduce its accuracy. 

There are many methods that can be used to deal with this, such as oversampling, undersampling and augmentation. In this case undersampling is used.

In undersampling we decrease the number of instances of the majority classes. In this case we remove tweets from the neutral and positive classes until each of them is left with 869 tweets, the size of the negative class.

In [11]:
# Undersample the larger classes down to the size of the "negative" class so
# that every sentiment label contributes the same number of tweets.
# A dedicated, seeded generator makes Restart & Run All pick the same rows.
UNDERSAMPLE_SEED = 42
undersample_rng = np.random.default_rng(UNDERSAMPLE_SEED)

neg_df = train_df[train_df["Category"] == "negative"]
pos_df = train_df[train_df["Category"] == "positive"]
neu_df = train_df[train_df["Category"] == "neutral"]

# "negative" is the smallest class (see the value_counts output above); its
# size is read from the data so the cell keeps working if the row counts change.
target_size = len(neg_df)

# Index labels of the surplus rows to discard from each larger class.
# max(..., 0) means a class that is already at or below the target loses nothing.
pos_drop = undersample_rng.choice(
    pos_df.index.to_numpy(), max(len(pos_df) - target_size, 0), replace=False
)
neu_drop = undersample_rng.choice(
    neu_df.index.to_numpy(), max(len(neu_df) - target_size, 0), replace=False
)

pos_undersampled = pos_df.drop(pos_drop)
neu_undersampled = neu_df.drop(neu_drop)

# Stack the three equally sized class frames (negative, neutral, positive).
balanced_train_df = pd.concat([neg_df,neu_undersampled,pos_undersampled])

In [12]:
# Display the undersampled training frame.
balanced_train_df

Unnamed: 0,Category,Tweet
3,negative,@jimmie_vanagon my phone does not run on lates...
29,negative,@_tomcc @bartwerf please stop walking and work...
36,negative,"@DeltaAssist yes, I asked an attendant to prin..."
91,negative,ok inbox on iOS has a pretty animation for the...
100,negative,Lost my iPad for the 2nd time today
...,...,...
5932,positive,Scenes when Benzema walks out of tunnel tomorr...
5943,positive,"Zlatan , will your perfumes make me perform ov..."
5945,positive,Reserving the right to put Milan 3rd if Zlatan...
5952,positive,"Mourinho, Zlatan, Serie A - who are the winner..."


In [13]:
# Count the tweets per class in the undersampled frame.
balanced_train_df["Category"].value_counts()

negative    869
neutral     869
positive    869
Name: Category, dtype: int64

### Splitting the data into test and training sets

This has to be done as our test data is not labelled

In [15]:
# Hold out 15% of the balanced data for evaluation.
# random_state makes the split reproducible across kernel restarts; stratify
# preserves the class proportions of balanced_train_df in both parts.
SPLIT_SEED = 42
train_b_df, test_b_df = train_test_split(
    balanced_train_df,
    test_size=0.15,
    random_state=SPLIT_SEED,
    stratify=balanced_train_df["Category"],
)

In [16]:
# Display the training part of the split.
train_b_df

Unnamed: 0,Category,Tweet
2021,negative,I was sat with my auntie who has seen Michael ...
4838,negative,@DylanBuckingham @ColbyJPowell @ToddOnFranchis...
177,neutral,The Accounting Career Fair is this Friday! Be ...
940,negative,"@justinlarson34 Justin, I think you got that b..."
2061,positive,"On 6 September 2001, Michael Jackson made a su..."
...,...,...
4994,neutral,"@BlissettCarl @GeneralBoles @guardian 434,000 ..."
2821,negative,@rainbowrowell Now envisioning Tues night FTs ...
5660,positive,There are still a few tix left for the 8PM Liv...
530,negative,@AndreaTantaros Jeb Bush appears to be afraid...


In [17]:
# Display the held-out part of the split.
test_b_df

Unnamed: 0,Category,Tweet
2723,positive,Facebook just achieved 1 billion active users ...
2543,positive,I may have screamed when I saw this in the Nik...
23,positive,@androidcentral Even though there are other di...
4455,neutral,@RoyHarperInRed [Dino sat and Abby babbled tha...
3196,neutral,We add our regrets for yesterday's tragedy at ...
...,...,...
3900,neutral,"The moon is an errant thief, And her pale fire..."
5284,neutral,@francesbarber13 Did you read this Frances? Co...
5334,negative,My Ma's so fucking UKIP. She may as well be sm...
3463,negative,"Line of the day: ""Rick Perry has the mental ap..."


#### Conversion of both dataframes to lists for easier manipulation. This can be done as the data size is relatively small

In [20]:
# Turn each split into a Python list of (Category, Tweet) records; the frame index is left out.
train_set, test_set = (
    list(split_df.to_records(index=False))
    for split_df in (train_b_df, test_b_df)
)

In [23]:
# Peek at the first five training records.
train_set[:5]

[('negative', 'I was sat with my auntie who has seen Michael Jackson live &amp; I had to watch a shitty Bruno Mars tribute try to sing Billie Jean'),
 ('negative', '@DylanBuckingham @ColbyJPowell @ToddOnFranchise Tiger Woods did not play well on Sunday, wait make that all year #Tigersucks'),
 ('neutral', 'The Accounting Career Fair is this Friday! Be sure to engage with us by using #kubusfair--you might win an iPad mini!'),
 ('negative', '@justinlarson34 Justin, I think you got that backwards. See my Sep 3 post on Obamanomics outpacing Daugaardonomics: http://t.co/hYwRzTs4B0'),
 ('positive', 'On 6 September 2001, Michael Jackson made a surprise appearance at the mtv vma where she danced with * Nsync. http://t.co/EwlitvXQML')]

In [24]:
# Peek at the first five held-out records.
test_set[:5]

[('positive', 'Facebook just achieved 1 billion active users in a day on Monday. At its peak, 1.1 billion people carried a Nokia phone with them.'),
 ('positive', 'I may have screamed when I saw this in the Nike store http://t.co/AvcDJgF64x'),
 ('positive', '@androidcentral Even though there are other differences in the IOS features this still may make people lean a little more towards Apple.'),
 ('neutral', '@RoyHarperInRed [Dino sat and Abby babbled that she liked Taylor Swift, The Fray, One republic, and such.]'),
 ('neutral', "We add our regrets for yesterday's tragedy at the Pride Parade. May Peace Happen during our time.  Shabbat Shalimar  https://t.co/vh3nLT2Yef")]

#### As we can see, the tweets have to be cleaned: special symbols and links have to be removed

In [None]:
# We create a function to do the cleaning, which can then be applied on the list
def tweet_clean(tweet):
    """Strip t.co links and @mentions from a tweet and lower-case the result.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        The tweet with every ``http(s)://t.co/<token>`` link and every
        ``@handle`` mention removed, converted to lower case. Whitespace that
        surrounded a removed span is left untouched.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # handling (non-raw "\w" and "\/" are invalid string escapes and warn).
    # The dot in "t.co" is escaped so that it only matches a literal ".".
    link_pattern = r"https?://t\.co/\w+"
    mention_pattern = r"@\w+"
    tweet = re.sub(link_pattern, "", tweet)
    tweet = re.sub(mention_pattern, "", tweet)
    return tweet.lower()