# Machine Learning Project 2

In [1]:
# Imports
import pandas as pd
import numpy as np

## Data Loading

In [2]:
data_path = '../data/'

# Load training set: one frame per class, each tagged with its label (+1 / -1).
# NOTE(review): sep='.\n' is presumably chosen so that a line is never split into
# several fields (one tweet per row) -- the checks printed below verify the row count.
pos = pd.read_table(
    data_path+'train_pos_full.txt',
    sep='.\n',
    names=['tweet'],
    engine='python',
).assign(label=1)
print(f"Loaded POS data, correctly interpreted 1-tweet-per-line fashion : {pos.shape[0]==1_250_000}")

neg = pd.read_table(
    data_path+'train_neg_full.txt',
    sep='.\n',
    names=['tweet'],
    engine='python',
).assign(label=-1)
print(f"Loaded NEG data, correctly interpreted 1-tweet-per-line fashion : {neg.shape[0]==1_250_000}")

print(f"Data sizes : (POS) {pos.shape[0]} (NEG) {neg.shape[0]}\n")

Loaded POS data, correctly interpreted 1-tweet-per-line fashion : True
Loaded NEG data, correctly interpreted 1-tweet-per-line fashion : True
Data sizes : (POS) 1250000 (NEG) 1250000



In [3]:
# Stack the positive and negative tweets into one frame with a fresh 0..N-1 index.
# The two frames share no (tweet, label) pair because their labels differ, so a
# plain concatenation yields the same rows as the former outer merge, without the
# cost of a join and without depending on how pandas orders outer-join results.
tweets = pd.concat([pos, neg], ignore_index=True)
tweets

Unnamed: 0,tweet,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,""" <user> just put casper in a box ! "" looved t...",1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
2499995,im so sorry ! <user> & to <user> & <user> u gu...,-1
2499996,i can't find food coloring anywhere,-1
2499997,<user> same here ! ! but tort ! ! wonder why y...,-1
2499998,keyless entry remote fob clicker for 2005 buic...,-1


In [4]:
# Load word embeddings
embeddings = np.load(data_path + 'embeddings.npy')

# Loading vocab for verification.
# Selecting the single 'word' column yields the same Series that the reader's
# `squeeze=True` argument used to return; that argument is not accepted by recent
# pandas releases.
words = pd.read_table(data_path + 'vocab_cut.txt', sep='.\n', names=['word'], engine='python', na_values=np.nan)['word']

print(f'Both the embeddings and the vocabulary are same size :  {len(embeddings)==words.shape[0]}')
print(f"Embeddings: {embeddings.shape}, vocab: {words.shape}")

# Drop NaN values: remove the same row positions from the vocabulary and from the
# embedding matrix.
# NOTE(review): the NaN entries are presumably tokens such as "nan"/"null" that the
# reader parses as missing values -- confirm; keep_default_na=False would retain them.
na_positions = np.flatnonzero(words.isna().values)
# Re-number the surviving words 0..n-1 so that a word's index is again the row of
# its vector in the shrunken embedding table. Without this reset every word located
# after a dropped row would point at the wrong embedding.
words = words.dropna().reset_index(drop=True)
embeddings = pd.DataFrame(np.delete(embeddings, na_positions, axis=0))
print(f'NA values were dropped in both tables: {len(embeddings)==words.shape[0]}')
print(f"Embeddings: {embeddings.shape}, vocab: {words.shape}")

# Index by words for faster index-for-word search
# (column 0 holds the embedding row number, the index holds the word itself)
words = pd.DataFrame(data=words.index, index=words.values)

Both the embeddings and the vocabulary are same size :  True
Embeddings: (101298, 20), vocab: (101298,)
NA values were dropped in both tables: True
Embeddings: (101296, 20), vocab: (101296,)


In [5]:
# Display the embedding table (rows for the dropped NaN vocabulary entries were removed above).
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.009185,0.011479,0.063221,0.000375,-0.007730,0.025430,0.082799,-0.048565,-0.024218,-0.006323,-0.023891,0.002975,0.016852,-0.001515,0.033423,-0.087211,0.020931,-0.018832,-0.031254,-0.088040
1,0.042616,0.041186,0.164827,0.011299,-0.028849,0.079215,0.197703,-0.134167,-0.039585,-0.037801,-0.051231,0.031978,0.050672,-0.018207,0.079526,-0.207281,0.043563,-0.058997,-0.037311,-0.170586
2,0.022646,0.071115,0.205304,-0.021073,-0.011913,0.110779,0.252850,-0.194063,-0.095455,-0.066921,-0.090069,0.033882,0.076715,-0.009607,0.133667,-0.288021,0.001275,-0.101983,-0.039446,-0.240086
3,0.007738,0.017361,0.056530,0.000076,-0.004024,0.017014,0.084006,-0.063602,-0.018888,-0.011406,-0.029956,0.002158,0.023653,0.006917,0.039024,-0.073221,0.022820,-0.011876,-0.021259,-0.074184
4,0.033395,0.026300,0.118941,-0.010973,-0.022398,0.058411,0.139001,-0.110853,-0.049507,-0.066235,-0.069652,0.034014,0.058216,-0.017054,0.068081,-0.144027,0.011787,-0.056608,-0.055205,-0.156504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101291,1.026938,0.078492,1.341995,1.600022,-0.539390,1.451380,-0.036710,-0.653556,0.190002,0.163548,1.309862,0.060239,-1.037890,0.123047,1.005304,-0.195947,-0.707598,-0.073776,-1.048488,-0.459352
101292,0.615891,0.720551,1.028040,-1.063300,0.901292,-0.527022,-0.928894,1.239876,-1.698276,-0.034342,1.248783,-0.925440,-0.068863,-0.617603,0.363297,0.443043,-1.017576,-1.381443,1.155187,-0.294162
101293,0.144772,-0.035020,-0.201682,0.599005,1.532933,0.959338,0.514556,-0.307952,0.053528,-0.527661,0.563200,0.304749,0.526832,0.338695,0.089288,0.265958,-0.267435,0.718279,0.338424,-1.710349
101294,-1.112161,-0.323110,0.000280,1.141701,2.165659,0.801806,-1.877650,1.277558,-1.527056,0.855168,0.733934,-0.658135,-1.131002,2.098100,-0.900002,-1.253614,-0.662051,0.145260,-0.430915,-0.573660


## Preprocessing

### Cleaning tags 

In [6]:
# Preprocessing by 
## removing HTML tags
import re
from collections import defaultdict

# We check if the tags are relevant information between both pos and neg cases
def count_HTML_tags(series) :
    """
    Returns stats about the HTML-like tags (e.g. ``<user>``, ``</b>``) in the tweet series.

    A tag is any match of ``<``, optional ``/`` characters, ASCII letters, then ``>``.

    Arguments :
    series (iterable of str) : the tweets to scan.

    Returns :
    dic (defaultdict) : dict of all tags occurrences; unseen tags read as 0.
    count (int) : count of all tags."""
    # Raw string: '\/' is not a valid escape sequence in a normal string literal.
    tag_pattern = re.compile(r'<\/*[a-zA-Z]+>')
    dic = defaultdict(int)
    # Single pass over the tweets; the total is derived from the per-tag counts
    # instead of scanning every tweet a second time.
    for text in series:
        for tag in tag_pattern.findall(text):
            dic[tag] += 1
    count = sum(dic.values())
    return dic, count

# Gather the tag statistics of both classes in one go
(d_pos, n_pos), (d_neg, n_neg) = (
    count_HTML_tags(frame['tweet']) for frame in (pos, neg)
)
all_keys = set(d_pos.keys()) | set(d_neg.keys())

# One table row per distinct tag: occurrences in POS tweets vs. NEG tweets
print(f"|{'KEY':14s}|{'POS':6s}|{'NEG':6s}|")
for k in all_keys:
    print(f"|{k:14s}|{d_pos[k]:6d}|{d_neg[k]:6d}|")

# The percentage in parentheses is the relative difference to the other class
print(
    f"\nPOS tweets contain {n_pos} ({(n_pos-n_neg)*100/n_neg:.2f}%) HTML tags."
)
print(
    f"NEG tweets contain {n_neg} ({(n_neg-n_pos)*100/n_pos:.2f}%) HTML tags."
)

|KEY           |POS   |NEG   |
|<b>           |     1|    26|
|<trans>       |     0|     3|
|<naive>       |     0|     1|
|<hugs>        |     1|     1|
|<moan>        |     0|     1|
|<time>        |     1|     0|
|<dynamic>     |     1|     0|
|<screams>     |     0|     1|
|<gardenstuff> |     2|     0|
|</strong>     |     0|     6|
|<weirdarms>   |     1|     0|
|<parenthood>  |     0|     1|
|<ummm>        |     0|     1|
|</del>        |     0|     1|
|<mikel>       |     1|     0|
|<please>      |     0|     1|
|<ages>        |     0|     1|
|<emotional>   |     0|     1|
|<strong>      |     0|     6|
|<cfoutput>    |     0|     1|
|<ht>          |     0|     1|
|<w>           |     0|     1|
|</em>         |     0|     2|
|<content>     |     0|     1|
|</cfoutput>   |     0|     1|
|<impressive>  |     1|     0|
|<twinkle>     |     0|     1|
|<laugh>       |     1|     0|
|<sigh>        |     0|     3|
|<g>           |     1|     0|
|<blushing>    |     1|     0|
|<hot>  

**Note**: Although the difference in the total number of tags between the two classes is not significant, the distribution of the individual tags differs markedly (in particular for the tags `<url>` and `<user>`). We therefore choose to leave the tags as part of the tweet. **THIS COULD BE REVIEWED TO IMPROVE PERF**

In [7]:
# Clean the HTML tags from the tweets
## CHANGE RETURN VAR IF RELEVANT

def clean_HTML_tags(series) :
    """
    Returns a copy of the tweet series with every HTML-like tag removed.

    A tag is any match of ``<``, optional ``/`` characters, ASCII letters, then ``>``
    (e.g. ``<user>``, ``</b>``). The surrounding text, including whitespace, is kept.

    Arguments :
    series (pd.Series of str) : the tweets to clean.

    Returns :
    pd.Series : the cleaned tweets, same index as the input."""
    # Raw string: '\/' is not a valid escape sequence in a normal string literal.
    return series.str.replace(r'<\/*[a-zA-Z]+>', '', regex=True)

# Tag-free versions of the POS (t) and NEG (t2) tweets; the original frames are left untouched.
t, t2 = (clean_HTML_tags(frame['tweet']) for frame in (pos, neg))

# Training

In [8]:
def compute_mean(tweet, embedding_table=None, vocab=None):
    """
    Returns the mean embedding vector of the known words of a tweet.

    The tweet is split on whitespace; every token found in the vocabulary
    contributes its embedding row, unknown tokens are ignored.

    Arguments :
    tweet (str) : the tweet text.
    embedding_table (pd.DataFrame, optional) : one embedding vector per row.
        Defaults to the notebook-level `embeddings` table.
    vocab (pd.DataFrame, optional) : indexed by word, column 0 holds the row label
        of that word in `embedding_table`. Defaults to the notebook-level `words` table.

    Returns :
    np.ndarray : the mean vector, or a zero vector of the embedding dimension
        when no token of the tweet is in the vocabulary."""
    if embedding_table is None:
        embedding_table = embeddings
    if vocab is None:
        vocab = words

    vectors = []
    for token in tweet.split():
        try:
            vectors.append(embedding_table.loc[vocab.loc[token][0]].values)
        except KeyError:
            # Token (or its row) is not in the tables: skip it. Any other error is
            # a real problem and is no longer silently swallowed.
            continue

    if not vectors:
        # Size the fallback from the table instead of hard-coding the dimension.
        return np.zeros(embedding_table.shape[1])
    return np.mean(vectors, axis=0)

In [9]:
# Take only 10'000 samples for faster computation
from sklearn.utils import resample
# Seeded generator so that the subsample -- and every result derived from it --
# is the same after a kernel restart.
sample_rng = np.random.RandomState(42)
n_samples_per_class = 10000
pos_ = resample(pos, n_samples=n_samples_per_class, replace=False, random_state=sample_rng)
neg_ = resample(neg, n_samples=n_samples_per_class, replace=False, random_state=sample_rng)
# Stack both samples with a fresh 0..N-1 index (an outer merge is a join, not a
# concatenation, and its row order depends on the pandas version).
tweets_ = pd.concat([pos_, neg_], ignore_index=True)
tweets_['mean_embed']= tweets_['tweet'].map(compute_mean)

In [10]:
# Compute mean_embed with the full dataset 
#tweets['mean_embed']= tweets['tweet'].map(compute_mean)

In [11]:
# Display the sampled tweets together with their label and mean embedding vector.
tweets_

Unnamed: 0,tweet,label,mean_embed
0,<user> thankyou babe ! you will be absolutely ...,1,"[0.07298373484494713, 0.21242536470532547, 0.4..."
1,<user> or the boston bread bowls . they are re...,1,"[0.07637531212175129, -0.016053322052290066, 0..."
2,<user> pretty peaceful so far ! only one bicke...,1,"[0.08750596253830949, -0.0458335797357182, 0.3..."
3,<user> aw thank you girly drinks soon ?,1,"[0.006710808366222228, -0.10774004483784785, 0..."
4,<user> it hurts so bad . but i've gotten stronger,1,"[0.013535181601590975, 0.4399413518462622, 0.4..."
...,...,...,...
19995,candy painted kawasaki gsxr ( richiep industre...,-1,"[0.016361891768391347, 0.02647313786571369, 0...."
19996,hp compaq pavilion tx1000 notebook / laptop ba...,-1,"[0.16603694801428107, 0.19136587685746978, 0.3..."
19997,i feel like someone punched me in the stomach ...,-1,"[0.03435597569602863, 0.08540168828262823, 0.4..."
19998,"etnies malto - men's ( sz . 13.0 width - d , r...",-1,"[0.07815597366829553, 0.13977988798164792, 0.1..."


# Testing

In [12]:
# To format the testing data
def extract_tweet(tweet):
    """
    Returns the part of `tweet` that follows its first comma.

    Later commas are kept as part of the returned text. An IndexError is raised
    when `tweet` contains no comma at all.
    """
    pieces = tweet.split(",", 1)
    return pieces[1]

In [13]:
# Load the testing data
# Read the file as plain text, one record per non-empty line. A fixed-width reader
# infers column boundaries from the first rows only, so longer later lines could be
# cut or split; reading whole lines avoids that.
with open(data_path + 'test_data.txt', encoding='utf-8') as test_file:
    test_lines = [line.strip() for line in test_file if line.strip()]

# extract_tweet drops everything up to the first comma of each line.
test = pd.DataFrame(
    {'tweet': [extract_tweet(line) for line in test_lines]},
    # Ids start at 1 (format asked by AI Crowd); sized from the file rather than hard-coded.
    index=pd.RangeIndex(start=1, stop=len(test_lines) + 1, step=1),
)
test

Unnamed: 0,tweet
1,sea doo pro sea scooter ( sports with the port...
2,<user> shucks well i work all week so now i ca...
3,i cant stay away from bug thats my baby
4,<user> no ma'am ! ! ! lol im perfectly fine an...
5,"whenever i fall asleep watching the tv , i alw..."
...,...
9996,had a nice time w / my friend lastnite
9997,<user> no it's not ! please stop !
9998,not without my daughter ( dvd two-time oscar (...
9999,<user> have fun in class sweetcheeks


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
# Features: one mean-embedding vector per sampled tweet; targets: the +1 / -1 labels.
# Both are handed to the classifier as plain Python lists.
X = tweets_['mean_embed'].values.tolist()
y = tweets_['label'].values.tolist()

In [15]:
from sklearn import ensemble
from sklearn import datasets

# Gradient-boosted trees with default hyper-parameters, fitted on the sampled tweets
# (fit returns the fitted estimator itself).
clf = ensemble.GradientBoostingClassifier().fit(X, y)

In [16]:
# Making predictions: embed the test tweets the same way as the training tweets,
# then let the fitted classifier label them.
test_embeddings = test['tweet'].map(compute_mean)
test['mean_embed'] = test_embeddings
test['label'] = clf.predict(test_embeddings.values.tolist())

In [17]:
# Creating submission file
import csv
def create_csv_submission(ids, y_pred, name):
    """
    Writes the predictions to a csv submission file with the columns Id and Prediction.

    Arguments: ids (ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    Ids and predictions are paired in order and both are written as integers.
    """
    header = ['Id', 'Prediction']
    # Lazily built so that values are converted row by row while writing
    rows = ([int(sample_id), int(label)] for sample_id, label in zip(ids, y_pred))
    with open(name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(header)
        writer.writerows(rows)
            
# Write the predicted labels, keyed by the test frame's index, to the submission file.
create_csv_submission(test.index, test['label'].values, '../submission/submission.csv')