# Machine Learning Project 2

In [1]:
# Imports
import pandas as pd
import numpy as np

## Data Loading

In [2]:
data_path = 'data/'

# Load training set.
# Positive tweets get label +1, negative tweets get label -1. The row-count checks
# printed below confirm that every line of each file became exactly one row.
pos = pd.read_table(
    data_path + 'train_pos_full.txt',
    sep='.\n',
    names=['tweet'],
    engine='python',
).assign(label=1)
print(f"Loaded POS data, correctly interpreted 1-tweet-per-line fashion : {pos.shape[0]==1_250_000}")

neg = pd.read_table(
    data_path + 'train_neg_full.txt',
    sep='.\n',
    names=['tweet'],
    engine='python',
).assign(label=-1)
print(f"Loaded NEG data, correctly interpreted 1-tweet-per-line fashion : {neg.shape[0]==1_250_000}")

print(f"Data sizes : (POS) {pos.shape[0]} (NEG) {neg.shape[0]}\n")

Loaded POS data, correctly interpreted 1-tweet-per-line fashion : True
Loaded NEG data, correctly interpreted 1-tweet-per-line fashion : True
Data sizes : (POS) 1250000 (NEG) 1250000



In [3]:
# Stack the positive and negative tweets into a single frame.
# The previous outer merge joined on both 'tweet' and 'label'; since the labels of
# the two frames differ (+1 vs -1) no rows could ever match, so it was only a slow
# concatenation whose row order depends on the merge sort behaviour of the
# installed pandas version. `concat` states the intent and keeps POS rows first.
tweets = pd.concat([pos, neg], ignore_index=True)
tweets

Unnamed: 0,tweet,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,""" <user> just put casper in a box ! "" looved t...",1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
2499995,im so sorry ! <user> & to <user> & <user> u gu...,-1
2499996,i can't find food coloring anywhere,-1
2499997,<user> same here ! ! but tort ! ! wonder why y...,-1
2499998,keyless entry remote fob clicker for 2005 buic...,-1


In [4]:
# Load word embeddings
embeddings = np.load('embeddings.npy')

# Loading vocab for verification.
# The single 'word' column is selected explicitly instead of passing `squeeze=True`,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
words = pd.read_table('vocab_cut.txt', sep='.\n', names=['word'], engine='python', na_values=np.nan)['word']

print(f'Both the embeddings and the vocabulary are same size :  {len(embeddings)==words.shape[0]}')
print(f"Embeddings: {embeddings.shape}, vocab: {words.shape}")

# Drop NaN values: remove the vocabulary entries that were parsed as NaN together
# with the embedding rows at the same positions.
nas = words.isna()
embeddings = pd.DataFrame(np.delete(embeddings, np.flatnonzero(nas.to_numpy()), axis=0))
# Re-number the surviving words so that label i in `words` and label i in
# `embeddings` refer to the same entry (the embeddings frame gets a fresh 0..n-1
# index, so keeping the gapped index here would misalign any label-based lookup).
words = words[~nas].reset_index(drop=True)
print(f'NA values were dropped in both tables: {len(embeddings)==words.shape[0]}')
print(f"Embeddings: {embeddings.shape}, vocab: {words.shape}")

Both the embeddings and the vocabulary are same size :  True
Embeddings: (101298, 20), vocab: (101298,)
NA values were dropped in both tables: True
Embeddings: (101296, 20), vocab: (101296,)


In [5]:
# Wrap the vocabulary loaded in the previous cell in a DataFrame.
# NOTE(review): no later cell visible here reads `words` directly — confirm this
# conversion is still needed.
words = pd.DataFrame(words)

## Preprocessing

### Cleaning tags 

In [6]:
# Preprocessing by 
## removing HTML tags
import re
from collections import defaultdict

# We check if the tags are relevant information between both pos and neg cases
def count_HTML_tags(series) :
    """
    Returns stats about the HTML-like tags (`<tag>` or `</tag>`) in the tweet series.

    Parameters :
    series (iterable of str) : the tweets to scan; entries that are not strings
        (e.g. missing values) are skipped.

    Returns :
    dic (defaultdict) : maps each distinct tag to its number of occurrences;
        looking up an unseen tag yields 0.
    count (int) : total number of tag occurrences over the whole series.
    """
    # Raw string: the pattern contains a backslash, which is an invalid escape
    # sequence in a regular string literal.
    tag_pattern = re.compile(r'<\/*[a-zA-Z]+>')
    dic = defaultdict(int)
    # Single pass over the data; the total is derived from the per-tag counts
    # instead of scanning every tweet a second time.
    for text in series:
        if not isinstance(text, str):
            continue
        for tag in tag_pattern.findall(text):
            dic[tag] += 1
    count = sum(dic.values())
    return dic, count

# We query stats about the tags
d_pos, n_pos = count_HTML_tags(pos['tweet'])
d_neg, n_neg = count_HTML_tags(neg['tweet'])
# Sorted so the table comes out in the same order on every run; the iteration
# order of a plain set of strings changes from one kernel start to the next.
all_keys = sorted(set(d_pos.keys()) | set(d_neg.keys()))

print(f"|{'KEY':14s}|{'POS':6s}|{'NEG':6s}|")
for k in all_keys :
    # .get() reads the count without inserting the missing tags into the dicts.
    print(f"|{k:14s}|{d_pos.get(k, 0):6d}|{d_neg.get(k, 0):6d}|")

# The percentage in parentheses is the relative difference to the other class.
print(f"\nPOS tweets contain {n_pos} ({(n_pos-n_neg)*100/n_neg:.2f}%) HTML tags.")
print(f"NEG tweets contain {n_neg} ({(n_neg-n_pos)*100/n_pos:.2f}%) HTML tags.")

|KEY           |POS   |NEG   |
|</cfoutput>   |     0|     1|
|<content>     |     0|     1|
|</popcorn>    |     1|     0|
|<script>      |     0|     2|
|<weirdarms>   |     1|     0|
|<waves>       |     1|     0|
|</body>       |     0|     1|
|</moan>       |     0|     1|
|<grunt>       |     1|     0|
|<mournfully>  |     0|     1|
|<time>        |     1|     0|
|<impressive>  |     1|     0|
|<c>           |     1|     2|
|<moan>        |     0|     1|
|<p>           |     0|    16|
|<ages>        |     0|     1|
|<ht>          |     0|     1|
|<w>           |     0|     1|
|<g>           |     1|     0|
|<laugh>       |     1|     0|
|<blink>       |     0|     1|
|<calc>        |     1|     0|
|<update>      |     0|     2|
|<name>        |     0|     1|
|<outstanding> |     1|     0|
|</em>         |     0|     2|
|<likewise>    |     1|     0|
|</html>       |     0|     1|
|<justkiddin>  |     0|     1|
|<iostream>    |     0|     1|
|<ummm>        |     0|     1|
|</span>

**Note**: Although the difference in the total number of tags between the two classes is not significant, the distribution of individual tags differs considerably (e.g. for the tags `<url>` and `<user>`). We therefore choose to leave the tags as part of the tweet. **THIS COULD BE REVIEWED TO IMPROVE PERF**

In [7]:
# Clean the HTML tags from the tweets
## CHANGE RETURN VAR IF RELEVANT

def clean_HTML_tags(series) :
    """
    Returns a copy of the tweet series with every `<tag>` / `</tag>` token removed.

    Parameters :
    series (pd.Series of str) : the tweets to clean.

    Returns :
    pd.Series : same index as the input; the input itself is not modified.
    """
    # Raw string: the pattern contains a backslash, which is an invalid escape
    # sequence in a regular string literal.
    return series.str.replace(r'<\/*[a-zA-Z]+>', '', regex=True)

# Tag-free copies of the positive (`t`) and negative (`t2`) tweets.
# NOTE(review): neither `t` nor `t2` is read by any later cell visible here, and
# `pos` / `neg` keep their tags — confirm whether the cleaned text should be used.
t = clean_HTML_tags(pos['tweet'])
t2 = clean_HTML_tags(neg['tweet'])

# Training

In [8]:
# Lazily built word -> embedding-vector lookup; re-running this cell resets it.
_word_vectors_cache = None


def _default_word_vectors():
    """Build (once) a dict mapping each vocabulary word to its embedding row."""
    global _word_vectors_cache
    if _word_vectors_cache is None:
        # `words` may be the raw Series or the one-column DataFrame made from it.
        vocab_words = words['word'] if isinstance(words, pd.DataFrame) else words
        vectors = np.asarray(embeddings)
        if len(vocab_words) != len(vectors):
            raise ValueError(
                f"vocabulary ({len(vocab_words)}) and embeddings ({len(vectors)}) differ in length"
            )
        # Pair by position: the loading cell drops the same rows from both tables,
        # so the i-th word belongs to the i-th embedding row.
        _word_vectors_cache = dict(zip(vocab_words, vectors))
    return _word_vectors_cache


def compute_mean(tweet, word_vectors=None, dim=20):
    """
    Returns the mean embedding vector of the known words of a tweet.

    The previous version looked words up through a `vocab` table that no visible
    cell defines and swallowed every exception, so each tweet silently ended up
    as the all-zero vector. Only unknown words are skipped now; any other problem
    is raised.

    Parameters :
    tweet (str) : the tweet text; it is split on whitespace.
    word_vectors (mapping, optional) : word -> 1-D vector. Defaults to a lookup
        built from the notebook's `words` and `embeddings` tables.
    dim (int) : length of the zero vector returned when no word is known.

    Returns :
    np.ndarray : element-wise mean of the vectors of the known words, or
        `np.zeros(dim)` when the tweet contains none.
    """
    if word_vectors is None:
        word_vectors = _default_word_vectors()
    embed_list = [word_vectors[token] for token in tweet.split() if token in word_vectors]
    if not embed_list:
        return np.zeros(dim)
    return np.mean(embed_list, axis=0)

In [9]:
# Take only 10'000 samples per class for faster computation
from sklearn.utils import resample

# Fixed seed so the subsample (and every result derived from it) is the same on
# each run. One shared generator is passed to both calls so the POS and NEG draws
# do not pick identical row positions.
sample_rng = np.random.RandomState(0)
pos_ = resample(pos, n_samples=10000, replace=False, random_state=sample_rng)
neg_ = resample(neg, n_samples=10000, replace=False, random_state=sample_rng)
# Plain concatenation: an outer merge on ('tweet', 'label') can never match a POS
# row with a NEG row because their labels differ, so it only added cost.
tweets_ = pd.concat([pos_, neg_], ignore_index=True)
tweets_['mean_embed']= tweets_['tweet'].map(compute_mean)

# Testing

In [10]:
# To format the testing data
def extract_tweet(tweet):
    """Return everything after the first comma of a raw test line.

    Raises IndexError when the line contains no comma.
    """
    pieces = tweet.split(",", maxsplit=1)
    return pieces[1]

In [11]:
# Load the testing data
# The file is read line by line: `extract_tweet` expects "<id>,<tweet>" text, which
# is not a fixed-width table. `read_fwf` infers column boundaries from the first
# rows only, so it can truncate or split tweets that are longer than those rows.
with open(data_path + 'test_data.txt', encoding='utf-8') as test_file:
    test_lines = [line.strip() for line in test_file if line.strip()]

test = pd.DataFrame(
    {'tweet': [extract_tweet(line) for line in test_lines]},
    # 1-based ids sized from the data instead of a hard-coded 10'000 rows
    index=pd.RangeIndex(start=1, stop=len(test_lines) + 1, step=1),
)
test

Unnamed: 0,tweet
1,sea doo pro sea scooter ( sports with the port...
2,<user> shucks well i work all week so now i ca...
3,i cant stay away from bug thats my baby
4,<user> no ma'am ! ! ! lol im perfectly fine an...
5,"whenever i fall asleep watching the tv , i alw..."
...,...
9996,had a nice time w / my friend lastnite
9997,<user> no it's not ! please stop !
9998,not without my daughter ( dvd two-time oscar (...
9999,<user> have fun in class sweetcheeks


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
# Features (one mean-embedding vector per tweet) and labels as plain Python lists
X = tweets_['mean_embed'].tolist()
y = tweets_['label'].tolist()

In [13]:
from sklearn import ensemble
from sklearn import datasets

# Fixed seed so that refitting on the same data gives the same model
clf = ensemble.GradientBoostingClassifier(random_state=0)
clf = clf.fit(X, y)

In [14]:
# Making predictions: embed every test tweet, then let the classifier label it
test = test.assign(mean_embed=test['tweet'].map(compute_mean))
test = test.assign(label=clf.predict(test['mean_embed'].tolist()))

In [15]:
# Creating submission file
import csv
def create_csv_submission(ids, y_pred, name):
    """
    Writes the predictions to a csv file in the format expected for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    Both ids and labels are converted to int before being written; the file
    starts with an 'Id,Prediction' header row.
    """
    header = ['Id', 'Prediction']
    with open(name, 'w', newline='') as out_file:
        sink = csv.writer(out_file, delimiter=",")
        sink.writerow(header)
        for sample_id, prediction in zip(ids, y_pred):
            sink.writerow([int(sample_id), int(prediction)])
            
create_csv_submission(
    ids=test.index,
    y_pred=test['label'].to_numpy(),
    name='submission.csv',
)