# Machine Learning Project 2

In [1]:
# Imports
import pandas as pd
import numpy as np

---
## Data Loading

In this section we load the data for :
* positive tweets, label= `:)` ($1$ for classification) 
* negative tweets, label= `:(` ($-1$ for classification)

Full data is used below (1'250'000 tweets).

In [2]:
data_path = '../data/'

## Load full training sets
# positive
pos = pd.read_table(data_path+'train_pos_full.txt', sep='.\n', names=['tweet'], engine='python')
pos['label']=1
print(f"Loaded POS data, correctly interpreted 1-tweet-per-line fashion : {pos.shape[0]==1_250_000}")

# negative
neg = pd.read_table(data_path+'train_neg_full.txt', sep='.\n', names=['tweet'], engine='python')
neg['label']=-1
print(f"Loaded NEG data, correctly interpreted 1-tweet-per-line fashion : {neg.shape[0]==1_250_000}")

# Data sizes
print(f"Number of tweets : (POS) {pos.shape[0]} (NEG) {neg.shape[0]}\n")

# Merge datasets to get a complete training set
tweets = pos.append(neg)
tweets

Loaded POS data, correctly interpreted 1-tweet-per-line fashion : True
Loaded NEG data, correctly interpreted 1-tweet-per-line fashion : True
Number of tweets : (POS) 1250000 (NEG) 1250000



Unnamed: 0,tweet,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,""" <user> just put casper in a box ! "" looved t...",1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
1249995,im so sorry ! <user> & to <user> & <user> u gu...,-1
1249996,i can't find food coloring anywhere,-1
1249997,<user> same here ! ! but tort ! ! wonder why y...,-1
1249998,keyless entry remote fob clicker for 2005 buic...,-1


In [3]:
## Load word embeddings and vocabulary to compute word vectors of tweets

# Load word embeddings
embeddings = np.load(data_path + 'embeddings_full.npy')
print(f'Loaded word embeddings in structure of type {type(embeddings)}.')

# Loading vocab
words = pd.read_table(data_path + 'vocab_cut.txt', sep='.\n', names=['word'], engine='python', squeeze=True, na_values=np.nan)
print(f'Loaded word embeddings in structure of type {type(words)}.')

# Check that the vocabulary encompasses all embedded words
print(f'\nBoth the embeddings and the vocabulary are same length :  {len(embeddings)==words.shape[0]}')
print(f"Embeddings: {embeddings.shape}, vocab: {words.shape}")

## Clean the data

# Drop NaN values in words
nas = words.isna()
words.dropna(inplace=True)
embeddings = np.delete(embeddings, nas[nas].index.values, axis=0)
print(f'NA values were dropped in both tables: {len(embeddings)==words.shape[0]}')
print(f"Embeddings: {embeddings.shape}, vocab: {words.shape}")

# Index by words for faster index-for-word search
words = pd.DataFrame(data=words.index, index=words.values)
embeddings = pd.DataFrame(embeddings, index=words.index)

Loaded word embeddings in structure of type <class 'numpy.ndarray'>.
Loaded word embeddings in structure of type <class 'pandas.core.series.Series'>.

Both the embeddings and the vocabulary are same length :  True
Embeddings: (101298, 20), vocab: (101298,)
NA values were dropped in both tables: True
Embeddings: (101296, 20), vocab: (101296,)


In [4]:
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
<user>,0.000327,0.028452,0.016525,0.000613,0.005188,-0.002815,0.009533,-0.044277,0.009413,0.031187,0.010009,-0.002514,0.00792,0.036842,-0.012512,0.022829,-0.000567,-0.016734,-0.008299,-0.008789
!,-0.019161,0.027524,0.033832,0.012091,-0.027132,-0.012628,-0.000841,-0.03312,0.013693,-0.004513,0.016704,0.015588,-0.008836,0.035705,0.007875,0.015618,0.009505,-0.01182,-0.003584,-0.005041
i,0.019321,0.030986,0.014057,0.007218,-0.002602,-0.022695,-0.004233,-0.033344,-0.00147,0.013007,0.002135,-0.007511,0.003676,0.010762,-0.002978,-0.001809,0.010544,-0.023608,-0.014181,-0.004194
the,0.0024,0.02129,0.007768,-0.005454,-0.016949,-0.001816,-0.00895,-0.03733,-0.029912,-0.008472,0.005187,-0.016195,0.006317,-0.010477,-0.022874,-0.00161,-0.006108,-0.008685,-0.011349,0.00613
.,-0.002507,0.011608,0.027699,0.013425,-0.020171,0.008285,-0.012358,-0.045863,-0.002258,0.031533,-0.014051,0.008234,0.008522,0.012892,0.017243,-0.006924,0.004531,-0.021841,-0.005605,-0.02242


In [5]:
words.head()

Unnamed: 0,0
<user>,0
!,1
i,2
the,3
.,4


---
## Exploratory Data Analysis
In this part we analyse our data in order to optimize its information.

### Cleaning tags 
Here we explore the non-spoken tags present in the tweets and determine if they are relevant for our sentiment analysis.

In [6]:
import re
from collections import defaultdict

## We check if the tags are relevant information between both pos and neg cases

def count_HTML_tags(series) :
    """
    Returns stats about the HTML tags in the tweet series.
    Returns :
    dic (defaultdict) : dict of all tags occurences.
    count (int) : count of all tags."""
    dic = defaultdict(lambda:0)
    def a(k):
        dic[k]+=1
        return None
    series.apply(lambda s : [a(k) for k in re.findall('<\/*[a-zA-Z]+>', s)])
    count = series.str.count('<\/*[a-zA-Z]+>').sum()
    return dic, count

# We query stats about the tags
d_pos, n_pos = count_HTML_tags(pos['tweet'])
d_neg, n_neg = count_HTML_tags(neg['tweet'])
all_keys = set(d_pos.keys()) | set(d_neg.keys())

print(f"|{'KEY':14s}|{'POS':6s}|{'NEG':6s}|")
for k in all_keys : 
    print(f"|{k:14s}|{d_pos[k]:6d}|{d_neg[k]:6d}|")

print(f"\nPOS tweets contain {n_pos} ({(n_pos-n_neg)*100/n_neg:.2f}%) HTML tags.")
print(f"NEG tweets contain {n_neg} ({(n_neg-n_pos)*100/n_pos:.2f}%) HTML tags.")

|KEY           |POS   |NEG   |
|</strong>     |     0|     6|
|<g>           |     1|     0|
|<weirdarms>   |     1|     0|
|<impressive>  |     1|     0|
|</cfoutput>   |     0|     1|
|<gardenstuff> |     2|     0|
|</script>     |     0|     4|
|<c>           |     1|     2|
|<i>           |     0|    10|
|<sigh>        |     0|     3|
|<thing>       |     0|     1|
|<dynamic>     |     1|     0|
|<twinkle>     |     0|     1|
|<grunt>       |     1|     0|
|<strong>      |     0|     6|
|</em>         |     0|     2|
|<mikel>       |     1|     0|
|<p>           |     0|    16|
|<hahahahhahaha>|     0|     1|
|<cfoutput>    |     0|     1|
|<summary>     |     1|     0|
|<content>     |     0|     1|
|<atomic>      |     0|     1|
|</del>        |     0|     1|
|<haha>        |     0|     1|
|<script>      |     0|     2|
|<b>           |     1|    26|
|</a>          |     1|     7|
|<del>         |     0|     1|
|<popcorn>     |     1|     0|
|<calc>        |     1|     0|
|<iostr

**Note**: Although the difference in number of tags is not significant. The distribution of them is quite significant (i.e. for tags `<url>` and `<user>`). Thus we choose to leave the tags as part of the tweet. **THIS COULD BE REVIEWED TO IMPROVE PERF**

In [7]:
# Clean the HTML tags from the tweets
## CHANGE RETURN VAR IF RELEVANT

def clean_HTML_tags(series) :
    return series.str.replace('<\/*[a-zA-Z]+>', '', regex=True)

t = clean_HTML_tags(pos['tweet'])
t2 = clean_HTML_tags(neg['tweet'])

---
## Training
In this part we train the models on our data.
Thus, we perform
* a resampling of our data to work locally on a smaller set.
* the creation of word vectors for our tweets.
* a train-test-split to locally estimate the model's performance.
* cross-validation trainin on a series of models :
    * Linear Regression
    * Logistic Regression
    * SVM
    * Neural Networks

### Resampling the Training set
Using only a set of 200'000 tweets locally to decrease computation time.

In [8]:
from sklearn.utils import resample

# Take only 100'000 samples from both classes for faster computation
pos_ = resample(pos, n_samples=100_000, replace=False)
neg_ = resample(neg, n_samples=100_000, replace=False)
tweets_ = pos_.append(neg_)

tweets_

Unnamed: 0,tweet,label
274596,<user> haha then zaxbys . all the way . then b...,1
851053,"today was a crazy day , but i'm glad it made m...",1
55221,<user> i have follow you back lol,1
730864,! ! ! <user> pressing the middle of your amala...,1
448240,"the little things that he does , like call me ...",1
...,...,...
1053471,my mind is running everywhere right noow !,-1
16725,i wanted towork go tanning today (,-1
438297,<user> are you going ? got no transport here t...,-1
617168,<user> when ate you going to come visit me ?,-1


### Word vectors creation

In [9]:
# Recall our data
print(f"Embeddings for {embeddings.shape[0]:,.1f} with {embeddings.shape[1]} features for each word.") 
print(f'Embeddings shape : {embeddings.shape}.\n')
embeddings.head(2)

Embeddings for 101,296.0 with 20 features for each word.
Embeddings shape : (101296, 20).



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
<user>,0.000327,0.028452,0.016525,0.000613,0.005188,-0.002815,0.009533,-0.044277,0.009413,0.031187,0.010009,-0.002514,0.00792,0.036842,-0.012512,0.022829,-0.000567,-0.016734,-0.008299,-0.008789
!,-0.019161,0.027524,0.033832,0.012091,-0.027132,-0.012628,-0.000841,-0.03312,0.013693,-0.004513,0.016704,0.015588,-0.008836,0.035705,0.007875,0.015618,0.009505,-0.01182,-0.003584,-0.005041


In [10]:
def word_vector(tweet):
    """
    Creates the feature vector corresponding to the tweet.
    To do so, computes the mean of the word embeddings corresponding to the vocabulary words in the tweet.
    
    Parameters 
    ----------
    tweet : str
        Input tweet from which the word vector is created.
    """
    split_by_words = tweet.split()
    embed_list = []
    
    # Get vocab word embeddings
    for w in split_by_words:
        if w in words.index :
            embed_list.append(  embeddings.loc[w].values  )
        
    # Compute mean if any vocab word was found
    mean = np.zeros(20) if not embed_list else np.mean(embed_list, axis=0) 
    return mean.tolist()

In [11]:
%%time

# Create word vectors for the local dataset
tweets_['mean_embed']= tweets_['tweet'].map(word_vector)

Wall time: 3min 35s


In [12]:
tweets_

Unnamed: 0,tweet,label,mean_embed
274596,<user> haha then zaxbys . all the way . then b...,1,"[-0.06881217522765033, 0.45840950946147213, 0...."
851053,"today was a crazy day , but i'm glad it made m...",1,"[0.08794695168408025, 0.6880247076430547, 0.05..."
55221,<user> i have follow you back lol,1,"[0.07642400683817782, 0.5859221483221717, 0.05..."
730864,! ! ! <user> pressing the middle of your amala...,1,"[-0.02727947819421687, 0.3734735041200142, -0...."
448240,"the little things that he does , like call me ...",1,"[0.09163293220188184, 0.6439685696521983, 0.04..."
...,...,...,...
1053471,my mind is running everywhere right noow !,-1,"[0.31459488060319407, 0.49239944436094624, 0.0..."
16725,i wanted towork go tanning today (,-1,"[0.04906116757469841, 0.7395756877314188, 0.10..."
438297,<user> are you going ? got no transport here t...,-1,"[0.2351083857852727, 0.6235748976143777, 0.097..."
617168,<user> when ate you going to come visit me ?,-1,"[0.06290765244354832, 0.5247022462756916, 0.05..."


### Train-Test Split for our local dataset
We divide our local training set into a 75% training set and a 25% local testing set.

In [13]:
from sklearn.model_selection import train_test_split

train_, test_ = train_test_split(tweets_, test_size=0.25)
print(f"Local training set size : {train_.shape}.")
print(f"Local testing set size : {test_.shape}.")

Local training set size : (150000, 3).
Local testing set size : (50000, 3).


In [14]:
# Create features and label datasets
xtrain_, ytrain_ = train_.mean_embed.copy().tolist(), train_.label.copy().to_list()
xtest_, ytest_ = test_.mean_embed.copy().tolist(), test_.label.copy().tolist()

xtrain_[:1]

[[0.1209916583854169,
  0.5500444768004302,
  0.0797898461402203,
  0.21527290357057108,
  -0.28439339438026656,
  0.03354627175028572,
  -0.07695506751693128,
  -0.3607828787770581,
  -0.16231577244144532,
  -0.12736482483874875,
  -0.1886278613049773,
  0.24260564368495885,
  -0.22745608148409577,
  0.10051371651498053,
  -0.1352814523158706,
  -0.1419014185301333,
  -0.03755463367676509,
  -0.020084670432222437,
  -0.31973405847606373,
  0.0878311622429423]]

### Pre-Processing
Here we compute our pre-processing on features.

In [15]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

def preprocess(X) :
    x=X.copy()
    
    # Standardize data
    standardizer=StandardScaler().fit_transform(x)
    
    # TODO Polynomial features and interactions
    
    # other data preprocessing
    
    return x

In [16]:
# Pre-process training set

xtrain_ = preprocess(xtrain_)
xtest_ = preprocess(xtest_)

### Models Training
* Linear Regression
* Logistic Regression
* SVM
* Neural Networks

In [17]:
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

classifiers = dict()

In [18]:
%%time 
# Linear Regression
name = 'Linear Model'

linear_classifier = LinearRegression().fit(xtrain_, ytrain_)
score = linear_classifier.score(xtest_, ytest_)

classifiers[name] = (linear_classifier, score)

print(f"R2 score for {name} is {score}.")

R2 score for Linear Model is 0.014346360335973252.
Wall time: 831 ms


In [19]:
%%time 
#### BASELINE : Logistic Regression

# Logistic Regression
name = 'Logistic Regression'

logistic_classifier = LogisticRegression().fit(xtrain_, ytrain_)
score = logistic_classifier.score(xtest_, ytest_)

classifiers[name] = (logistic_classifier, score)

print(f"R2 score for {name} is {score}.")

# Logistic Regression using Crossvalidation
name = 'Logistic Regression using cross-validation'

logisticCV_classifier = LogisticRegressionCV().fit(xtrain_, ytrain_)
score = logisticCV_classifier.score(xtest_, ytest_)

classifiers[name] = (logisticCV_classifier, score)

print(f"R2 score for Logistic Regression model using cross-validation is {score}.")

R2 score for Logistic Regression is 0.55076.
R2 score for Logistic Regression model using cross-validation is 0.5507.
Wall time: 6.46 s


In [20]:
%%time 
# Support Vector Machines
name = 'SVM classifier'

SVM_classifier = LinearSVC().fit(xtrain_, ytrain_)
score = SVM_classifier.score(xtest_, ytest_)

classifiers[name] = (SVM_classifier, score)

print(f"R2 score for {name} model is {score}.")

R2 score for SVM classifier model is 0.55092.
Wall time: 17.1 s


In [21]:
%%time
# Neural Network
name = 'Neural Network'

nn_classifier = MLPClassifier().fit(xtrain_,ytrain_)
score = nn_classifier.score(xtest_,ytest_)

classifiers[name] = (nn_classifier, score)

print(f"R2 score for {name} classifier is {score}.")

R2 score for Neural Network classifier is 0.58108.
Wall time: 1min 48s


---
## Testing
This section is dedicated to using the previous classifiers to predict the labels of the provided testing set.

In [22]:
# To format the testing data
def extract_tweet(tweet):
    return tweet.split(",", 1)[1]

In [23]:
## Loading data

# Load the testing data
test = pd.read_fwf(data_path+ 'test_data.txt', sep="\n", header=None)
test = test.rename(columns={0:'tweet', 1:'na1', 2:'na2'})

# Reformating it for submission
test.index = test.index+1 # Format asked by AI Crowd
test = test['tweet'].map(extract_tweet).to_frame()

test

Unnamed: 0,tweet
1,sea doo pro sea scooter ( sports with the port...
2,<user> shucks well i work all week so now i ca...
3,i cant stay away from bug thats my baby
4,<user> no ma'am ! ! ! lol im perfectly fine an...
5,"whenever i fall asleep watching the tv , i alw..."
...,...
9996,had a nice time w / my friend lastnite
9997,<user> no it's not ! please stop !
9998,not without my daughter ( dvd two-time oscar (...
9999,<user> have fun in class sweetcheeks


In [24]:
## Preparing data

# Create word vectors for tweets
test['mean_embed'] = test['tweet'].map(word_vector)

# Preprocess test data
xtest = preprocess(test.mean_embed.copy().tolist())

xtest[:1]

[[0.06600514685663708,
  0.582784556899372,
  -0.04151115815776958,
  0.13666445264367844,
  -0.1977931317665288,
  -0.016802999768011986,
  -0.038712791935644156,
  -0.2545744725436583,
  0.029000791532073186,
  0.06613123578801591,
  -0.14071631816239816,
  0.147312012718148,
  -0.2390666239629726,
  0.08578977417972455,
  -0.0034186956612259254,
  -0.052978385981814986,
  0.16010788866987907,
  -0.07031762344866747,
  -0.2815485402300905,
  0.16404086032326892]]

In [25]:
# Recalling classifiers 
# stored in format : 'classifier name'=(classifier, R2 score) 

print(f"Models computed so far are the following.\n ")
print(f"{'Classifier':50s} | {'R2 Score':20s}")
print(f"-----------------------------------------------------------------")
for k,v in classifiers.items() :
    print(f"{k:50s} | {v[1]:10.10f}")
print('\n')

Models computed so far are the following.
 
Classifier                                         | R2 Score            
-----------------------------------------------------------------
Linear Model                                       | 0.0143463603
Logistic Regression                                | 0.5507600000
Logistic Regression using cross-validation         | 0.5507000000
SVM classifier                                     | 0.5509200000
Neural Network                                     | 0.5810800000




In [26]:
# Making predictions
model = classifiers['Neural Network'][0]

predictions = model.predict(xtest)

In [27]:
# Creating submission file
import csv
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})
            
create_csv_submission(test.index, predictions, '../submission/submission.csv')