# Project 2- Text Classification

### Classification using Word-Vectors

For building a good text classifier, it is crucial to find a good feature representation of the input text. Here we will start by using the word vectors (word embeddings) of each word in the given tweet. For simplicity of a first baseline, we will construct the feature representation of the entire text by simply averaging the word vectors.

In [125]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

### Constructing the features

In [10]:
#Load original training sets
train_neg = pd.read_table("train_neg.txt", names = {'Tweet'}, 'r')
train_pos = pd.read_table("train_pos.txt", names = {'Tweet'})

#Use 1 for positive sentiment, 0 for negative. Shouldn't forget to map 0 to -1 in the submission.
y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))

x_train, x_test, y_train, y_test = train_test_split(np.concatenate((train_pos, train_neg)), y, test_size=0.2)


#Import the GloVe word embeddings
glove = np.load("embeddings.npy")

SyntaxError: positional argument follows keyword argument (<ipython-input-10-4705eb26b2ee>, line 2)

In [3]:
glove

array([[ -3.81759674e-01,   5.19231251e-01,   2.54549441e-01, ...,
          1.91792831e-01,   3.56942168e-01,   9.28574409e-02],
       [ -5.26154337e-01,   7.10427748e-01,   2.98945462e-01, ...,
          2.31704323e-01,   4.97874558e-01,   9.60398164e-02],
       [ -5.87862712e-01,   8.23010640e-01,   3.20501774e-01, ...,
          2.45922125e-01,   5.09756553e-01,   1.24753998e-01],
       ..., 
       [  8.11289821e-01,   1.05897281e-01,  -1.16428044e+00, ...,
          8.48653673e-01,   1.20359144e+00,   5.19036955e-01],
       [ -2.31821282e+00,   5.29202334e-02,   4.51176901e-01, ...,
         -4.95419071e-01,  -5.60939325e-01,  -4.07276411e-01],
       [ -7.72139569e-01,   1.18280978e+00,   1.52951199e-03, ...,
          1.79640052e-01,  -1.46539920e+00,   4.93587224e-01]])

In [31]:
#Assumption : the value for each word in the dictionnary maps to the row in the gloVe table
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)


str

In [157]:
def parse(text):
    text = [z.lower().replace('\n','').split() for z in text]
    return text

#Build word vector for training set by using the average value of all word vectors in the tweet, then scale
#vocab.get() will take a string, so we would better convert the train set to str first.
def buildTweetVector(text):
    vec = np.zeros(20).reshape((1, 20))
    count = 0.
    text=parse(text)
    for word in text[0]:
        if word in vocab.keys():
            try:
                vec += glove[vocab.get(word)].reshape((1, 20))
                count += 1.
            except KeyError:
                continue
        #else :
         #   count += 1.
    if count != 0:
        vec /= count
    return vec

In [158]:
train_vecs = np.concatenate([buildTweetVector(z) for z in x_train])
train_vecs = scale(train_vecs)

test_vecs = np.concatenate([buildTweetVector(z) for z in x_test])
test_vecs = scale(test_vecs)

In [159]:
train_vecs

array([[-0.8114704 ,  0.91386584,  0.33460335, ...,  0.45619698,
         0.85179866,  0.2883343 ],
       [-0.33797025,  0.71699241,  1.14368106, ...,  0.15692889,
         0.47306032,  0.60562117],
       [-0.22338584,  1.2202286 ,  1.89466914, ...,  0.6236208 ,
         0.73195409, -0.56194588],
       ..., 
       [-0.90233667,  0.69083005,  0.65631705, ...,  0.39342529,
         0.39453101,  0.13883045],
       [ 1.08129493,  0.85437063,  0.23899664, ...,  0.23389847,
        -0.09565342,  0.67764291],
       [-0.53682688,  0.20073788, -0.46839319, ..., -1.46175691,
        -1.09789763,  0.69226552]])

In [163]:
#Try classification using stochastic logistic regression

model = SGDClassifier(loss='log')
model.fit(train_vecs, y_train)

model.score(test_vecs, y_test)

0.5943290856475606