## Importing the useful libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import pbs
from pickle_vocab import *
from cooc import *

## Loading the data

In [2]:
def load_test_data(data_path):
    """Load the test tweets into a DataFrame indexed by tweet id.

    Every non-blank line of the file is expected to look like
    ``<id>,<tweet>``. Only the first comma separates the id from the text,
    so commas inside the tweet are preserved. The trailing newline of each
    tweet is kept as-is.

    Parameters
    ----------
    data_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, a single ``'Tweets'`` column, and the ids
        (as strings) as index.

    Raises
    ------
    ValueError
        If a non-blank line contains no comma.
    """
    tweets = []
    ids = []
    # The context manager closes the file even if parsing fails midway.
    with open(data_path) as f:
        for line in f:
            if not line.strip():
                # Skip blank lines (e.g. a trailing empty line) instead of
                # crashing on the unpacking below.
                continue
            tweet_id, tweet = line.split(',', 1)
            ids.append(tweet_id)
            tweets.append(tweet)

    return pd.DataFrame(tweets, ids, columns=['Tweets'])

test = load_test_data('Datasets/test_data.txt')
test.head()

Unnamed: 0,Tweets
1,sea doo pro sea scooter ( sports with the port...
2,<user> shucks well i work all week so now i ca...
3,i cant stay away from bug thats my baby\n
4,<user> no ma'am ! ! ! lol im perfectly fine an...
5,"whenever i fall asleep watching the tv , i alw..."


In [3]:
def load_train_data(data_path):
    """Load a file of training tweets, one tweet per line.

    Parameters
    ----------
    data_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, in a single ``'Tweets'`` column, with
        the default integer index. Lines keep their trailing newline.
    """
    # The context manager guarantees the file handle is released.
    with open(data_path) as f:
        lines = f.readlines()

    return pd.DataFrame(lines, columns=['Tweets'])

In [4]:
pos = load_train_data('Datasets/train_pos.txt')
pos.head()

Unnamed: 0,Tweets
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,""" <user> just put casper in a box ! "" looved t..."
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...


In [5]:
neg = load_train_data('Datasets/train_neg.txt')
neg.head()

Unnamed: 0,Tweets
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...
1,glad i dot have taks tomorrow ! ! #thankful #s...
2,1-3 vs celtics in the regular season = were fu...
3,<user> i could actually kill that girl i'm so ...
4,<user> <user> <user> i find that very hard to ...


In [6]:
# Label every tweet loaded from train_pos.txt with the class y = 1.
pos["y"]=1
pos.head()

Unnamed: 0,Tweets,y
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,""" <user> just put casper in a box ! "" looved t...",1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1


In [7]:
# Label every tweet loaded from train_neg.txt with the class y = -1.
neg["y"]=-1
neg.head()

Unnamed: 0,Tweets,y
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,-1
1,glad i dot have taks tomorrow ! ! #thankful #s...,-1
2,1-3 vs celtics in the regular season = were fu...,-1
3,<user> i could actually kill that girl i'm so ...,-1
4,<user> <user> <user> i find that very hard to ...,-1


In [8]:
# Stack the positive and negative frames; ignore_index renumbers the rows
# 0..n-1 so no separate reset_index step is needed.
train = pd.concat([pos, neg], ignore_index=True)
train.head()

Unnamed: 0,Tweets,y
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,""" <user> just put casper in a box ! "" looved t...",1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1


In [9]:
# Load the pickled vocabulary, a dict mapping each token to an integer index.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): `pickle` is not imported explicitly in this notebook; it
# presumably comes from one of the star imports — confirm.
with open('vocab.pkl','rb') as f:
    vocab = pickle.load(f)
vocab

{'<user>': 0,
 '!': 1,
 'i': 2,
 'the': 3,
 '.': 4,
 ',': 5,
 'to': 6,
 'you': 7,
 '(': 8,
 '<url>': 9,
 'a': 10,
 '...': 11,
 'and': 12,
 'my': 13,
 'me': 14,
 'of': 15,
 '?': 16,
 'is': 17,
 'for': 18,
 'in': 19,
 'it': 20,
 '"': 21,
 'this': 22,
 'so': 23,
 '-': 24,
 'with': 25,
 'on': 26,
 'that': 27,
 ')': 28,
 'be': 29,
 "i'm": 30,
 'have': 31,
 ':': 32,
 'but': 33,
 'just': 34,
 'rt': 35,
 'love': 36,
 'your': 37,
 'all': 38,
 'not': 39,
 'was': 40,
 'at': 41,
 'are': 42,
 '..': 43,
 'like': 44,
 '/': 45,
 'get': 46,
 'up': 47,
 'frame': 48,
 '&': 49,
 'lol': 50,
 'know': 51,
 'good': 52,
 'do': 53,
 'u': 54,
 'now': 55,
 'when': 56,
 'one': 57,
 'if': 58,
 'we': 59,
 'follow': 60,
 'no': 61,
 'can': 62,
 'what': 63,
 'go': 64,
 "don't": 65,
 'out': 66,
 'x': 67,
 'will': 68,
 'day': 69,
 "'": 70,
 'please': 71,
 'from': 72,
 'see': 73,
 'too': 74,
 'want': 75,
 'there': 76,
 'back': 77,
 "it's": 78,
 'today': 79,
 'about': 80,
 'really': 81,
 'how': 82,
 'got': 83,
 'thanks': 8

In [10]:
max(vocab.values())

21160

In [11]:
# Load the pickled co-occurrence matrix (displayed below as a sparse matrix
# in COOrdinate format).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('cooc.pkl','rb') as f:
    cooc = pickle.load(f)
cooc

<21160x21160 sparse matrix of type '<class 'numpy.int32'>'
	with 3184638 stored elements in COOrdinate format>

In [12]:
cooc.data

array([273434,  97020,  57576, ...,      2,      2,      8])

In [13]:
cooc.row

array([    0,     1,     2, ..., 12840, 14651, 21159])

In [14]:
cooc.col

array([    0,     0,     0, ..., 21159, 21159, 21159])

In [15]:
print(cooc)

  (0, 0)	273434
  (1, 0)	97020
  (2, 0)	57576
  (3, 0)	38886
  (4, 0)	47366
  (5, 0)	41118
  (6, 0)	41428
  (7, 0)	58492
  (8, 0)	4744
  (9, 0)	10364
  (10, 0)	31758
  (11, 0)	12300
  (12, 0)	27856
  (13, 0)	21426
  (14, 0)	22728
  (15, 0)	14316
  (16, 0)	29786
  (17, 0)	15902
  (18, 0)	20674
  (19, 0)	14742
  (20, 0)	21172
  (21, 0)	19938
  (22, 0)	8938
  (23, 0)	12654
  (24, 0)	6002
  :	:
  (208, 21159)	4
  (239, 21159)	2
  (250, 21159)	2
  (273, 21159)	2
  (314, 21159)	6
  (349, 21159)	2
  (444, 21159)	2
  (676, 21159)	2
  (693, 21159)	2
  (883, 21159)	2
  (1048, 21159)	2
  (1050, 21159)	2
  (1388, 21159)	2
  (1415, 21159)	2
  (2291, 21159)	2
  (2915, 21159)	2
  (3359, 21159)	2
  (6055, 21159)	2
  (7507, 21159)	2
  (7875, 21159)	2
  (7879, 21159)	2
  (9534, 21159)	2
  (12840, 21159)	2
  (14651, 21159)	2
  (21159, 21159)	8


In [16]:
# NOTE(review): toarray() materialises the full dense matrix in memory just
# for display; consider inspecting a small slice instead.
cooc.toarray()

array([[273434,  97020,  57576, ...,      0,     10,      4],
       [ 97020, 255078,  47124, ...,      0,      2,      0],
       [ 57576,  47124, 110506, ...,      0,      0,      4],
       ...,
       [     0,      0,      0, ...,      0,      0,      0],
       [    10,      2,      0, ...,      0,     10,      0],
       [     4,      0,      4, ...,      0,      0,      8]])

In [17]:
# The sparse matrix exposes its shape directly; no need to build the dense
# array just to read its dimensions.
cooc.shape

(21160, 21160)

In [18]:
we = np.load("embeddings.npy")
we

array([[ 0.08293404,  0.01490157,  0.00622929, ..., -0.16190348,
        -0.1846269 , -0.18189624],
       [ 0.12380148, -0.0421271 , -0.01440238, ..., -0.30058427,
        -0.22970339, -0.25452988],
       [ 0.09969781,  0.00634767, -0.05967447, ..., -0.30113178,
        -0.29646062, -0.36644036],
       ...,
       [-0.03857365, -2.05255734, -0.04303483, ..., -0.78177959,
         0.37153678, -0.62341192],
       [ 2.09123134, -1.04694662, -0.92195585, ...,  0.26456563,
         0.18419511, -0.29720033],
       [ 1.03783875, -0.6643759 , -0.19507841, ...,  0.31626424,
        -3.13279291,  1.45051465]])

In [19]:
we.shape

(21160, 20)

In [20]:
def representation(tweet, vocabulary=None, embeddings=None):
    """Represent a tweet as the mean of the embeddings of its known words.

    The tweet is split on whitespace. Words that are absent from the
    vocabulary, or whose vocabulary index has no row in the embedding
    matrix, are ignored and do not count towards the mean.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    vocabulary : dict, optional
        Mapping from word to row index in ``embeddings``. Defaults to the
        notebook-level ``vocab``.
    embeddings : numpy.ndarray, optional
        2-D array with one embedding per row. Defaults to the notebook-level
        ``we``.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``embeddings.shape[1]``. When no word of the
        tweet can be embedded, a zero vector is returned instead of dividing
        by zero.
    """
    # Fall back to the notebook-level objects only when nothing was passed,
    # so existing calls ``representation(tweet)`` keep working.
    vocabulary = vocab if vocabulary is None else vocabulary
    embeddings = we if embeddings is None else embeddings

    total = np.zeros(embeddings.shape[1], dtype=float)
    n_used = 0
    for word in tweet.split():
        if word not in vocabulary:
            continue
        try:
            vector = embeddings[vocabulary[word]]
        except IndexError:
            # The vocabulary can hold an index past the last embedding row
            # (its largest index equals the number of rows); report and skip.
            print("problem with " + word)
            continue
        total += vector
        n_used += 1

    if n_used == 0:
        # Nothing to average: return the zero vector rather than NaN.
        return total
    return total / n_used

In [21]:
# Store the output of representation() for each tweet in a new "w" column.
train["w"] = train["Tweets"].apply(representation)

problem with #10factsaboutme
problem with #10factsaboutme
problem with #10factsaboutme
problem with #10factsaboutme
problem with #10factsaboutme


In [22]:
# Names of the 20 feature columns: w0, w1, ..., w19.
col = ["w%d" % idx for idx in range(20)]
col

['w0',
 'w1',
 'w2',
 'w3',
 'w4',
 'w5',
 'w6',
 'w7',
 'w8',
 'w9',
 'w10',
 'w11',
 'w12',
 'w13',
 'w14',
 'w15',
 'w16',
 'w17',
 'w18',
 'w19']

In [23]:
# Expand the vector stored in "w" into one column per name listed in `col`.
train[col] = train["w"].apply(pd.Series)
# Drop the intermediate vector column and the raw text, leaving y and the
# feature columns.
# NOTE(review): these in-place drops make the cell non-rerunnable; a second
# run fails with KeyError because "w" no longer exists.
train.drop("w",axis=1,inplace=True)
train.drop("Tweets",axis=1,inplace=True)

In [24]:
train

Unnamed: 0,y,w0,w1,w2,w3,w4,w5,w6,w7,w8,...,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19
0,1,0.193815,0.038084,0.040004,-0.668191,0.548065,-0.373069,0.370133,0.348918,0.419950,...,-0.300119,0.397517,0.566418,0.117509,0.803911,0.124804,0.664562,-0.236155,-0.140885,-0.315508
1,1,0.164580,-0.173796,0.008294,-0.758810,0.600187,-0.400949,0.140147,0.408942,0.340775,...,-0.098460,0.442516,0.533248,0.247309,0.549468,0.173431,0.633311,-0.190100,-0.074907,-0.354327
2,1,0.174736,-0.009662,-0.141506,-0.549435,0.538021,-0.390385,0.205312,0.320152,0.582864,...,-0.017894,0.298703,0.484783,0.156315,0.592756,0.358672,0.746317,-0.339098,-0.251291,-0.313407
3,1,-0.053728,-0.128535,0.083332,-0.790976,0.578776,-0.068607,0.170223,0.189749,0.468706,...,-0.177187,0.137006,0.338732,0.324995,0.558550,0.114529,0.642133,-0.077130,-0.280407,-0.424682
4,1,-0.000336,-0.311349,0.063379,-0.652141,0.242091,-0.023745,0.237209,0.254238,0.225467,...,-0.140884,0.474569,0.570961,0.346219,0.603160,0.214515,0.475464,-0.274569,-0.285542,-0.216674
5,1,0.026594,-0.006907,-0.100526,-0.749362,0.524098,-0.346801,0.243778,0.348224,0.376137,...,-0.173890,0.358738,0.535558,0.141857,0.736740,0.082623,0.739355,-0.259015,-0.265909,-0.379111
6,1,0.090643,-0.048549,-0.010043,-0.657629,0.385867,-0.272697,0.067982,0.290498,0.352461,...,-0.208982,0.462297,0.350906,0.373653,0.659256,0.136557,0.673275,-0.289193,-0.106955,-0.293110
7,1,0.105990,0.324250,-0.020413,-0.368366,0.482513,-0.298971,-0.031015,0.283898,0.488445,...,-0.085626,0.093791,0.237290,0.314822,0.559136,0.289033,0.506103,-0.281387,-0.219288,-0.260018
8,1,0.057533,-0.066158,0.033396,-0.654281,0.382003,-0.349268,0.261607,0.506702,0.347005,...,0.090888,0.428261,0.358533,0.156962,0.712901,0.053821,0.711513,-0.304527,-0.313385,-0.482374
9,1,0.195404,0.123427,-0.582616,-0.908895,0.608877,-0.441935,0.092457,0.419204,0.437667,...,-0.341651,0.301421,-0.005913,-0.119228,0.481244,0.218687,0.595256,-0.038230,-0.192498,-0.554049


## Data Analysis

## Word embeddings matrix

### Generation

### Visualization

## Building the text classifier

### Training texts features

### Linear classifiers

### Predictions

## Output creation and visualization

### Creating the submission file

### Visualization of the results