## Creating feature spaces

In [1]:
##############
# Import libs
##############

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import pickle

import warnings
warnings.filterwarnings("ignore")

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"  # handle multiple outputs

In [2]:
###########
# Settings
###########

# Data load
with open('data/df_processed.pkl', 'rb') as f:
    df = pickle.load(f)

In [3]:
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, caus, offer, wheelc..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesti]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguid, societi, motiv]"


### Count vectorizer

In [4]:
count_vectorizer = CountVectorizer(max_df=.9, max_features=1000, stop_words='english')
values = count_vectorizer.fit_transform(df['tweet_stemmed'].astype('str'))

feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
count_vectorizer = CountVectorizer(max_df=.9, max_features=1000, stop_words='english')
values = count_vectorizer.fit_transform(df['tweet_lemmatized'].astype('str'))

feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF IDF vectorizer

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_df=.9, max_features=1000, stop_words='english')
values = tfidf_vectorizer.fit_transform(df['tweet_stemmed'].astype('str'))

feature_names = tfidf_vectorizer.get_feature_names()
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
tfidf_vectorizer = TfidfVectorizer(max_df=.9, max_features=1000, stop_words='english')
values = tfidf_vectorizer.fit_transform(df['tweet_lemmatized'].astype('str'))

feature_names = tfidf_vectorizer.get_feature_names()
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec

In [8]:
modelW2V = Word2Vec(sentences=df['tweet_token'],
                    size=200,
                    window=5,
                    min_count=2, # min_count=1
                    sg=1,
                    hs=0,
                    negative=10,
                    workers=32,
                    seed=34)

In [9]:
modelW2V.train(df['tweet_token'],
               total_examples=len(df['tweet_token']),
               epochs=20)

(9684805, 11615940)

In [None]:

# (8990699, 11615940)

In [10]:
modelW2V.save('w2v_tweet.model')

In [10]:
modelW2V.most_similar(positive=['dinner'])

[('bolognese', 0.5805073380470276),
 ('sleepnumber', 0.5727723836898804),
 ('marinabaysands', 0.5642914772033691),
 ('eastvillage', 0.5617031455039978),
 ('spaghetti', 0.5598551630973816),
 ('saturdaynightout', 0.55781090259552),
 ('fixings', 0.5556278228759766),
 ('thegafford', 0.5551360249519348),
 ('roamingaround', 0.5543537139892578),
 ('teammatte', 0.5505848526954651)]

In [11]:
modelW2V.most_similar(positive=['trump'])

[('donald', 0.6029958128929138),
 ('trumptrain', 0.5573083162307739),
 ('suppoer', 0.5506103038787842),
 ('unfit', 0.5495684742927551),
 ('melo', 0.5369281768798828),
 ('dumptrump', 0.5304207801818848),
 ('phony', 0.5282039642333984),
 ('conman', 0.5272598266601562),
 ('trumpleaks', 0.5270400047302246),
 ('cuck', 0.523677408695221)]

**Word Vectors**

In [13]:
# Look for vectorized word
modelW2V['food']  # modelW2V.wv['food']

array([-5.79978406e-01, -1.23044527e+00, -2.08441928e-01,  3.32042009e-01,
        8.98821056e-01,  3.67579423e-02,  9.27956309e-03,  3.01610798e-01,
        3.74299645e-01,  1.27605379e-01,  4.07915592e-01,  3.02949637e-01,
       -1.12242484e+00,  6.89602971e-01, -8.16020593e-02,  4.55297023e-01,
       -4.76921618e-01, -2.52067149e-01,  6.71057820e-01,  2.28192300e-01,
        3.53904486e-01, -6.15709662e-01,  6.85080662e-02,  8.53025258e-01,
        5.01547992e-01, -2.43392155e-01, -6.87380731e-01, -4.03361589e-01,
        5.60569227e-01, -3.58057558e-04, -2.19178542e-01, -8.07761133e-01,
        3.46050024e-01,  6.75301552e-01, -8.88870433e-02,  4.01456952e-01,
        4.99923587e-01,  3.39778990e-01, -2.36649498e-01, -3.55975091e-01,
        5.35666496e-02, -7.26230741e-01,  1.00696743e+00,  1.86540172e-01,
       -5.27336538e-01,  2.53938884e-01,  2.63197392e-01, -1.09478697e-01,
        1.57659352e-01, -4.08074111e-01,  1.14112273e-01, -2.39075035e-01,
       -1.51180074e-01, -

**Tweet sentence vectorizing**

In [72]:

vec_size = 200
tweet_vec = np.zeros((len(df), vec_size))
mean_vec = []

for n, item in df.iterrows():
    vec = 0
    target_field = item['tweet_token']   
    
    for word in target_field:
        
        # Model limitations handler:
        # Skips words that model doesn't know
        # Or use train_param: min_count=1
        try:
            vec += modelW2V[word].reshape((1, vec_size))
        except:
            continue
    
    try:
        mean_vec.append(np.mean(vec, axis=0))
    except:
        #list is empty
        mean_vec.append(np.zeros(200))


In [84]:
mean_vec = np.array(mean_vec)
mean_vec.shape

(49159, 200)

In [95]:
wordvec_df = pd.DataFrame(mean_vec, columns=['pos_' + str(i+1) for i in range(mean_vec.shape[1])])

In [96]:
wordvec_df

Unnamed: 0,pos_1,pos_2,pos_3,pos_4,pos_5,pos_6,pos_7,pos_8,pos_9,pos_10,...,pos_191,pos_192,pos_193,pos_194,pos_195,pos_196,pos_197,pos_198,pos_199,pos_200
0,-0.010198,-0.179238,-1.025236,-1.867354,-1.823680,3.850661,-1.477779,-4.279097,-2.521760,2.594143,...,-3.125776,-1.866833,-1.707454,2.861161,1.975053,1.899742,-5.026517,3.659999,0.724748,-1.651187
1,1.732615,-1.252173,-4.108949,-1.968499,-1.235110,3.162641,1.550849,-3.302919,-3.636766,-3.335399,...,-0.263989,-6.821144,-0.076265,2.487707,1.950684,0.177493,-3.687494,5.028077,0.800709,-0.915920
2,-0.499873,-1.106983,-0.680760,-1.285683,0.345690,0.274244,-0.450435,-0.648394,-0.503568,-0.220936,...,-0.669514,-0.140549,0.584011,-0.059528,1.437418,-0.414156,-0.092404,0.745604,-0.795846,1.568461
3,-0.248466,-0.849085,-1.981010,1.500270,0.554931,2.556098,0.049797,-2.512040,-0.139958,-2.464003,...,0.221153,-3.241053,0.545477,0.528109,1.388469,-1.255121,-1.885528,-0.320378,1.721015,-1.286729
4,0.246611,-0.674946,-0.401917,-1.033009,0.857117,0.437187,-0.412318,-0.212342,-1.377201,0.155113,...,-0.673309,-1.233078,-0.169850,1.375468,0.402532,-0.804037,-0.513254,0.997859,0.528226,-0.118943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49154,3.415200,1.689574,-0.000834,4.092791,-0.969920,-2.832614,-0.801927,-1.482139,-9.751247,0.108661,...,-2.338891,-6.123116,0.457207,3.737474,4.033326,2.485408,-1.182146,1.995655,0.117977,-1.946669
49155,-0.315851,-0.591216,0.350823,-0.007840,1.490175,0.097395,-0.432450,-1.371999,0.262432,-0.788796,...,-0.864714,-0.948977,-0.074738,0.301928,-1.337809,0.815703,-0.677713,1.232031,0.371035,-0.023739
49156,1.757997,-0.652099,-2.704468,0.121732,0.696068,1.091159,-0.896384,-0.885209,-3.427787,-0.479855,...,-2.586801,-5.062278,0.420763,2.801873,3.365159,-0.788514,-1.769330,0.596128,0.915029,-1.051118
49157,1.193100,-3.067238,-3.964396,-3.868015,-0.780873,1.481489,-2.697142,-5.077333,-3.490107,-4.763548,...,-1.325351,-7.504296,0.279931,2.246134,-3.609040,0.442359,-2.554225,2.310891,-1.323209,-1.519198


In [97]:
# Save result
with open('output/wordvec_df.pkl', 'wb') as f:
    pickle.dump(wordvec_df, f)