In [1]:
import numpy as np
import pandas as pd
import gzip
import pickle
from collections import Counter
from utils import stem_token, split_all_data


from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [2]:
RANDOM_SEED = 2856

In [3]:
data_dict = split_all_data(.8)
train_df = data_dict['train']

Loading the data...
Splitting the data...


In [6]:
%%timeit
stem_token(train_df['tweet'].iloc[0][0])

435 µs ± 12.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
# testing the tokenizer/lemmatizer
for item in train_df.sample(5).itertuples():
    tokenized = stem_token(item[3][0])
    count = Counter(tokenized)
    print(f'user: {item[1]} - label: {item[-1]}\n', count.most_common(10), '\n')

user: 982944228304187392 - label: 1
 [('rt', 1), ('buffer', 1), ('but', 1), ('summersalt', 1), ('fast', 1), ('trajectori', 1), ('fuel', 1), ('earli', 1), ('fund', 1), ('style', 1)] 

user: 2408243229 - label: 1
 [('http', 1), ('co', 1), ('svg1mv9jj7', 1)] 

user: 861643695430524928 - label: 1
 [('rt', 1), ('graphixsli', 1), ('my', 1), ('entri', 1), ('artwork', 1), ('challeng', 1), ('thegenesisep', 1), ('realkaash', 1), ('http', 1), ('co', 1)] 

user: 197974105 - label: 0
 [('restaur', 2), ('wesweav', 1), ('order', 1), ('hardena', 1), ('waroeng', 1), ('surabaya', 1), ('intrigu', 1), ('long', 1), ('time', 1), ('ago', 1)] 

user: 1266022889725341696 - label: 1
 [('rt', 1), ('patehogan', 1), ('super', 1), ('bless', 1), ('receiv', 1), ('offer', 1), ('lagrang', 1), ('colleg', 1), ('coachbcrump', 1), ('coach_ruth01', 1)] 



In [88]:
def extract_topic_feature(row, components, thresh, random_state=None):
    if row is not None:
        vectorize = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', min_df=1)
        nmf_model = NMF(n_components=components, init='nndsvd', max_iter=100000, random_state=random_state)
        nmf_pipe = make_pipeline(vectorize, nmf_model)
        
        tweets = np.array(row)
        W = nmf_pipe.fit_transform(tweets)
        print(W)
        
        index_max = []
        for index in range(W.shape[0]):
            max_val_index = np.argmax(W[index]) if np.max(W[index]) > thresh else -1
            index_max.append(max_val_index)
        print(index_max)    
        return len(set([index for index in index_max if index != -1]))/tweets.shape[0]
    else:
        return np.nan

In [92]:
simple_documents_train = ['The cat, dog, and duck were friends. \
                          The cat and duck met at the dog\'s house despite the dog\'s objections.', 
                   'Computers have power supplies that regulate power consumption.', 
                   'Plug in the monitor and turn on the computer. \
                     the monitor is now ready for use.', 
                   'You will find the plug on the right side of the screen.', 
                   'My friend likes coffee and cats.', 
                   'His dog gets along well with my friend.']
extract_topic_feature(simple_documents_train, 3, 0.7)

[[0.60622293 0.         0.        ]
 [0.         0.         1.        ]
 [0.         0.7210456  0.        ]
 [0.         0.7210456  0.        ]
 [0.42884922 0.         0.        ]
 [0.74257518 0.         0.        ]]
[-1, 2, 1, 1, -1, 0]




0.5

In [28]:
vectorize = TfidfVectorizer(ngram_range=(1, 2), stop_words=None, tokenizer=stem_token, min_df=1)
nmf_model = NMF(n_components=20, init='nndsvd', max_iter=1000, random_state=None)
nmf_pipe = make_pipeline(vectorize, nmf_model)

In [29]:
tweets = np.array(train_df['tweet'].loc[0])
W = nmf_pipe.fit_transform(tweets)

In [33]:
print(len(train_df['tweet'].loc[0]))
print(tweets.shape)
W.shape

200
(200,)


(200, 20)

In [37]:
for tw in range(5):
    print(tweets[tw])
    print(W[tw])

RT @CarnivalCruise: 🎉 Are you ready to see what our newest ship’s name will be? 🎉 Thanks to all our partners for helping us unbox the name.…

[0.         0.01299792 0.         0.08002773 0.         0.
 0.0151254  0.02365543 0.         0.         0.         0.00308785
 0.         0.0846081  0.         0.01539741 0.         0.
 0.02472308 0.        ]
Who has time for receipts? Not me. @epson receipt scanners make it easy. No mess = no stress! Check it out at https://t.co/ofqbTdz0Qk. https://t.co/BtYwuyz9N5

[2.62949862e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
 7.19983142e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 5.10196391e-05 2.81157716e-03 0.00000000e+00
 5.80334800e-02 5.23719170e-04 0.00000000e+00 0.00000000e+00
 4.26264627e-02 1.82492982e-03 1.89216302e-01 0.00000000e+00]
Steady wants to encourage you to invest in your financial future. Connect your bank account to @TheSteadyApp and have access to benefits like income insights, online medical visits, a

In [None]:
topics = 