In [9]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

Load dataset

In [10]:
# Read the two header-less monthly exports with one shared column schema and stack them.
columns = ['id','text','food','activity','label']
df1 = pd.read_csv("./small_2015-06_key_word.csv", index_col=False, header=None, names=columns)
df2 = pd.read_csv("./small_2015-11_key_word.csv", index_col=False, header=None, names=columns)
df = pd.concat((df1,df2),axis=0)

# Positions of rows whose text is a string with more than 4 space-separated tokens.
# Non-string entries (e.g. missing values) are skipped by an explicit type check
# instead of a bare `except:` that would also hide unrelated errors.
lst = [i for i, sentence in enumerate(df['text'])
       if isinstance(sentence, str) and len(sentence.split(' ')) > 4]

# NOTE(review): replace(3, 2) rewrites the value 3 in EVERY column (including 'id',
# 'food' and 'activity'), not only in 'label' -- presumably only the label merge
# 3 -> 2 is intended; confirm before restricting it to df['label'].
df = df.iloc[lst].replace(3,2).reset_index(drop=True)

In [12]:
# Print the filtered frame's (rows, columns), then preview its first five rows.
print(df.shape)
df.head(n=5)

(7316139, 5)

Feed LDA's output into a random forest

In [32]:
# Hyper-parameter combinations of the pickled models, as
# (number of topics, maxFeatures, maxDf); minDf is 1 for every model.
_model_params = [
    (25, 12000, 0.4), (25, 12000, 0.5), (25, 12000, 0.6),
    (25, 16000, 0.4), (25, 16000, 0.5),
    (50, 12000, 0.4), (50, 12000, 0.5), (50, 16000, 0.4),
    (75, 12000, 0.4), (75, 12000, 0.5), (75, 12000, 0.6),
    (75, 16000, 0.4), (75, 16000, 0.5),
    (100, 12000, 0.4),
]

# The vectorizer and LDA pickles share one naming scheme and differ only in prefix.
_path_template = "./data/{}_topics={}_maxFeatures={}_maxDf={}_minDf=1.pickle"

# Parallel lists: tf_lst[k] is the vectorizer that belongs to the LDA model lda_lst[k].
tf_lst = [_path_template.format("tfVectorizer", topics, max_features, max_df)
          for topics, max_features, max_df in _model_params]
lda_lst = [_path_template.format("LDA", topics, max_features, max_df)
           for topics, max_features, max_df in _model_params]

In [40]:
%%time
# Seed for the class-balancing down-sample. With a fixed seed every model is
# evaluated on the same sampled rows and repeated runs draw the same sample.
SAMPLE_SEED = 0

# zip() silently truncates to the shorter list, so fail loudly on a mismatch.
assert len(tf_lst) == len(lda_lst), "tf_lst and lda_lst must pair up one-to-one"

acc_lst = []
for tf_name, lda_name in zip(tf_lst, lda_lst):
    # Load the fitted vectorizer and its matching LDA model.
    # NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
    with open(tf_name, "rb") as input_file:
        print(tf_name)
        tf_vectorizer = pkl.load(input_file)
    with open(lda_name, "rb") as input_file:
        print(lda_name)
        lda = pkl.load(input_file)

    # Turn every text into its LDA topic distribution and append those columns to df.
    start = time.time()
    probs = lda.transform(tf_vectorizer.transform(df['text'].tolist()))
    probs = pd.DataFrame(probs)
    X = pd.concat((df,probs),axis=1)
    print("Transform Time: {}s".format(round(time.time()-start,4)))

    # Balance the classes: down-sample label 0 to the number of label-2 rows.
    # Rows carrying any other label are left out of the experiment.
    is_label0 = X['label'] == 0
    is_label2 = X['label'] == 2
    X0 = X[is_label0].sample(n=int(is_label2.sum()), random_state=SAMPLE_SEED)
    X = pd.concat((X0, X[is_label2]), axis=0)

    # Features are the topic columns appended after df's own columns.
    X_train, X_test, y_train, y_test = train_test_split(
        X.iloc[:, df.shape[1]:], X['label'], test_size=0.33, random_state=42)

    # Train a shallow random forest and record its held-out accuracy.
    model = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
    model.fit(X_train,y_train)
    acc = round(model.score(X_test,y_test),4)
    acc_lst.append(acc)
    print("Test Accuracy: {}".format(acc))

./data/tfVectorizer_topics=25_maxFeatures=12000_maxDf=0.4_minDf=1.pickle
./data/LDA_topics=25_maxFeatures=12000_maxDf=0.4_minDf=1.pickle
Transform Time: 402.9718s
Test Accuracy: 0.6044
./data/tfVectorizer_topics=25_maxFeatures=12000_maxDf=0.5_minDf=1.pickle
./data/LDA_topics=25_maxFeatures=12000_maxDf=0.5_minDf=1.pickle
Transform Time: 394.3956s
Test Accuracy: 0.6061
./data/tfVectorizer_topics=25_maxFeatures=12000_maxDf=0.6_minDf=1.pickle
./data/LDA_topics=25_maxFeatures=12000_maxDf=0.6_minDf=1.pickle
Transform Time: 391.5192s
Test Accuracy: 0.6097
./data/tfVectorizer_topics=25_maxFeatures=16000_maxDf=0.4_minDf=1.pickle
./data/LDA_topics=25_maxFeatures=16000_maxDf=0.4_minDf=1.pickle
Transform Time: 389.5022s
Test Accuracy: 0.5938
./data/tfVectorizer_topics=25_maxFeatures=16000_maxDf=0.5_minDf=1.pickle
./data/LDA_topics=25_maxFeatures=16000_maxDf=0.5_minDf=1.pickle
Transform Time: 392.8519s
Test Accuracy: 0.592
./data/tfVectorizer_topics=50_maxFeatures=12000_maxDf=0.4_minDf=1.pickle
./d



./data/LDA_topics=75_maxFeatures=12000_maxDf=0.4_minDf=1.pickle
Transform Time: 484.2251s
Test Accuracy: 0.6181
./data/tfVectorizer_topics=75_maxFeatures=12000_maxDf=0.5_minDf=1.pickle




./data/LDA_topics=75_maxFeatures=12000_maxDf=0.5_minDf=1.pickle
Transform Time: 483.2631s
Test Accuracy: 0.6097
./data/tfVectorizer_topics=75_maxFeatures=12000_maxDf=0.6_minDf=1.pickle




./data/LDA_topics=75_maxFeatures=12000_maxDf=0.6_minDf=1.pickle
Transform Time: 480.1674s
Test Accuracy: 0.6109
./data/tfVectorizer_topics=75_maxFeatures=16000_maxDf=0.4_minDf=1.pickle




./data/LDA_topics=75_maxFeatures=16000_maxDf=0.4_minDf=1.pickle
Transform Time: 487.3209s
Test Accuracy: 0.6047
./data/tfVectorizer_topics=75_maxFeatures=16000_maxDf=0.5_minDf=1.pickle




./data/LDA_topics=75_maxFeatures=16000_maxDf=0.5_minDf=1.pickle
Transform Time: 487.4308s
Test Accuracy: 0.6143
./data/tfVectorizer_topics=100_maxFeatures=12000_maxDf=0.4_minDf=1.pickle
./data/LDA_topics=100_maxFeatures=12000_maxDf=0.4_minDf=1.pickle
Transform Time: 587.0208s
Test Accuracy: 0.6269
CPU times: user 2h 2min 53s, sys: 7min 3s, total: 2h 9min 57s
Wall time: 2h 15min 11s


Print result table

In [43]:
# Tabulate test accuracy per LDA model. The first 11 characters of each path
# ("./data/LDA_") are dropped so only the hyper-parameter part of the name shows.
model_names = [path[11:] for path in lda_lst]
result = pd.DataFrame({'lda model': model_names, 'acc': acc_lst})
result

Unnamed: 0,lda model,acc
0,topics=25_maxFeatures=12000_maxDf=0.4_minDf=1....,0.6044
1,topics=25_maxFeatures=12000_maxDf=0.5_minDf=1....,0.6061
2,topics=25_maxFeatures=12000_maxDf=0.6_minDf=1....,0.6097
3,topics=25_maxFeatures=16000_maxDf=0.4_minDf=1....,0.5938
4,topics=25_maxFeatures=16000_maxDf=0.5_minDf=1....,0.592
5,topics=50_maxFeatures=12000_maxDf=0.4_minDf=1....,0.6153
6,topics=50_maxFeatures=12000_maxDf=0.5_minDf=1....,0.6081
7,topics=50_maxFeatures=16000_maxDf=0.4_minDf=1....,0.6145
8,topics=75_maxFeatures=12000_maxDf=0.4_minDf=1....,0.6181
9,topics=75_maxFeatures=12000_maxDf=0.5_minDf=1....,0.6097


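Observation: models with more topics and a smaller maxFeatures setting tend to perform better (best run: 100 topics, maxFeatures=12000, test accuracy 0.6269).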