In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pickle
import time
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
with open('zebal_version2.pickle', 'rb') as f:
    df = pickle.load(f)

In [3]:
df = df[df['board']!='가입 인사 (등업 필수)']
df = df.reset_index()

In [4]:
df.shape

(28569, 13)

In [5]:
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full

In [6]:
docs_dict = Dictionary(df['morphs'])
docs_dict.filter_extremes(no_below=20, no_above=0.2)
docs_dict.compactify()

In [7]:
docs_corpus = [docs_dict.doc2bow(doc) for doc in df['morphs']]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf  = model_tfidf[docs_corpus]

In [8]:
docs_vecs   = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])

In [9]:
docs_vecs.shape

(28569, 9798)

In [114]:
tfidf = pd.DataFrame(docs_vecs)
print(tfidf.shape)
tfidf.head()

(28569, 9798)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9788,9789,9790,9791,9792,9793,9794,9795,9796,9797
0,0.042929,0.07239,0.043678,0.032439,0.041437,0.027148,0.038549,0.03487,0.0194,0.048566,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.109378,0.051771,0.0,0.026054,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.035297,0.089282,0.0,0.0,0.0,0.0,0.047545,0.0,0.079758,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.229646,0.0,0.0,0.0,0.0,0.0,0.02946,0.053296,0.029652,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df['L1'] = df['type'].apply(lambda x: x[0].upper())
df['L2'] = df['type'].apply(lambda x: x[1].upper())
df['L3'] = df['type'].apply(lambda x: x[2].upper())
df['L4'] = df['type'].apply(lambda x: x[3].upper())

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 9798)
(19141,)
(9428, 9798)
(9428,)


In [15]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf.score(X_test, y_test)))

Accuracy on training set: 0.999
Accuracy on test set: 0.723


In [16]:
with open('all_embbeding_df_v2.pickle', 'rb') as f:
    em = pickle.load(f)

In [17]:
def make_matrix(ndf):
    target=[]
    for i in ndf:
        target.append(i)
    return pd.DataFrame(target)

In [18]:
d2v_dvow = make_matrix(em['doc2vec_PV_DBOW'])
d2v_dm   = make_matrix(em['doc2vec_PV_DMO'])
w2v_cbow = make_matrix(em['w2v_cbow_avg'])
w2v_sg   = make_matrix(em['w2v_skipgram_avg'])
ft_cbow  = make_matrix(em['ft_cbow_avg'])
ft_sg    = make_matrix(em['ft_skip_gram_avg'])
bert     = make_matrix(em['bert'])

In [21]:
print(d2v_dvow.shape)
print(d2v_dm.shape)
print(w2v_cbow.shape)
print(w2v_sg.shape)
print(ft_cbow.shape)
print(ft_sg.shape)
print(bert.shape)

(28569, 300)
(28569, 300)
(28569, 300)
(28569, 300)
(28569, 300)
(28569, 300)
(28569, 768)


In [23]:
X_train, X_test, y_train, y_test = train_test_split(d2v_dvow, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [24]:
rf1 = RandomForestClassifier(n_estimators=100)
rf1.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf1.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf1.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.678


In [25]:
X_train, X_test, y_train, y_test = train_test_split(d2v_dm, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [26]:
rf2 = RandomForestClassifier(n_estimators=100)
rf2.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf2.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf2.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.674


In [31]:
X_train, X_test, y_train, y_test = train_test_split(bert, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 768)
(19141,)
(9428, 768)
(9428,)


In [32]:
rf3 = RandomForestClassifier(n_estimators=100)
rf3.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf3.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf3.score(X_test, y_test)))

Accuracy on training set: 0.998
Accuracy on test set: 0.673


In [33]:
X_train, X_test, y_train, y_test = train_test_split(ft_sg, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [34]:
rf4 = RandomForestClassifier(n_estimators=100)
rf4.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf4.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf4.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.685


In [35]:
X_train, X_test, y_train, y_test = train_test_split(ft_cbow, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [36]:
rf5 = RandomForestClassifier(n_estimators=100)
rf5.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf5.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf5.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.678


In [39]:
X_train, X_test, y_train, y_test = train_test_split(w2v_cbow, df['L1'], test_size=0.33, shuffle=True)

X_train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [40]:
rf6 = RandomForestClassifier(n_estimators=100)
rf6.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf6.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf6.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.672


In [41]:
X_train, X_test, y_train, y_test = train_test_split(w2v_sg, df['L1'], test_size=0.33, shuffle=True)

X_train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [42]:
rf7 = RandomForestClassifier(n_estimators=100)
rf7.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf7.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf7.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.681


In [43]:
with open('tfidf_weighted_ft_cbow.pickle', 'rb') as f:
    tfidf_ft_cbow = pickle.load(f)

In [44]:
with open('tfidf_weighted_w2v_cbow.pickle', 'rb') as f:
    tfidf_w2v_cbow = pickle.load(f)

In [45]:
print(tfidf_ft_cbow.shape)
print(tfidf_w2v_cbow.shape)

(28569, 300)
(28569, 300)


In [46]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_ft_cbow, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [47]:
rf8 = RandomForestClassifier(n_estimators=100)
rf8.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf8.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf8.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.674


In [48]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_w2v_cbow, df['L1'], test_size=0.33, shuffle=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [49]:
rf9 = RandomForestClassifier(n_estimators=100)
rf9.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf9.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf9.score(X_test, y_test)))

Accuracy on training set: 0.999
Accuracy on test set: 0.677


In [57]:
tfidf_ft_cbow = pd.DataFrame(tfidf_ft_cbow)

In [66]:
tfidf_ft_cbow['doc_length'] = tfidf.astype(bool).sum(axis=1)

In [67]:
tfidf_ft_cbow.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,doc_length
0,-3.056679,-14.323002,22.047594,5.649813,-7.647969,-17.724348,8.904348,7.41914,-11.971848,6.308992,...,-9.683761,-0.663285,-7.988602,-4.617202,-18.561714,12.227542,14.009789,-20.620363,5.808738,187
1,-3.492559,-13.700104,15.758248,14.620514,-13.091576,-18.462344,11.730476,3.339061,-8.214804,3.016187,...,-12.188702,-3.803913,-7.276903,-5.728725,-6.777703,10.424125,14.788713,-11.396765,11.397697,107
2,12.133795,-10.051956,21.768468,20.949859,-24.642687,-32.412084,36.642935,29.364848,-27.717601,24.385785,...,18.062141,3.661026,-18.630547,-4.459971,-33.010792,-2.097863,27.439975,-24.291529,10.235375,167
3,38.749646,-4.481841,-13.748722,-12.367349,-85.487468,-120.258891,28.924157,59.043577,-55.906252,49.075628,...,32.391058,6.640453,-4.400569,26.604966,-11.427141,49.428109,47.06149,-32.930481,51.669129,264
4,0.875521,-1.576987,0.038765,2.45256,-1.454332,1.15621,2.818536,3.561648,-1.414714,0.645277,...,0.631251,0.663567,-1.746502,1.425575,-2.103482,0.785906,1.702815,-0.052868,1.221824,11


In [69]:
tfidf_ft_cbow_avg = tfidf_ft_cbow.iloc[:,:300].div(tfidf_ft_cbow.doc_length, axis=0)

In [72]:
tfidf_ft_cbow_avg.shape

(28569, 300)

In [76]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_ft_cbow_avg, df['L1'], test_size=0.33, shuffle=True)

X_train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [77]:
rf10 = RandomForestClassifier(n_estimators=100)
rf10.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf10.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf10.score(X_test, y_test)))

Accuracy on training set: 0.999
Accuracy on test set: 0.684


In [78]:
tfidf_w2v_cbow = pd.DataFrame(tfidf_w2v_cbow)

In [79]:
tfidf_w2v_cbow['doc_length'] = tfidf.astype(bool).sum(axis=1)

In [80]:
tfidf_w2v_cbow.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,doc_length
0,-2.712694,-2.99715,-0.249982,-2.232253,2.971276,1.190884,-0.944313,2.621672,1.226795,1.030179,...,-1.288647,-1.731501,-1.89742,2.995213,-3.231183,2.72075,0.953915,0.65788,-2.042107,187
1,-2.393256,-1.623572,-2.017376,-1.911446,4.251178,0.799256,0.666524,3.280085,1.146256,1.251537,...,-0.276895,-0.666388,-1.126608,2.113791,-3.473832,1.518529,0.758649,-0.501081,-1.834546,107
2,-3.972993,-1.804513,-0.836238,-1.886061,2.800836,-0.093594,-2.576355,1.560733,-0.600077,-2.543403,...,-1.053787,0.003788,0.031028,1.425051,-4.326657,3.479216,-0.029903,0.794681,0.495996,167
3,-3.637395,-0.174189,-0.958779,-1.405303,2.341508,0.965515,-0.777458,0.900278,0.597169,-2.583456,...,-0.516564,-2.606686,-1.218247,0.352698,-3.161194,3.026933,1.384906,-1.281274,-0.704735,264
4,-1.778204,-2.209571,-1.155508,-0.38233,2.177818,-0.451603,-0.466486,1.105094,0.16192,-0.612459,...,0.202685,-0.049516,0.390669,0.055274,-0.1235,-0.019647,1.608512,1.103746,-0.645787,11


In [81]:
tfidf_w2v_cbow_avg = tfidf_w2v_cbow.iloc[:,:300].div(tfidf_w2v_cbow.doc_length, axis=0)

In [82]:
tfidf_w2v_cbow_avg.shape

(28569, 300)

In [83]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_w2v_cbow_avg, df['L1'], test_size=0.33, shuffle=True)

X_train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [84]:
rf11 = RandomForestClassifier(n_estimators=100)
rf11.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf11.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf11.score(X_test, y_test)))

Accuracy on training set: 0.999
Accuracy on test set: 0.688


In [88]:
grand = pd.concat([tfidf, d2v_dvow, d2v_dm, w2v_cbow, w2v_sg, ft_cbow, ft_sg, bert, tfidf_w2v_cbow_avg, tfidf_ft_cbow_avg], axis=1)

MemoryError: 

In [104]:
grand.shape

(28569, 12966)

In [106]:
9798+(300*8)+768

12966

In [107]:
X_train, X_test, y_train, y_test = train_test_split(grand, df['L1'], test_size=0.33, shuffle=True)

X_train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


(19141, 12966)
(19141,)
(9428, 12966)
(9428,)


In [108]:
rf12 = RandomForestClassifier(n_estimators=100)
rf12.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf12.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf12.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.685


## skip-gram

In [109]:
with open('tfidf_weighted_fasttext_sg.pickle', 'rb') as f:
    tfidf_ft_sg = pickle.load(f)

In [110]:
with open('tfidf_weighted_w2v_sg.pickle', 'rb') as f:
    tfidf_w2v_sg = pickle.load(f)

In [111]:
print(tfidf_ft_sg.shape)
print(tfidf_w2v_sg.shape)

(28569, 300)
(28569, 300)


In [112]:
tfidf_ft_sg = pd.DataFrame(tfidf_ft_sg)

In [115]:
tfidf_ft_sg['doc_length'] = tfidf.astype(bool).sum(axis=1)

In [116]:
tfidf_ft_sg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,doc_length
0,0.055281,0.129965,0.779223,-0.194621,0.310045,1.087189,0.129187,-0.658543,-0.800134,1.122928,...,0.010272,0.751759,1.288994,0.583079,0.513856,0.964674,1.055155,-0.477381,1.310911,187
1,-0.028055,0.02702,0.589116,0.083634,0.743619,0.632939,0.038787,-0.454959,-0.448542,0.940666,...,-0.591633,0.559054,0.879247,0.655048,0.965386,0.499161,1.007352,0.12136,1.062095,107
2,0.077384,0.37244,0.867954,-0.793481,0.079906,1.1003,0.561361,-0.536753,-0.971745,0.41193,...,0.693052,0.332443,1.190908,-0.027092,-0.597123,0.214816,1.221441,-1.092815,1.118877,167
3,0.578316,0.774726,0.745852,-0.403864,0.224637,1.346764,0.297837,-0.751705,-0.330156,0.140048,...,0.133265,0.843804,0.734855,0.714973,-0.510664,0.173528,1.169966,-0.61462,1.341421,264
4,0.111679,-0.160458,-0.067474,-0.759882,0.555862,0.561684,0.328941,-0.38534,-0.295898,0.27312,...,-0.035486,-0.126238,0.24396,0.059136,-0.057643,-0.080028,0.743419,-0.203569,0.367991,11


In [117]:
tfidf_ft_sg_avg = tfidf_ft_sg.iloc[:,:300].div(tfidf_ft_sg.doc_length, axis=0)

In [118]:
tfidf_ft_sg_avg.shape

(28569, 300)

In [119]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_ft_sg_avg, df['L1'], test_size=0.33, shuffle=True)

X_train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [120]:
rf13 = RandomForestClassifier(n_estimators=100)
rf13.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf13.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf13.score(X_test, y_test)))

Accuracy on training set: 0.999
Accuracy on test set: 0.684


In [121]:
tfidf_w2v_sg = pd.DataFrame(tfidf_w2v_sg)

In [122]:
tfidf_w2v_sg['doc_length'] = tfidf.astype(bool).sum(axis=1)

In [123]:
tfidf_w2v_sg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,doc_length
0,-0.099167,-1.419174,-0.297549,-0.20169,-0.964325,1.842195,-0.843325,-1.526516,0.355111,-0.772252,...,-1.811234,0.072788,0.327495,0.046594,-1.218108,0.968988,1.247389,1.805132,-2.234951,187
1,0.556868,-1.494847,-0.375744,-0.680782,0.052378,1.904577,-0.325941,-1.277456,0.31204,0.22152,...,-1.815255,-0.240381,-0.062291,-0.306186,-1.203015,1.08708,1.159048,1.582046,-1.835209,107
2,-0.10531,-2.040135,0.615345,-0.586227,-1.447999,1.809291,0.059345,-0.031875,-0.034386,-0.50147,...,-1.335117,0.283257,0.329117,-0.2208,-0.994899,0.33584,1.707354,0.968053,-2.307026,167
3,0.381661,-2.35981,0.501509,-0.865403,-0.662938,2.013371,-0.397091,-0.836548,0.252063,0.349931,...,-1.775375,0.396823,-0.137661,-0.241579,-0.947303,0.696959,1.590799,1.333666,-2.760601,264
4,-0.191938,-0.999453,-0.291717,-0.04465,-0.485681,0.740915,-0.125883,-0.177642,0.082964,-0.417643,...,-0.349799,-0.214773,0.396289,0.481057,-0.356253,0.181578,0.553737,0.251194,-1.004305,11


In [124]:
tfidf_w2v_sg_avg = tfidf_w2v_sg.iloc[:,:300].div(tfidf_w2v_sg.doc_length, axis=0)

In [125]:
tfidf_w2v_sg_avg.shape

(28569, 300)

In [126]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_w2v_sg_avg, df['L1'], test_size=0.33, shuffle=True)

X_train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


(19141, 300)
(19141,)
(9428, 300)
(9428,)


In [127]:
rf14 = RandomForestClassifier(n_estimators=100)
rf14.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf14.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf14.score(X_test, y_test)))

Accuracy on training set: 0.999
Accuracy on test set: 0.674
