# まずは分析を始める準備から

In [1]:
from django.conf import settings
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from gensim import corpora, matutils, models
from gensim.models import LsiModel
import xgboost as xgb
import scipy as sp

from preprocess import *
from utils import get_train_data

In [2]:
tags, data = get_train_data()

In [3]:
dictionary = corpora.Dictionary(data)
len(dictionary)

63437

## 1.1特に前処理なしバージョンでモデルを作る

In [45]:
bow = make_bow(data, dictionary)

ナイーブベイズ

In [5]:
params = {'alpha': np.arange(0.1,2,0.1)}
grid = GridSearchCV(MultinomialNB(), params, cv=5)
grid.fit(bow, tags)
grid.best_params_
grid.best_score_

0.86862244897959184

ロジスティック回帰

In [7]:
params = [{'penalty': ['l1'], 'C': [1,10,100]},
        {'penalty': ['l2'], 'C': [1,10,100]}]
grid = GridSearchCV(LogisticRegression(), params, cv=5)
grid.fit(bow, tags)
print(grid.best_params_)
print(grid.best_score_)

{'penalty': 'l2', 'C': 1}
0.861394557823


ランダムフォレスト

In [46]:
params = {'n_estimators': [5, 10, 20, 30, 50, 100, 300, 1000, 10000], "max_features": ["auto", "sqrt", "log2"]}
grid = GridSearchCV(RandomForestClassifier(), params, cv=5, n_jobs=-1)
grid.fit(bow, tags)
print(grid.best_params_)
print(grid.best_score_)

{'max_features': 'log2', 'n_estimators': 10000}
0.831632653061


## 2.頻出語・レアワードを削減したモデル

In [3]:
dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=4, no_above=0.6)
len(dictionary)

18584

In [4]:
bow_reduce = make_bow(data, dictionary)

ナイーブベイズ

In [19]:
params = {'alpha': np.arange(0.1, 2,0.1 )}
grid = GridSearchCV(MultinomialNB(), params, cv=5)
grid.fit(bow_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'alpha': 0.5}
0.876003854802


ロジスティック回帰

In [15]:
params = [{'penalty': ['l1'], 'C': [1,10,100]},
        {'penalty': ['l2'], 'C': [1,10,100]}]
grid = GridSearchCV(LogisticRegression(), params, cv=5)
grid.fit(bow_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'penalty': 'l2'}
0.863154513331


ランダムフォレスト

In [6]:
params = {'n_estimators': [5, 10, 20, 30, 50, 100, 300, 1000, 10000], "max_features": ["auto", "sqrt", "log2"]}
grid = GridSearchCV(RandomForestClassifier(), params, cv=5, n_jobs=-1)
grid.fit(bow_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 10000, 'max_features': 'log2'}
0.838419530999


SVM

In [7]:
params = tuned_parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]}]
grid = GridSearchCV(SVC(), params, cv=5, n_jobs=-1)
grid.fit(bow_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'C': 100, 'kernel': 'rbf', 'gamma': 0.0001}
0.841631866367


xgb

In [20]:
bow_reduce.shape

(3113, 13957)

In [13]:
param_test1 = {
    'max_depth':[1,2]
}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=10, max_depth=5,
                                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
                       param_grid = param_test1,n_jobs=4,iid=False, cv=5)
gsearch1.fit(bow_reduce, tags)
print(gsearch1.best_params_)
print(gsearch1.best_score_)



{'max_depth': 2}
0.686308938804


## 3.tfidf

In [11]:
dictionary = corpora.Dictionary(data)

In [12]:
bow_tfidf = tf_idf_bow(data, dictionary)

ナイーブベイズ

In [6]:
params = {'alpha': np.arange(0.1, 2,0.1 )}
grid = GridSearchCV(MultinomialNB(), params, cv=5)
grid.fit(bow_tfidf, tags)
print(grid.best_params_)
print(grid.best_score_)

{'alpha': 0.10000000000000001}
0.867651782846


ロジスティック回帰

In [16]:
params = [{'penalty': ['l2'], 'C': np.arange(1, 200, 10)}]
grid = GridSearchCV(LogisticRegression(), params, cv=5)
grid.fit(bow_tfidf, tags)
print(grid.best_params_)
print(grid.best_score_)

{'penalty': 'l2', 'C': 81}
0.884677160296


SVM

In [10]:
params = tuned_parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]}]
grid = GridSearchCV(SVC(), params, cv=5, n_jobs=-1)
grid.fit(bow_tfidf, tags)
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'kernel': 'linear'}
0.871827818824


ランダムフォレスト

In [11]:
params = {'n_estimators': [5, 10, 20, 30, 50, 100, 300, 1000, 10000], "max_features": ["auto", "sqrt", "log2"]}
grid = GridSearchCV(RandomForestClassifier(), params, cv=5, n_jobs=-1)
grid.fit(bow_tfidf, tags)
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 1000, 'max_features': 'log2'}
0.850626405397


## 4.tfidfかつデータ削減

In [15]:
dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=6, no_above=0.6)
len(dictionary)

12870

In [16]:
bow_tfidf_reduce = tf_idf_bow(data, dictionary)

ナイーブベイズ

In [12]:
params = {'alpha': np.arange(0.1, 1,0.1 )}
grid = GridSearchCV(MultinomialNB(), params, cv=5)
grid.fit(bow_tfidf_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'alpha': 0.10000000000000001}
0.875361387729


ロジスティック回帰

In [55]:
params = {'penalty': ['l2'], 'C': [1,10,100]}
grid = GridSearchCV(LogisticRegression(), params, cv=5)
grid.fit(bow_tfidf_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'penalty': 'l2'}
0.880501124317


## <font color="DarkTurquoise">⬆️ベンチマーク！</font>

ランダムフォレスト

In [14]:
params = {'n_estimators': [10, 50, 100, 300, 1000, 10000], "max_features": ["auto", "sqrt", "log2"]}
grid = GridSearchCV(RandomForestClassifier(), params, cv=5, n_jobs=-1)
grid.fit(bow_tfidf_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 1000, 'max_features': 'log2'}
0.854802441375


SVM

In [10]:
params = tuned_parameters = [
    {'C': [1, 10, 100], 'kernel': ['linear']},
    {'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]}]
grid = GridSearchCV(SVC(), params, cv=5, n_jobs=-1)
grid.fit(bow_tfidf_reduce, tags)
print(grid.best_params_)
print(grid.best_score_)

{'kernel': 'linear', 'C': 1}
0.892033086635


xgb

In [17]:
param_test1 = {
    'max_depth':range(3, 10, 2),
    'min_child_weight':range(1, 6, 2)
    
}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=100, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
                       param_grid = param_test1,n_jobs=4,iid=False, cv=5)
gsearch1.fit(bow_tfidf_reduce, tags)
print(gsearch1.best_params_)
print(gsearch1.best_score_)











{'max_depth': 9, 'min_child_weight': 3}
0.844796454863


## 5.LSI

In [28]:
dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=3, no_above=0.6)
len(dictionary)

18348

In [30]:
bow_lsi = lsi_bow(data, dictionary, num_topics=400, tfidf=None)

In [31]:
params = {'penalty': ['l2'], 'C': [1,10,100]}
grid = GridSearchCV(LogisticRegression(), params, cv=5)
grid.fit(bow_lsi, tags)
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'penalty': 'l2'}
0.847414070029


## 6.LSI tfidf

In [6]:
dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=6, no_above=0.6)
len(dictionary)

12870

In [None]:
num_topics = [200, 300, 400, 500]

In [None]:
for num_topic in num_topics:
    bow_lsi_tfidf = lsi_bow(data, dictionary, num_topics=num_topic, tfidf=True)
    params = {'penalty': ['l2'], 'C': np.arange(1,21,4)}
    grid = GridSearchCV(LogisticRegression(), params, cv=5)
    grid.fit(bow_lsi_tfidf, tags)
    print(grid.best_params_)
    print(grid.best_score_)
    

In [7]:
bow_lsi_tfidf = lsi_bow(data, dictionary, num_topics=400, tfidf=True)

In [61]:
params = {'penalty': ['l2'], 'C': np.arange(1,21,4)}
grid = GridSearchCV(LogisticRegression(), params, cv=5)
grid.fit(bow_lsi_tfidf, tags)
print(grid.best_params_)
print(grid.best_score_)

{'C': 13, 'penalty': 'l2'}
0.889816896884


In [18]:
params = {'n_estimators': [5, 10, 20, 30, 50, 100, 300, 1000, 10000], "max_features": ["auto", "sqrt", "log2"]}
grid = GridSearchCV(RandomForestClassifier(), params, cv=5, n_jobs=-1)
grid.fit(bow_lsi_tfidf, tags)
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 10000, 'max_features': 'sqrt'}
0.867973016383


In [8]:
params = tuned_parameters = [
    {'C': [1, 10, 100], 'kernel': ['linear']},
    {'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]}]
grid = GridSearchCV(SVC(), params, cv=5, n_jobs=-1)
grid.fit(bow_lsi_tfidf, tags)
print(grid.best_params_)
print(grid.best_score_)

{'kernel': 'linear', 'C': 1}
0.885067479321
