In [2]:
#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

'''Features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import RidgeClassifier , LogisticRegression

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''Display'''
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
uskr_train = pd.read_csv('C:/Users/Juwon Oh/Documents/github/sentiment analysis/sentiment_six_relations_original_cnjp.csv')
uskr = uskr_train[['sentiment','text_raw']]
uskr = uskr.fillna(0)
uskr = uskr.dropna()
text= uskr.iloc[:, -1].values
labels = uskr.sentiment.values
uskr.shape

(54, 2)

## train set

In [4]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

new_corpus=[' '.join([stemmer.stem(word) for word in text.split(' ')])for text in text]

In [5]:
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

new_corpus=[' '.join([lemmer.lemmatize(word) for word in text.split(' ')])for text in new_corpus]

In [6]:
count_vect = CountVectorizer(stop_words="english", ngram_range=(1, 2), max_df= 0.95, min_df=4)
text_vector = count_vect.fit_transform(new_corpus)
tfidf_transformer = TfidfTransformer(use_idf=True)
text_Tfidf = tfidf_transformer.fit_transform(text_vector)

In [7]:
text_Tfidf.shape

(54, 1059)

In [8]:
#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=1000, 
                   n_iter=10, 
                   random_state=3)

X = lsa.fit_transform(text_Tfidf)

In [12]:
from imblearn.over_sampling import SMOTE
# 모델설정
sm = SMOTE(ratio='auto', kind='regular', k_neighbors=3)

# train데이터를 넣어 복제함
X_resampled, y_resampled = sm.fit_sample(X,list(labels))

print('After OverSampling, the shape of train_X: {}'.format(X_resampled.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(X_resampled.shape))

print("After OverSampling, counts of label '-1': {}".format(sum(y_resampled==-1)))
print("After OverSampling, counts of label '1': {}".format(sum(y_resampled==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_resampled==0)))

After OverSampling, the shape of train_X: (105, 54)
After OverSampling, the shape of train_y: (105, 54) 

After OverSampling, counts of label '-1': 35
After OverSampling, counts of label '1': 35
After OverSampling, counts of label '0': 35


In [13]:
## BernoulliNB
# BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)

Bernoullinb = BernoulliNB()
params = {'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0,3.0 ,5.0, 7.0, 8.0,10.0]}

Bernoullinb = GridSearchCV(BernoulliNB(), 
                   params,
                   cv = 5,
                   verbose = 3, 
                   n_jobs = -1)
Bernoullinb_best_model = Bernoullinb.fit(X_resampled, y_resampled)
print('GridSearchCV best score : {:.2f}%, best_params : {}'.format(Bernoullinb.best_score_*100, Bernoullinb.best_params_))

Fitting 5 folds for each of 13 candidates, totalling 65 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  50 out of  65 | elapsed:    6.2s remaining:    1.8s


GridSearchCV best score : 79.05%, best_params : {'alpha': 0.0}


[Parallel(n_jobs=-1)]: Done  65 out of  65 | elapsed:    6.2s finished


In [14]:
means = Bernoullinb_best_model.cv_results_['mean_test_score']
stds = Bernoullinb_best_model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, Bernoullinb_best_model.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

0.790 (+/-0.177) for {'alpha': 0.0}
0.790 (+/-0.177) for {'alpha': 0.0001}
0.790 (+/-0.177) for {'alpha': 0.001}
0.790 (+/-0.177) for {'alpha': 0.01}
0.790 (+/-0.177) for {'alpha': 0.1}
0.790 (+/-0.177) for {'alpha': 0.5}
0.771 (+/-0.220) for {'alpha': 1.0}
0.771 (+/-0.251) for {'alpha': 2.0}
0.762 (+/-0.233) for {'alpha': 3.0}
0.781 (+/-0.230) for {'alpha': 5.0}
0.771 (+/-0.212) for {'alpha': 7.0}
0.762 (+/-0.233) for {'alpha': 8.0}
0.771 (+/-0.212) for {'alpha': 10.0}


In [15]:
## SGDClassifier
# SGDClassifier(loss=’hinge’, penalty=’l2’, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate=’optimal’, eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)

loss =  ['hinge','log']
penalty = ['l2','l1']
alpha = [1e-6, 1e-3, 1e-1, 1e0, 0.0001]
max_iter = [5,10,20,30, 50, 60, 70, 100,1000, 10000]
tol = [None, 1e-3]
eta0 = [0.1, 0.001]
random_state = [3]

SGD = SGDClassifier()

params = dict(loss=loss,
              penalty=penalty,
              alpha=alpha,
              max_iter=max_iter,
              tol=tol,
              random_state=random_state)

SGDgridsearch = GridSearchCV(SGDClassifier(),
                          params,
                          cv = 3,
                          verbose = 1, 
                          n_jobs = -1)

sgd_best_model = SGDgridsearch.fit(X_resampled, y_resampled)
print('GridSearchCV best score : {:.2f}%, best_params : {}'.format(SGDgridsearch.best_score_*100, SGDgridsearch.best_params_))

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 1034 tasks      | elapsed:   13.4s


GridSearchCV best score : 95.24%, best_params : {'alpha': 1e-06, 'loss': 'log', 'max_iter': 5, 'penalty': 'l1', 'random_state': 3, 'tol': None}


[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   20.1s finished


In [16]:
means = sgd_best_model.cv_results_['mean_test_score']
stds = sgd_best_model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, sgd_best_model.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

0.905 (+/-0.024) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l2', 'random_state': 3, 'tol': None}
0.905 (+/-0.024) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l2', 'random_state': 3, 'tol': 0.001}
0.905 (+/-0.024) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l1', 'random_state': 3, 'tol': None}
0.905 (+/-0.024) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l1', 'random_state': 3, 'tol': 0.001}
0.895 (+/-0.019) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l2', 'random_state': 3, 'tol': None}
0.895 (+/-0.019) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l2', 'random_state': 3, 'tol': 0.001}
0.905 (+/-0.024) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l1', 'random_state': 3, 'tol': None}
0.905 (+/-0.024) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l1', 'random_state': 3, 'tol': 0.001}
0.886 (+/-0.039) for {'alpha': 1e-06, 'l

In [17]:
## LogisticRegression
# LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)


params = {"solver" : ['lbfgs', 'newton-cg'],
          "C":[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6],
          "penalty":["l2"],
          'multi_class': ['multinomial']}


Logisticgridsearch = GridSearchCV(LogisticRegression(), params,
                     cv=5,
                     verbose=1,
                     n_jobs=-1)

Logistic_model = Logisticgridsearch.fit(X_resampled, y_resampled)
print('Logistic GridSearchCV best score : {:.2f}%, best_params : {}'.format(Logisticgridsearch.best_score_*100, Logisticgridsearch.best_params_))


Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Logistic GridSearchCV best score : 94.29%, best_params : {'C': 10000.0, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}


[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:    0.3s finished


In [18]:
## LinearSVC

params_grid = [{'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000, 1500]}]
svm_model = GridSearchCV(SVC(), params_grid, cv=5)
svm_model.fit(X_resampled, y_resampled)
print('Logistic GridSearchCV best score : {:.2f}%, best_params : {}'.format(Logisticgridsearch.best_score_*100, Logisticgridsearch.best_params_))


Logistic GridSearchCV best score : 94.29%, best_params : {'C': 10000.0, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}


In [19]:
## RandomForestClassifier
# randomForestClassifier(n_estimators=’warn’, criterion=’gini’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None)

bootstrap = [True, False]
max_depth = [5,10,20,30, 50, 100, None]
max_features = ['auto', 'sqrt']
min_samples_leaf = [1, 2, 3, 4]
min_samples_split = [2, 5, 10]
n_estimators = [50, 100, 200, 300, 500 ,600, 800]
random_state = [3]

params = dict(bootstrap = bootstrap,
              max_depth = max_depth,
              max_features = max_features,
              min_samples_leaf = min_samples_leaf,
              n_estimators = n_estimators,
              random_state=random_state)

rfgridsearch = GridSearchCV(RandomForestClassifier(),
                          params, 
                          cv=5,
                          verbose=1, 
                          n_jobs=-1)

rf_best_model = rfgridsearch.fit(X_resampled, y_resampled)
print('GridSearchCV best score : {:.2f}%, best_params : {}'.format(rfgridsearch.best_score_*100, rfgridsearch.best_params_))

Fitting 5 folds for each of 784 candidates, totalling 3920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1250 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1800 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2450 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 3200 tasks      | elapsed:  7.3min


GridSearchCV best score : 90.48%, best_params : {'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 3, 'n_estimators': 100, 'random_state': 3}


[Parallel(n_jobs=-1)]: Done 3920 out of 3920 | elapsed:  9.0min finished


In [20]:
## DecisionTreeClassifier## extra tree
# DecisionTreeClassifier(criterion=’gini’, splitter=’best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)

params = {'criterion': ["gini", 'entropy'],
          'min_samples_split':[2,3,5,10,12,15,17,20,25,30,35,40],
          'min_samples_leaf':[1,2,3,4,5,7,9,10,13,15,17,20,23,25,27,30]}

DecisionTree_gridsearch = GridSearchCV(DecisionTreeClassifier(), 
                                       params, 
                                       cv=5,
                                       verbose=1, 
                                       n_jobs=-1)

DecisionTree_best_model = DecisionTree_gridsearch.fit(X_resampled, y_resampled)
print('DecisionTree GridSearchCV best score : {:.2f}%, best_params : {}'.format(DecisionTree_gridsearch.best_score_*100, DecisionTree_gridsearch.best_params_))

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:    0.6s


DecisionTree GridSearchCV best score : 82.86%, best_params : {'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 2}


[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:    2.1s finished


In [21]:
## GradientBoostingClassifier
# GradientBoostingClassifier(loss=’deviance’, learning_rate=0.1, n_estimators=100, subsample=1.0, criterion=’friedman_mse’, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort=’auto’, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)[source]

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
params = {'n_estimators' : [10, 20, 30, 50, 100, 150, 200],
          'max_depth': [3, 5,10,15,20,25,30],
          'min_samples_split':[2,5,7, 10,12,15]}

gbgridsearch = GridSearchCV(GradientBoostingClassifier(), params,
                     cv=5,
                     verbose=1,
                     n_jobs=-1)

gb_best_model = gbgridsearch.fit(X_resampled, y_resampled)
print('extra tree GridSearchCV best score : {:.2f}%, best_params : {}'.format(gbgridsearch.best_score_*100, gbgridsearch.best_params_))

Fitting 5 folds for each of 294 candidates, totalling 1470 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 518 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 868 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 1318 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1470 out of 1470 | elapsed:  1.5min finished


extra tree GridSearchCV best score : 91.43%, best_params : {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 150}


In [22]:
## RidgeClassifier
# class sklearn.linear_model.RidgeClassifier(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, class_weight=None, solver=’auto’, random_state=None)

params = {'alpha': [25,10,4,2,1.0,0.8,0.5,0.3,0.2,0.1,0.05,0.02,0.01]}
RidgeClassifier()
Ridgegridsearch = GridSearchCV(RidgeClassifier(), params,
                     cv=3,
                     verbose=1,
                     n_jobs=-1)

Ridge_model = Ridgegridsearch.fit(X_resampled, y_resampled)
print('Ridge GridSearchCV best score : {:.2f}%, best_params : {}'.format(Ridgegridsearch.best_score_*100, Ridgegridsearch.best_params_))

Fitting 3 folds for each of 13 candidates, totalling 39 fits
Ridge GridSearchCV best score : 91.43%, best_params : {'alpha': 2}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 out of  39 | elapsed:    0.0s finished


In [None]:
clfs = [('sgd', sgd_best_model), ('svm', svm_model),('Logistic', Logistic_model)]

vote_clf = VotingClassifier(clfs, voting= 'soft') 
vote_clf.fit(X_resampled, y_resampled)


#### test set

In [24]:
uskr = pd.read_csv('C:/Users/Juwon Oh/Documents/Dropbox/BigDataDiplomacy/Code/2019/Analysis/Fall/CNJP.csv', encoding = "ISO-8859-1")
fox = uskr[['date', 'title', 'text']]
#fox = fox.dropna()
fox = fox.fillna(0)
test_text = fox.iloc[:, -1].values
fox.shape

(6452, 3)

In [25]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

test_corpus=[' '.join([stemmer.stem(word) for word in text.split(' ')])for text in test_text]

In [26]:
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

test_corpus=[' '.join([lemmer.lemmatize(word) for word in text.split(' ')])for text in test_corpus]

In [27]:
count_vect = CountVectorizer(stop_words="english", ngram_range=(1, 2), max_df= 0.99, min_df=0.01)
text_vector = count_vect.fit_transform(test_corpus)
tfidf_transformer = TfidfTransformer(use_idf=True)
test_Tfidf = tfidf_transformer.fit_transform(text_vector)

In [30]:
#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=54, 
                   n_iter=10, 
                   random_state=3)

y = lsa.fit_transform(test_Tfidf)


In [31]:
sgd_pred = sgd_best_model.predict(y)

In [77]:
rf_pred = rf_best_model.predict(y)

In [None]:
svm_pred =svm_model.predict(y)

In [None]:
Logistic_pred =Logistic_model.predict(y)

In [None]:
y_pred = vote_clf.predict(X_test)

In [None]:
accuracy_score(svm_pred, sgd_pred)

In [None]:
accuracy_score(Logistic_pred, sgd_pred)

In [None]:
len(pred)

In [33]:
fox['pred'] = sgd_pred

In [34]:
fox
fox.to_csv("C:/Users/Juwon Oh/Documents/Dropbox/BigDataDiplomacy/Code/2019/Analysis/Fall/CNJP_predict.csv", mode='w')

--------------------------------

In [35]:
uskr_train = pd.read_csv('C:/Users/Juwon Oh/Documents/github/sentiment analysis/sentiment_six_relations_original_usjp.csv')
uskr = uskr_train[['sentiment','text_raw']]
uskr = uskr.fillna(0)
uskr = uskr.dropna()
text= uskr.iloc[:, -1].values
labels = uskr.sentiment.values
uskr.shape

(158, 2)

## train set

In [36]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

new_corpus=[' '.join([stemmer.stem(word) for word in text.split(' ')])for text in text]

In [37]:
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

new_corpus=[' '.join([lemmer.lemmatize(word) for word in text.split(' ')])for text in new_corpus]

In [38]:
count_vect = CountVectorizer(stop_words="english", ngram_range=(1, 2), max_df= 0.95, min_df=4)
text_vector = count_vect.fit_transform(new_corpus)
tfidf_transformer = TfidfTransformer(use_idf=True)
text_Tfidf = tfidf_transformer.fit_transform(text_vector)

In [39]:
text_Tfidf.shape

(158, 4443)

In [40]:
#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=1000, 
                   n_iter=10, 
                   random_state=3)

X = lsa.fit_transform(text_Tfidf)

In [41]:
from imblearn.over_sampling import SMOTE
# 모델설정
sm = SMOTE(ratio='auto', kind='regular', k_neighbors=5)

# train데이터를 넣어 복제함
X_sampled, y_sampled = sm.fit_sample(X,list(labels))

print('After OverSampling, the shape of train_X: {}'.format(X_sampled.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(X_sampled.shape))

print("After OverSampling, counts of label '-1': {}".format(sum(y_sampled==-1)))
#print("After OverSampling, counts of label '1': {}".format(sum(y_resampled==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_sampled==0)))

After OverSampling, the shape of train_X: (222, 158)
After OverSampling, the shape of train_y: (222, 158) 

After OverSampling, counts of label '-1': 74
After OverSampling, counts of label '0': 74


In [42]:
## BernoulliNB
# BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)

Bernoullinb = BernoulliNB()
params = {'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0,3.0 ,5.0, 7.0, 8.0,10.0]}

Bernoullinb = GridSearchCV(BernoulliNB(), 
                   params,
                   cv = 5,
                   verbose = 3, 
                   n_jobs = -1)
Bernoullinb_best_model = Bernoullinb.fit(X_sampled, y_sampled)
print('GridSearchCV best score : {:.2f}%, best_params : {}'.format(Bernoullinb.best_score_*100, Bernoullinb.best_params_))

Fitting 5 folds for each of 13 candidates, totalling 65 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


GridSearchCV best score : 66.67%, best_params : {'alpha': 0.0}


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  50 out of  65 | elapsed:    7.8s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done  65 out of  65 | elapsed:    7.8s finished


In [43]:
means = Bernoullinb_best_model.cv_results_['mean_test_score']
stds = Bernoullinb_best_model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, Bernoullinb_best_model.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

0.667 (+/-0.162) for {'alpha': 0.0}
0.667 (+/-0.162) for {'alpha': 0.0001}
0.667 (+/-0.162) for {'alpha': 0.001}
0.667 (+/-0.162) for {'alpha': 0.01}
0.667 (+/-0.162) for {'alpha': 0.1}
0.667 (+/-0.162) for {'alpha': 0.5}
0.662 (+/-0.156) for {'alpha': 1.0}
0.662 (+/-0.156) for {'alpha': 2.0}
0.667 (+/-0.144) for {'alpha': 3.0}
0.653 (+/-0.169) for {'alpha': 5.0}
0.653 (+/-0.169) for {'alpha': 7.0}
0.653 (+/-0.169) for {'alpha': 8.0}
0.649 (+/-0.163) for {'alpha': 10.0}


In [44]:
## SGDClassifier
# SGDClassifier(loss=’hinge’, penalty=’l2’, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate=’optimal’, eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)

loss =  ['hinge','log']
penalty = ['l2','l1']
alpha = [1e-6, 1e-3, 1e-1, 1e0, 0.0001]
max_iter = [5,10,20,30, 50, 60, 70, 100,1000, 10000]
tol = [None, 1e-3]
eta0 = [0.1, 0.001]
random_state = [3]

SGD = SGDClassifier()

params = dict(loss=loss,
              penalty=penalty,
              alpha=alpha,
              max_iter=max_iter,
              tol=tol,
              random_state=random_state)

SGDgridsearch = GridSearchCV(SGDClassifier(),
                          params,
                          cv = 5,
                          verbose = 1, 
                          n_jobs = -1)

sgd_best_model = SGDgridsearch.fit(X_sampled, y_sampled)
print('GridSearchCV best score : {:.2f}%, best_params : {}'.format(SGDgridsearch.best_score_*100, SGDgridsearch.best_params_))

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 795 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 1142 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1588 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1985 out of 2000 | elapsed:  2.8min remaining:    1.2s


GridSearchCV best score : 82.88%, best_params : {'alpha': 0.001, 'loss': 'log', 'max_iter': 20, 'penalty': 'l2', 'random_state': 3, 'tol': None}


[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  3.1min finished


In [45]:
means = sgd_best_model.cv_results_['mean_test_score']
stds = sgd_best_model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, sgd_best_model.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

0.793 (+/-0.162) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l2', 'random_state': 3, 'tol': None}
0.793 (+/-0.162) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l2', 'random_state': 3, 'tol': 0.001}
0.784 (+/-0.169) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l1', 'random_state': 3, 'tol': None}
0.784 (+/-0.169) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 5, 'penalty': 'l1', 'random_state': 3, 'tol': 0.001}
0.748 (+/-0.094) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l2', 'random_state': 3, 'tol': None}
0.748 (+/-0.094) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l2', 'random_state': 3, 'tol': 0.001}
0.784 (+/-0.159) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l1', 'random_state': 3, 'tol': None}
0.784 (+/-0.159) for {'alpha': 1e-06, 'loss': 'hinge', 'max_iter': 10, 'penalty': 'l1', 'random_state': 3, 'tol': 0.001}
0.802 (+/-0.118) for {'alpha': 1e-06, 'l

In [46]:
## LogisticRegression
# LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)


params = {"solver" : ['lbfgs', 'newton-cg'],
          "C":[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6],
          "penalty":["l2"],
          'multi_class': ['multinomial']}


Logisticgridsearch = GridSearchCV(LogisticRegression(), params,
                     cv=5,
                     verbose=1,
                     n_jobs=-1)

Logistic_model = Logisticgridsearch.fit(X_sampled, y_sampled)
print('Logistic GridSearchCV best score : {:.2f}%, best_params : {}'.format(Logisticgridsearch.best_score_*100, Logisticgridsearch.best_params_))


Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Logistic GridSearchCV best score : 81.53%, best_params : {'C': 100.0, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}


[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:    0.9s finished


In [47]:
## LinearSVC

params_grid = [{'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000, 1500]}]
svm_model = GridSearchCV(SVC(), params_grid, cv=5)
svm_model.fit(X_resampled, y_resampled)
print('Logistic GridSearchCV best score : {:.2f}%, best_params : {}'.format(Logisticgridsearch.best_score_*100, Logisticgridsearch.best_params_))


Logistic GridSearchCV best score : 81.53%, best_params : {'C': 100.0, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}


In [48]:
## RandomForestClassifier
# randomForestClassifier(n_estimators=’warn’, criterion=’gini’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None)

bootstrap = [True, False]
max_depth = [5,10,20,30, 50, 100, None]
max_features = ['auto', 'sqrt']
min_samples_leaf = [1, 2, 3, 4]
min_samples_split = [2, 5, 10]
n_estimators = [50, 100, 200, 300, 500 ,600, 800]
random_state = [3]

params = dict(bootstrap = bootstrap,
              max_depth = max_depth,
              max_features = max_features,
              min_samples_leaf = min_samples_leaf,
              n_estimators = n_estimators,
              random_state=random_state)

rfgridsearch = GridSearchCV(RandomForestClassifier(),
                          params, 
                          cv=3,
                          verbose=1, 
                          n_jobs=-1)

rf_best_model = rfgridsearch.fit(X_sampled, y_sampled)
print('GridSearchCV best score : {:.2f}%, best_params : {}'.format(rfgridsearch.best_score_*100, rfgridsearch.best_params_))

Fitting 3 folds for each of 784 candidates, totalling 2352 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 2352 out of 2352 | elapsed:  7.6min finished


GridSearchCV best score : 81.08%, best_params : {'bootstrap': False, 'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'random_state': 3}


In [49]:
## DecisionTreeClassifier## extra tree
# DecisionTreeClassifier(criterion=’gini’, splitter=’best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)

params = {'criterion': ["gini", 'entropy'],
          'min_samples_split':[2,3,5,10,12,15,17,20,25,30,35,40],
          'min_samples_leaf':[1,2,3,4,5,7,9,10,13,15,17,20,23,25,27,30]}

DecisionTree_gridsearch = GridSearchCV(DecisionTreeClassifier(), 
                                       params, 
                                       cv=3,
                                       verbose=1, 
                                       n_jobs=-1)

DecisionTree_best_model = DecisionTree_gridsearch.fit(X_sampled, y_sampled)
print('DecisionTree GridSearchCV best score : {:.2f}%, best_params : {}'.format(DecisionTree_gridsearch.best_score_*100, DecisionTree_gridsearch.best_params_))

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    0.9s


DecisionTree GridSearchCV best score : 62.61%, best_params : {'criterion': 'entropy', 'min_samples_leaf': 9, 'min_samples_split': 15}


[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed:    4.4s finished


In [50]:
## GradientBoostingClassifier
# GradientBoostingClassifier(loss=’deviance’, learning_rate=0.1, n_estimators=100, subsample=1.0, criterion=’friedman_mse’, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort=’auto’, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)[source]

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
params = {'n_estimators' : [10, 20, 30, 50, 100, 150, 200],
          'max_depth': [3, 5,10,15,20,25,30],
          'min_samples_split':[2,5,7, 10,12,15]}

gbgridsearch = GridSearchCV(GradientBoostingClassifier(), params,
                     cv=3,
                     verbose=1,
                     n_jobs=-1)

gb_best_model = gbgridsearch.fit(X_sampled, y_sampled)
print('extra tree GridSearchCV best score : {:.2f}%, best_params : {}'.format(gbgridsearch.best_score_*100, gbgridsearch.best_params_))

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:  3.7min finished


extra tree GridSearchCV best score : 77.48%, best_params : {'max_depth': 3, 'min_samples_split': 15, 'n_estimators': 200}


In [51]:
## RidgeClassifier
# class sklearn.linear_model.RidgeClassifier(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, class_weight=None, solver=’auto’, random_state=None)

params = {'alpha': [25,10,4,2,1.0,0.8,0.5,0.3,0.2,0.1,0.05,0.02,0.01]}
RidgeClassifier()
Ridgegridsearch = GridSearchCV(RidgeClassifier(), params,
                     cv=3,
                     verbose=1,
                     n_jobs=-1)

Ridge_model = Ridgegridsearch.fit(X_sampled, y_sampled)
print('Ridge GridSearchCV best score : {:.2f}%, best_params : {}'.format(Ridgegridsearch.best_score_*100, Ridgegridsearch.best_params_))

Fitting 3 folds for each of 13 candidates, totalling 39 fits
Ridge GridSearchCV best score : 81.08%, best_params : {'alpha': 0.5}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 out of  39 | elapsed:    0.0s finished


In [42]:
clfs = [('sgd', sgd_best_model), ('svm', svm_model),('Logistic', Logistic_model)]

vote_clf = VotingClassifier(clfs, voting= 'soft') 
vote_clf.fit(X_resampled, y_resampled)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  9.0min


KeyboardInterrupt: 

#### test set

In [52]:
uskr = pd.read_csv('C:/Users/Juwon Oh/Documents/Dropbox/BigDataDiplomacy/Code/2019/Analysis/Fall/USJP.csv', encoding = "ISO-8859-1")
fox = uskr[['date', 'title', 'text']]
#fox = fox.dropna()
fox = fox.fillna(0)
test_text = fox.iloc[:, -1].values
fox.shape

(6415, 3)

In [53]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

test_corpus=[' '.join([stemmer.stem(word) for word in text.split(' ')])for text in test_text]

In [54]:
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

test_corpus=[' '.join([lemmer.lemmatize(word) for word in text.split(' ')])for text in test_corpus]

In [55]:
count_vect = CountVectorizer(stop_words="english", ngram_range=(1, 2), max_df= 0.99, min_df=0.01)
text_vector = count_vect.fit_transform(test_corpus)
tfidf_transformer = TfidfTransformer(use_idf=True)
test_Tfidf = tfidf_transformer.fit_transform(text_vector)

In [58]:
#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=158, 
                   n_iter=10, 
                   random_state=3)

y = lsa.fit_transform(test_Tfidf)


In [60]:
sgd_pred = sgd_best_model.predict(y)

In [36]:
svm_pred =svm_model.predict(y)

In [37]:
Logistic_pred =Logistic_model.predict(y)

In [48]:
y_pred = vote_clf.predict(X_test)

NotFittedError: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [38]:
accuracy_score(svm_pred, sgd_pred)

0.7806004618937644

In [39]:
accuracy_score(Logistic_pred, sgd_pred)

0.8937644341801386

In [53]:
len(pred)

NameError: name 'pred' is not defined

In [55]:
fox

Unnamed: 0,date,title,text,pred
0,2018-11-21,Interpol elects South Korean Kim Jong Yang Pre...,russia ambitions head international police age...,-1
1,2018-12-20,"Mattis quits, says his views aren't 'aligned' ...",defense secretary james mattis resigned thursd...,-1
2,2019-01-16,Schellenberg death sentence: Canadian's verdic...,death sentence imposed canadian robert lloyd ...,-1
3,2019-01-27,Why a military spat between Japan and South Ko...,japan southkorea engaged heated military dispu...,-1
4,2019-02-22,President Trump meets with Kim Jong Un,trump spoke reporters hano talks northkorean k...,-1
5,2019-03-08,#MyFreedomDay 2019: Students teach the world a...,enormous thank everybody took stand slavery m...,0
6,2019-05-02,As last jailed Kim Jong Nam killing suspect fr...,second woman accused killing half brother nor...,-1
7,2019-05-24,Trump arrives in Japan with his Asia policy in...,shinzo abe long game paying year ago japanese ...,-1
8,2019-06-17,Chinese President Xi Jinping to make first off...,chinese president jinping will arrive northkor...,-1
9,2019-06-19,Trump has two strategies to counter nuclear th...,inherit one nuclear crisis might look like mi...,-1


In [61]:
fox['pred'] = sgd_pred

In [62]:
fox
fox.to_csv("C:/Users/Juwon Oh/Documents/Dropbox/BigDataDiplomacy/Code/2019/Analysis/Fall/usjp_predict.csv", mode='w')