In [1]:
import pickle
import numpy as np
import xgboost as xgb
from CountFeatureGenerator import *
from TfidfFeatureGenerator import *
from SvdFeatureGenerator import *
from Word2VecFeatureGenerator import *
from SentimentFeatureGenerator import *

In [2]:
# XGBoost hyper-parameters for a multi-class softmax objective (num_class=4),
# evaluated with multi-class log loss.
# NOTE(review): none of the cells visible here pass this dict to xgboost;
# confirm whether it is still needed.
params_xgb = dict(
    max_depth=6,
    colsample_bytree=0.6,
    subsample=1.0,
    eta=0.1,
    silent=1,
    objective='multi:softmax',
    eval_metric='mlogloss',
    num_class=4,
)

In [3]:
# Load the pickled table from disk; the file handle is only needed for the
# unpickling itself, so the shape report happens after the file is closed.
with open('related_data.pkl', 'rb') as pkl_file:
    related_data = pickle.load(pkl_file)
print('related_data loaded, the shape is ', related_data.shape)

# The 'target' column, taken as a raw array, is the label vector.
train_y = related_data['target'].values
print(train_y.shape)

related_data loaded, the shape is  (13427, 11)
(13427,)


In [4]:
# Assemble the training feature matrix: read each generator's stored 'train'
# features and stack all of them horizontally into a single array.
generators = [
    CountFeatureGenerator(),
    TfidfFeatureGenerator(),
    SvdFeatureGenerator(),
    Word2VecFeatureGenerator(),
    SentimentFeatureGenerator()
]
features = []
for g in generators:
    # Keep what this generator returned under its own name so the shared
    # accumulator is not overwritten (and is never iterated while it grows).
    # assumes read() returns a list of 2-D arrays with one row per sample — TODO confirm
    generator_features = g.read(header='train')
    if g.name()=='tfidfFeatureGenerator':
        # Only the last array returned by the TF-IDF generator is used.
        # NOTE(review): presumably the earlier entries are the raw TF-IDF
        # matrices, too wide to stack here — confirm against the generator.
        features.append(generator_features[-1])
    else:
        features.extend(generator_features)
# hstack needs every block to have the same number of rows.
train_X = np.hstack(features)
print(train_X.shape)

feature names:  ['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
# Echo train_X so the notebook renders the array as this cell's output
# (quick visual sanity check of the feature values).
train_X

array([[ 8.        ,  8.        ,  1.        , ...,  0.08      ,
         0.885     ,  0.03488235],
       [ 5.        ,  5.        ,  1.        , ...,  0.03011429,
         0.901     ,  0.06888571],
       [13.        , 13.        ,  1.        , ...,  0.14115   ,
         0.77945   ,  0.02945   ],
       ...,
       [11.        , 11.        ,  1.        , ...,  0.10942105,
         0.86442105,  0.02615789],
       [ 8.        ,  8.        ,  1.        , ...,  0.158     ,
         0.728     ,  0.115     ],
       [11.        , 11.        ,  1.        , ...,  0.03528571,
         0.86414286,  0.10057143]])

In [7]:
# Echo train_y (the 'target' column values) so the notebook renders the
# label array as this cell's output.
train_y

array([0, 1, 0, ..., 2, 0, 2], dtype=int64)

In [8]:
# 5-fold cross-validation of a gradient-boosting classifier on train_X / train_y.
# Every fold's validation accuracy is recorded in train_cv_acc, and the model
# from the fold with the highest validation accuracy is kept in best_clf.
kf = KFold(n_splits=5, shuffle=True, random_state = 1415926)
train_cv_acc = []  # validation accuracy per fold, in fold order
best_score = 0
best_clf = None  # bound up front so later cells never hit an undefined name
for train_idx, val_idx in kf.split(train_X):
    train_data = train_X[train_idx]
    train_label = train_y[train_idx]
    val_data = train_X[val_idx]
    val_label = train_y[val_idx]
    clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
    clf.fit(train_data, train_label)
    pred_label = clf.predict(val_data)
    score = accuracy_score(val_label, pred_label)
    train_cv_acc.append(score)
    # Always accept the first fold's model, then only strictly better ones.
    if best_clf is None or score>best_score:
        best_score = score
        best_clf=clf
    print("accuracy = ", score)

      Iter       Train Loss   Remaining Time 
         1        8216.5275           15.49m
         2        7920.6262           14.76m
         3        7676.0906           14.81m
         4        7448.3173           14.80m
         5        7267.1475           14.64m
         6        7115.8775           14.49m
         7        6959.8799           14.38m
         8        6839.5640           14.30m
         9        6717.7665           14.16m
        10        6598.4330           14.05m
        20        5851.2523           13.13m
        30        5359.3949           12.40m
        40        4969.8418           11.71m
        50        4681.1144           11.01m
        60        4423.3088           10.29m
        70        4184.7860            9.55m
        80        3972.8249            8.82m
        90        3799.5705            8.08m
       100        3643.1063            7.35m
       200        2591.1783            0.00s
accuracy =  0.8912881608339538
      Iter       Train 

In [10]:
from joblib import dump, load

In [11]:
# Persist best_clf (the classifier from the best-scoring CV fold) to
# 'relation.joblib' using joblib.dump.
dump(best_clf, 'relation.joblib')

['relation.joblib']