In [1542]:
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.pipeline import Pipeline, FeatureUnion

In [1543]:
# Load the already-split training and test sets from CSV files located in the
# notebook's working directory.
train = pd.read_csv("1. train_original final.csv")
test = pd.read_csv("test final.csv")

In [1544]:
# Replace the values of the `category` column with ordinal codes. The mapping is
# fitted on the training frame only and then reused unchanged on the test frame.
# NOTE(review): the previews below show `category` as ints for train but as
# floats (e.g. 6.0) for test -- check whether the test set contains category
# values that were not seen during fit.
encoder = ce.OrdinalEncoder(cols=['category'])
train = encoder.fit_transform(train)
test = encoder.transform(test)

In [1545]:
# Drop the two 'Unnamed' columns from both frames.
# NOTE(review): presumably stale index columns written by earlier CSV exports --
# confirm against the code that produced the files.
train = train.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
test = test.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [1546]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,title,category,has_exclamation,has_question,has_number,num_count,mark_count,original_title,label_score
0,guru tinggi muhammadiyah indonesia tolak revis...,1,0,0,1,2,0,40 Perguruan Tinggi Muhammadiyah se-Indonesia ...,0
1,tiga bulan nikah onadio ronaldo karunia anak p...,2,0,0,0,0,1,"Tiga Bulan Nikah, Onadio Ronaldo Dikaruniai An...",1
2,sosok sukanto tanoto yang bakal buat ada lahan...,1,0,0,0,0,1,Sosok Sukanto Tanoto yang Bakal Buat Pengadaan...,1
3,anggota dprd provinsi sumut lantik,3,0,0,1,3,0,100 Anggota DPRD Provinsi Sumut Dilantik,0
4,polisi di meksiko temu mayat yang mutilasi,1,0,0,1,2,0,Polisi di Meksiko Temukan 44 Mayat yang Dimuti...,0


In [1547]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,title,category,has_exclamation,has_question,has_number,num_count,mark_count,original_title,label_score
0,embus angin di phillip island ingat rossi saat...,6.0,0,0,1,3,0,Embusan Angin di Phillip Island Ingatkan Rossi...,0
1,cara tiap zodiak manipulasi ada pisc dan capri...,7.0,1,0,0,0,2,"Cara Setiap Zodiak Memanipulasi Keadaan, Pisce...",1
2,prostitusi onlin di karimun korban buat utang ...,1.0,0,0,0,0,2,"Prostitusi Online di Karimun, Korban Dibuat Be...",0
3,libat dalam sinetron ppt jilid dube amerika ak...,2.0,0,0,1,2,1,"Terlibat Dalam Sinetron PPT Jilid 12, Dubes Am...",0
4,instrumenta hadir iris praktik seni media dan ...,7.0,0,0,1,1,0,Instrumenta #2 Hadirkan Irisan Praktik Seni Me...,0


In [1548]:
# Add a `total_character` feature to both frames: the string length of each
# row's `original_title`.
for frame in (train, test):
    frame['total_character'] = frame['original_title'].str.len()

In [1549]:
# Preview the training frame again to check the new `total_character` column.
train.head()

Unnamed: 0,title,category,has_exclamation,has_question,has_number,num_count,mark_count,original_title,label_score,total_character
0,guru tinggi muhammadiyah indonesia tolak revis...,1,0,0,1,2,0,40 Perguruan Tinggi Muhammadiyah se-Indonesia ...,0,65
1,tiga bulan nikah onadio ronaldo karunia anak p...,2,0,0,0,0,1,"Tiga Bulan Nikah, Onadio Ronaldo Dikaruniai An...",1,73
2,sosok sukanto tanoto yang bakal buat ada lahan...,1,0,0,0,0,1,Sosok Sukanto Tanoto yang Bakal Buat Pengadaan...,1,94
3,anggota dprd provinsi sumut lantik,3,0,0,1,3,0,100 Anggota DPRD Provinsi Sumut Dilantik,0,40
4,polisi di meksiko temu mayat yang mutilasi,1,0,0,1,2,0,Polisi di Meksiko Temukan 44 Mayat yang Dimuti...,0,50


In [1550]:
# Word-level, unigram-only TF-IDF vectorizer with L2 normalisation.
vec_tdidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    norm='l2',
)

In [1551]:
#### Building the selectors

In [1552]:
class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column by single-key indexing.

    ``transform`` returns ``X[key]``; for a pandas DataFrame that is a
    one-dimensional Series. Intended for text columns whose values are handed
    to a downstream vectorizer.

    Parameters
    ----------
    key : hashable
        Label of the column to extract.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        """Learn nothing and return ``self``; extra arguments are ignored."""
        return self

    def transform(self, X):
        """Return the column ``X[self.key]``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column, keeping it two-dimensional.

    ``transform`` indexes with a one-element list (``X[[key]]``); for a pandas
    DataFrame that yields a single-column DataFrame rather than a Series.
    Intended for numeric columns.

    Parameters
    ----------
    key : hashable
        Label of the column to extract.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``, i.e. the selected column as a one-column frame."""
        columns = [self.key]
        return X[columns]

#### Building the pipeline

In [1553]:
# Classifier used as the final pipeline step. Other candidates are left as
# commented-out one-liners.
# NOTE(review): both `random_state` and `seed` are passed, with different
# values. These look like two spellings of the RNG seed in XGBoost -- confirm
# which one takes effect in the installed version and keep only that one.
# clf = xgb.XGBClassifier()
clf = xgb.XGBClassifier(
    random_state=42,
    n_estimators=210,
    colsample_bytree=0.65,
    subsample=1,
    learning_rate=0.1,
    max_depth=12,
    reg_lambda=1,
    seed=4,
)
# clf = RandomForestClassifier()
# clf = MultinomialNB()

In [1554]:
# Text branch: select the `title` column, then vectorize it with TF-IDF.
text = Pipeline(steps=[
    ('selector', TextSelector(key='title')),
    ('vectorizer', vec_tdidf),
])

In [1555]:
def _single_column_branch(column):
    """Build a one-step Pipeline that selects `column` via NumberSelector."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
    ])


# One pass-through branch per non-text feature column; each branch only selects
# its column and applies no further transformation.
category = _single_column_branch('category')
has_exclamation = _single_column_branch('has_exclamation')
has_question = _single_column_branch('has_question')
has_number = _single_column_branch('has_number')
num_count = _single_column_branch('num_count')
mark_count = _single_column_branch('mark_count')
total_character = _single_column_branch('total_character')

In [1556]:
# Combine the TF-IDF text branch with the seven single-column branches into one
# feature union.
feats = FeatureUnion(transformer_list=[
    ('title', text),
    ('category', category),
    ('has_exclamation', has_exclamation),
    ('has_question', has_question),
    ('has_number', has_number),
    ('num_count', num_count),
    ('mark_count', mark_count),
    ('total_character', total_character),
])

In [1557]:
# End-to-end model: feature extraction followed by the classifier.
pipe = Pipeline(steps=[
    ('feats', feats),
    ('clf', clf),
])

In [1558]:
# Separate the target column `label_score` from the remaining columns for both
# the training and the test frame.
y_train = train['label_score']
X_train = train.drop(columns='label_score')
y_test = test['label_score']
X_test = test.drop(columns='label_score')

### Running the grid search

In [1559]:
# param_grid = {
# }
# 'clf__subsample': 0.5, 0.6, 0.7,0.8,1
# 'clf__colsample_bytree':0.8,1, 0.4, 0.7
# , 'clf__reg_lambda' : [1], 'clf__subsample' : [0.5, 0.7, 0.9] -> grid search reports 0.9 as best, whereas it should be 0.7
# 'clf__gamma' : [0.1,0.2] gave poor results

In [1560]:
# grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, n_jobs=-1, verbose=0, return_train_score=True)

In [1561]:
# grid_search.fit(X_train, y_train)

In [1562]:
# grid_search.cv_results_['mean_train_score']

In [1563]:
# grid_search.cv_results_['mean_test_score']

In [1564]:
# grid_search.best_params_

In [1565]:
# clf_test = grid_search.best_estimator_

In [1566]:
# preds = clf_test.predict(X_test)

In [1567]:
# f1 = f1_score(y_test, preds)

In [1568]:
# from sklearn.metrics import recall_score
# accuracy = accuracy_score(y_test, preds)
# recall = recall_score(y_test, preds, average = 'binary')



# print("F1 score:", f1)
# print("Accuracy:", accuracy)
# # print("Recall Score:", recall)

#### Prediction with the pipeline

In [1569]:
# Fit the feature union and the classifier on the training data.
pipe.fit(X_train, y_train)

In [1570]:
# Predict labels for the test rows with the fitted pipeline.
preds = pipe.predict(X_test)

In [1571]:
# F1 score of the test predictions (f1_score called with its default settings).
f1 = f1_score(y_test, preds)

In [1572]:
# Compute accuracy and recall for the test predictions and print them together
# with the F1 score. `recall_score` is imported in the notebook's top import
# cell alongside the other metric helpers, so no import is needed here.
accuracy = accuracy_score(y_test, preds)
recall = recall_score(y_test, preds)

print("F1 score:", f1)
print("Accuracy:", accuracy)
print("Recall Score:", recall)

F1 score: 0.8783855549654807
Accuracy: 0.9107908063887806
Recall Score: 0.8464687819856704
