In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy import sparse

In [3]:
# Read the training set and turn the continuous `target` score into a
# binary class label used as the supervised target below.
train_data_path = "../data/train.csv"
train_df = pd.read_csv(train_data_path)

# Scores of 0.5 and above map to class 1, everything else to class 0.
train_df["label"] = (train_df["target"] >= 0.5).astype(int)
train_y = train_df["label"]

In [4]:
# Load the sparse feature matrices saved under ../preprocessing_data.
# Presumably these are TF-IDF features written by a separate preprocessing
# notebook, with xtrain_tfidf row-aligned to train_df -- TODO confirm.
xtrain_tfidf = sparse.load_npz(file="../preprocessing_data/xtrain_tfidf.npz")
xtest_tfidf = sparse.load_npz(file="../preprocessing_data/xtest_tfidf.npz")

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [6]:
# Base learners; both are seeded so repeated runs give the same models.
clf2 = DecisionTreeClassifier(random_state=1)
clf1 = LogisticRegression(random_state=1)

# Majority-vote ensemble over the two base learners. The step names 'lr' and
# 'dt' are the prefixes used by the hyper-parameter grid keys further down.
# NOTE(review): with only two voters under hard voting, every disagreement is
# a tie; scikit-learn documents that ties are resolved by ascending class
# order, so the ensemble would predict 1 only when both models do -- confirm
# this is the intended behaviour.
eclf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('dt', clf2),
    ],
    voting='hard',
)

In [7]:
# Candidate values for the logistic regression's C parameter.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

# Search space for the ensemble. Keys follow the `<step>__<parameter>`
# convention, where <step> is the estimator name given to the voting
# classifier ('lr' or 'dt').
# Grid size: 1 * 1 * 7 * 2 * 8 * 9 = 1008 parameter combinations.
params = dict(
    lr__solver=['liblinear'],
    lr__penalty=["l2"],
    lr__C=c_params,
    dt__criterion=["gini", "entropy"],
    # Depth 9 is deliberately absent from the original list, so it is kept
    # as an explicit literal rather than a range.
    dt__max_depth=[10, 8, 7, 6, 5, 4, 3, 2],
    dt__min_samples_leaf=list(range(1, 10)),
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params` with 5-fold cross-validation, using 4
# parallel workers. No `scoring` is passed, so candidates are ranked by the
# estimator's own default score.
# Every grid combination is fitted once per fold, so this cell is expensive.
grid = GridSearchCV(eclf, params, cv=5, n_jobs=4)
grid = grid.fit(xtrain_tfidf, train_y)

In [None]:
# Predict class labels for the test features with the fitted grid search.
ypred = grid.predict(X=xtest_tfidf)

In [None]:
# Read the raw test set; only its `id` column is used for the submission.
# NOTE(review): this path lives under ../input/..., whereas the training CSV
# is read from ../data/ -- confirm both locations exist in the same
# environment.
test_data_path = "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"
test_df = pd.read_csv(filepath_or_buffer=test_data_path)

In [None]:
# Pair each test id with its predicted label. `ypred` is attached
# positionally, so it must have one entry per row of test_df.
submit = test_df[['id']].assign(prediction=ypred)

# Preview the first rows as a sanity check.
submit.head()

In [None]:
import os

# Persist the submission without the DataFrame index.
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists first; otherwise this final cell would fail only
# after the long grid search above has already finished.
submission_path = '../result/submission_ensemble.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submit.to_csv(submission_path, index=False)