In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy import sparse

In [15]:
train_data_path = "../data/train.csv"
test_data_path = "../data/test.csv"

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

In [3]:
train_df['label'] = np.where(train_df['target']>=0.5, 1, 0)

In [4]:
train_x = train_df["comment_text"]
train_y = train_df["label"]

test_x = test_df["comment_text"]

In [5]:
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,3), max_features=50000)

xtrain_tfidf = tfidf_vect.fit_transform(train_x)

xtest_tfidf = tfidf_vect.transform(test_x)

In [10]:
with open('../preprocessing_data/train_y.pkl', 'wb')as f:
    pickle.dump(train_y, f)

In [15]:
sparse.save_npz("../preprocessing_data/xtrain_tfidf.npz", xtrain_tfidf)
sparse.save_npz("../preprocessing_data/xtest_tfidf.npz", xtest_tfidf)

In [2]:
with open('../preprocessing_data/train_y.pkl', 'rb')as f:
    train_y = pickle.load(f)

In [3]:
xtrain_tfidf = sparse.load_npz("../preprocessing_data/xtrain_tfidf.npz")
xtest_tfidf = sparse.load_npz("../preprocessing_data/xtest_tfidf.npz")

In [7]:
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(xtrain_tfidf, train_y, test_size=0.3)

In [9]:
import lightgbm as lgb

In [11]:
from sklearn.model_selection import GridSearchCV


In [12]:
estimator = lgb.LGBMClassifier(num_leaves=2)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.05, 0.5, 1],
    'n_estimators': [20, 40, 60, 80, 100, 120]
}

gbm = GridSearchCV(estimator, param_grid, cv=5, scoring='roc_auc')

gbm.fit(xtrain_tfidf, train_y)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.5, 'n_estimators': 120}


In [13]:
ypred = gbm.best_estimator_.predict(xtest_tfidf)

In [16]:
submit = pd.DataFrame({'id': test_df['id'], 'prediction': ypred})
submit.head()

Unnamed: 0,id,prediction
0,7000000,0
1,7000001,0
2,7000002,0
3,7000003,0
4,7000004,1


In [18]:
submit.to_csv('../result/submission.csv', index=False)