In [1]:
import numpy as np
import pandas as pd
import os
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.corpus import stopwords
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GanJinZERO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [4]:
x_train_all = train["question_text"].values
x_test = test["question_text"].values
y_train_all = train["target"].values

x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

train_len = len(x_train)
val_len = len(x_val)
test_len = len(x_test)
print(train_len)
print(val_len)
print(test_len)

1044897
261225
56370


In [5]:
doc = np.concatenate((x_train, x_val, x_test))

In [27]:
vectorizer = TfidfVectorizer(stop_words='english', min_df=10, ngram_range=(1, 3))
# vectorizer = CountVectorizer(stop_words='english', min_df=5, ngram_range=(1, 3))
tfidf_model = vectorizer.fit(doc)
train_vector = tfidf_model.transform(x_train)
val_vector = tfidf_model.transform(x_val)
test_vector = tfidf_model.transform(x_test)

In [28]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(train_vector, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
y_pred_val = mnb.predict(val_vector)
from sklearn.metrics import accuracy_score, precision_score, f1_score
acc_val = accuracy_score(y_val, y_pred_val)
pre_val = precision_score(y_val, y_pred_val)
f1_val = 2 / (1 / acc_val + 1 / pre_val)
print(acc_val, pre_val, f1_val)

0.9483204134366925 0.6449667733202068 0.7677651112890926


In [30]:
y_pred_val_prob = mnb.predict_proba(val_vector)[:, 1]

In [31]:
y_pred_val_prob

array([0.00062861, 0.00273975, 0.00508476, ..., 0.00831363, 0.02428476,
       0.40042209])

In [33]:
from sklearn.metrics import f1_score
threshold_optimal = 0
f1_max = 0
divide_count = 100
for i in range(divide_count - 1):
    threshold = (i + 1) / divide_count
    y_pred_val = np.where(y_pred_val_prob > threshold, 1, 0)
    f1 = f1_score(y_val, y_pred_val)
    if f1 >= f1_max:
        threshold_optimal = threshold
        f1_max = f1
    # print(threshold, f1)
print(threshold_optimal, f1_max)

0.21 0.5503901170351105


In [35]:
y_pred_test = np.where(mnb.predict_proba(test_vector)[:, 1] > threshold_optimal, 1, 0)
out_df = pd.DataFrame({"qid":test["qid"].values})
out_df['prediction'] = y_pred_test
out_df.to_csv("submission.csv", index=False)