## Reading Preprocessed data

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV

In [2]:
train_df = pd.read_csv("/kaggle/input/yahoo-troll-question-detection/train_df.csv")
test_df = pd.read_csv("/kaggle/input/yahoo-troll-question-detection/test_df.csv")

In [3]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


## Vectorization of the dataset

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# # from sklearn.feature_extraction.text import TfidfVectorizer

# # We can change the number of words counted together and see results on accuracy
# CountVectorizer(analyzer='word', ngram_range=(1, 3))

word_vectorizer = TfidfVectorizer(
    strip_accents = 'unicode',
    analyzer = 'word',
    ngram_range = (1, 3),
    max_df = 0.5,
    max_features = 150000,
)

char_vectorizer = TfidfVectorizer(
    strip_accents = 'unicode',
    analyzer = 'char',
    ngram_range = (1, 3),
    max_df = 0.5,
    max_features = 150000,
)

# # tfidf = TfidfVectorizer()

X_train1_wv = word_vectorizer.fit_transform(train_df.question_text.values.astype('U'))
# X_validate1_wv = word_vectorizer.transform(X_validate.values.astype('U'))

X_train1_cv = char_vectorizer.fit_transform(train_df.question_text.values.astype('U'))
# X_validate1_cv = char_vectorizer.transform(X_validate.values.astype('U'))
# # X_train = tfidf.fit(X_train)

train = hstack((X_train1_wv, X_train1_cv)).tocsr()
# X_validate1 = hstack((X_validate1_wv, X_validate1_cv)).tocsr()


X_test1_wv = word_vectorizer.transform(test_df.question_text)
X_test1_cv = char_vectorizer.transform(test_df.question_text)

test_df_matrix = hstack((X_test1_wv, X_test1_cv)).tocsr()


In [5]:
# tfidf = TfidfVectorizer(ngram_range=(1,3))
# # Numericalize the train dataset
# train = tfidf.fit_transform(train_df.question_text.values.astype('U'))

# # for test_df
# test_df_matrix = tfidf.transform(test_df.question_text.values.astype('U'))
# # print(test_df_matrix.shape)
# test_df_matrix

In [6]:
train

<1000000x232101 sparse matrix of type '<class 'numpy.float64'>'
	with 127658239 stored elements in Compressed Sparse Row format>

## Doing train test split of the train_df.csv

In [7]:

X_train, X_test, y_train, y_test = train_test_split(
    train, train_df.target.values, test_size=0.20)

X_train.shape
train = X_train
test = X_test
y_test
# train=train
# y_train=train_df.target

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
train.shape

(800000, 232101)

In [9]:
from sklearn.linear_model import LogisticRegression 
model = LogisticRegression(dual = False,
    class_weight = {0: 0.9, 1: 2},max_iter=10000)
# parameters = {'class_weight':[{0:0.2, 1:0.8},{0:0.25,1:0.75},{0:0.3, 1:0.7}]}
# model=GridSearchCV(model1,parameters,verbose=4,cv=2,scoring='f1')
model.fit(train,y_train)
y_pred = model.predict(train)
print("train f1 score: ",f1_score(y_train,y_pred))
print("test f1 score: ",f1_score(y_test,model.predict(test)))
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train,y_pred) )
print(model.predict_proba(train))

train f1 score:  0.7257441128421724
test f1 score:  0.640827922077922
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    750478
           1       0.72      0.74      0.73     49522

    accuracy                           0.97    800000
   macro avg       0.85      0.86      0.85    800000
weighted avg       0.97      0.97      0.97    800000

[[0.99825259 0.00174741]
 [0.98901815 0.01098185]
 [0.99710393 0.00289607]
 ...
 [0.97688296 0.02311704]
 [0.87083843 0.12916157]
 [0.9835356  0.0164644 ]]


In [10]:
def custom_predict(X, threshold):
    probs = model.predict_proba(X) 
    return (probs[:, 1] > threshold).astype(int)
    
Y_pred = custom_predict(train,0.4)
print("train f1 score: ",f1_score(y_train,Y_pred))

train f1 score:  0.7299148547282318


In [11]:
test_y_pred = custom_predict(X=test_df_matrix,threshold=0.4)

In [12]:
pred_proba_df = pd.DataFrame(model.predict_proba(train))
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
for i in threshold_list:
    print ('\n******** For i = {} **q****'.format(i))
    Y_test_pred = [1 if j > i else 0 for j in pred_proba_df[1]]
#     print((Y_test_pred))
#     print(pred_proba_df[1])
#     print((Y_test_pred))
#     print(y_test.shape)
#     print(np.array(Y_test_pred).shape)
    
    test_accuracy = metrics.f1_score(y_train,Y_test_pred)
    print(test_accuracy)
#                                          y_train_pred.iloc[:,1].as_matrix().reshape(y_train_pred.iloc[:,1].size,1))
#     print('Our testing accuracy is {}'.format(test_accuracy))

#     print(confusion_matrix(y_train.reshape(y_train.as_matrix().size,1),
#                            y_test_pred.iloc[:,1].as_matrix().reshape(y_test_pred.iloc[:,1].as_matrix().size,1)))


******** For i = 0.05 **q****
0.4022440507993917

******** For i = 0.1 **q****
0.5256494286775469

******** For i = 0.15 **q****
0.6007580864676477

******** For i = 0.2 **q****
0.6505077574047955

******** For i = 0.25 **q****
0.6843654667480171

******** For i = 0.3 **q****
0.7087218155316879

******** For i = 0.35 **q****
0.7226828596986169

******** For i = 0.4 **q****
0.7299148547282318

******** For i = 0.45 **q****
0.7313479594064763

******** For i = 0.5 **q****
0.7257441128421724

******** For i = 0.55 **q****
0.7148052925315106

******** For i = 0.6 **q****
0.696938462213255

******** For i = 0.65 **q****
0.6727861895591406

******** For i = 0.7 **q****
0.6402885561263525

******** For i = 0.75 **q****
0.5974797479747975

******** For i = 0.8 **q****
0.5408277864519133

******** For i = 0.85 **q****
0.46741414527339786

******** For i = 0.9 **q****
0.3683614015484077

******** For i = 0.95 **q****
0.22402660581294556

******** For i = 0.99 **q****
0.04654917790686226


In [13]:
# y_test_pred = [1 if j > 0.05 else 0 for j in pred_proba_df[1]]
y_test_pred=[]
pred_proba_df = pd.DataFrame(model.predict_proba(test))
# for i in range(len(pred_proba_df)):
#     if(pred_proba_df[1][i] > 0.45):
#         y_test_pred.append(1)
#     else:
#         y_test_pred.append(0)

# test_accuracy = metrics.f1_score(y_test,y_test_pred)
# print(test_accuracy)
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    Y_test_pred = [1 if j > i else 0 for j in pred_proba_df[1]]
#     print((Y_test_pred))
#     print(pred_proba_df[1])
#     print((Y_test_pred))
#     print(y_test.shape)
#     print(np.array(Y_test_pred).shape)
    
    test_accuracy = metrics.f1_score(y_test,Y_test_pred)
    print(test_accuracy)
#                                  



# print(y_test_pred)


******** For i = 0.05 ******
0.38742873944363676

******** For i = 0.1 ******
0.4945217391304348

******** For i = 0.15 ******
0.5559731816334823

******** For i = 0.2 ******
0.593469295520027

******** For i = 0.25 ******
0.6177914110429448

******** For i = 0.3 ******
0.6329612606697308

******** For i = 0.35 ******
0.6425761531766754

******** For i = 0.4 ******
0.6461493368505898

******** For i = 0.45 ******
0.6447571914630373

******** For i = 0.5 ******
0.640827922077922

******** For i = 0.55 ******
0.6316552282705997

******** For i = 0.6 ******
0.6174052894924947

******** For i = 0.65 ******
0.59791881503703

******** For i = 0.7 ******
0.572244656751699

******** For i = 0.75 ******
0.5383136786326562

******** For i = 0.8 ******
0.4935537190082645

******** For i = 0.85 ******
0.43065693430656937

******** For i = 0.9 ******
0.3473664097828164

******** For i = 0.95 ******
0.21610558260992307

******** For i = 0.99 ******
0.04531459698428988


In [14]:
# # y_test_pred = [1 if j > 0.05 else 0 for j in pred_proba_df[1]]
# y_test_pred=[]
# pred_proba_df = pd.DataFrame(model.predict_proba(test_df_matrix))
# # lund = pred_proba_df.to_numpy().reshape(2,500000)
# # print(lund.shape)
# # print(pred_proba_df)
# # print(pred_proba_df[0][3])
# for i in range(len(pred_proba_df)):
#     if(pred_proba_df[1][i] > 0.45):
#         y_test_pred.append(1)
#     else:
#         y_test_pred.append(0)

# # test_accuracy = metrics.f1_score(y_test,y_test_pred)
# # print(test_accuracy)
# # print(y_test_pred)
# # y_test_pred

In [15]:
# print(metrics.classification_report(y_train,y_pred) )

In [16]:
Test_DF_TARGET = pd.DataFrame(test_y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission69.csv",index=False)