## Reading Preprocessed data

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the preprocessed train and test frames from the Kaggle input directory.
csv_paths = (
    "/kaggle/input/datafiles/train_df.csv",
    "/kaggle/input/datafiles/test_df.csv",
)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in csv_paths)

In [3]:
# Preview the test frame (pandas abbreviates the middle rows in the display).
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


## Vectorization of the dataset

In [4]:
from scipy.sparse import hstack

# Word-level and character-level TF-IDF features (1- to 3-grams each) are
# computed separately and then concatenated column-wise into one sparse matrix.
# max_df / max_features are left at their defaults, i.e. the vocabulary is not
# pruned.
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
)

char_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 3),
)

# Cast both text columns to unicode arrays. A missing question (NaN) becomes
# the string 'nan' instead of making the vectorizer raise, and train and test
# now go through exactly the same preprocessing (previously only the training
# text was cast).
train_text = train_df.question_text.values.astype('U')
test_text = test_df.question_text.values.astype('U')

# Fit the vocabularies on the training text only.
X_train1_wv = word_vectorizer.fit_transform(train_text)
X_train1_cv = char_vectorizer.fit_transform(train_text)
train = hstack((X_train1_wv, X_train1_cv)).tocsr()

# Re-use the fitted vocabularies for the test text so the columns line up
# with the training matrix.
X_test1_wv = word_vectorizer.transform(test_text)
X_test1_cv = char_vectorizer.transform(test_text)
test_df_matrix = hstack((X_test1_wv, X_test1_cv)).tocsr()


In [5]:
# tfidf = TfidfVectorizer(ngram_range=(1,3))
# # Numericalize the train dataset
# train = tfidf.fit_transform(train_df.question_text.values.astype('U'))

# # for test_df
# test_df_matrix = tfidf.transform(test_df.question_text.values.astype('U'))
# # print(test_df_matrix.shape)
# test_df_matrix

In [6]:
# Show the repr of the combined word + char TF-IDF training matrix.
train

<1000000x8818837 sparse matrix of type '<class 'numpy.float64'>'
	with 170183662 stored elements in Compressed Sparse Row format>

## Train/test split of train_df.csv (currently disabled: the full training set is used)

In [7]:

# No hold-out split is made here: `train` keeps every row of the feature matrix.
# (An earlier experiment, now disabled, carved off a tiny validation set with
#  train_test_split(train, train_df.target.values, test_size=0.0005).)
y_train = train_df["target"]

In [8]:
# (rows, columns) of the training feature matrix.
train.shape

(1000000, 8818837)

In [9]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression. Class 1 gets a larger weight than
# class 0 (0.77 vs 0.23), presumably to offset the class imbalance in the
# target. NOTE(review): confirm how these weights were chosen.
model = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    max_iter=50000,
    class_weight={0: 0.23, 1: 0.77},
)
model.fit(train, y_train)

# Predictions on the data the model was fitted on, and on the test matrix.
y_pred = model.predict(train)
test_y_pred = model.predict(test_df_matrix)

# These metrics are computed on the training data itself, so they are an
# optimistic estimate and not a validation score.
print("train f1 score: ", f1_score(y_train, y_pred))
print(metrics.classification_report(y_train, y_pred))

train f1 score:  0.6911455488293088
              precision    recall  f1-score   support

           0       0.98      0.97      0.98    938130
           1       0.64      0.75      0.69     61870

    accuracy                           0.96   1000000
   macro avg       0.81      0.86      0.83   1000000
weighted avg       0.96      0.96      0.96   1000000



In [10]:
# Re-print the training-set classification report (same call as at the end of the model-fitting cell).
print(metrics.classification_report(y_train,y_pred) )

              precision    recall  f1-score   support

           0       0.98      0.97      0.98    938130
           1       0.64      0.75      0.69     61870

    accuracy                           0.96   1000000
   macro avg       0.81      0.86      0.83   1000000
weighted avg       0.96      0.96      0.96   1000000



In [11]:
# Assemble the submission: the question id of every test row next to its
# predicted label.
TEST_DF_QID = pd.DataFrame(test_df, columns=['qid'])
Test_DF_TARGET = pd.DataFrame({'target': test_y_pred})

# Both frames carry the default 0..n-1 row index, so the column-wise inner
# join pairs each qid with the prediction made for the same row.
TEST_DF = pd.concat(
    [TEST_DF_QID, Test_DF_TARGET],
    axis=1,
    join='inner',
)

# NOTE(review): the file name mentions max_df=0.9, but max_df is commented out
# in the vectorizers of this notebook. Consider renaming the output file.
TEST_DF.to_csv(
    "sample_submission_tfidf_max_df=0.9Novalidation.csv",
    index=False,
)