## Importing important libraries and reading the training and testing data

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt 
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn import ensemble
import xgboost as xgb
import pickle as pickle

In [2]:
def loadData(text):
    """Load and return the object stored in ``<text>.pickle``.

    Parameters
    ----------
    text : str
        Path of the pickle file *without* the ``.pickle`` extension;
        the extension is appended here.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while unpickling, so this
    must only be used on files that come from a trusted source.
    """
    # Pickle data has to be read in binary mode. The context manager
    # guarantees the handle is closed even when unpickling raises.
    with open(text + '.pickle', 'rb') as dbfile:
        return pickle.load(dbfile)
# Pickled artefacts are addressed by base name; loadData appends ".pickle".
train, test, y_train, y_test, test_df_matrix = (
    loadData(base_name)
    for base_name in ('X_train', 'X_test', 'y_train', 'y_test', 'test_df_matrix')
)

# CSV versions of the two data sets (test_df supplies the 'qid' column
# used when the submission file is written).
train_df = pd.read_csv("preprocessed_train.csv")
test_df = pd.read_csv("preprocessed_test.csv")

In [3]:
# Dimensions of the training feature matrix as (rows, columns).
train.shape

(800000, 7817595)

In [4]:
# Distinct label values present in the training targets.
np.unique(y_train)

array([0, 1])

#### Creating class weights to compensate for class imbalance (class 0 has many more samples than class 1)

In [5]:
from sklearn.utils import class_weight

# One weight per training sample, taken from the 'balanced' weight of the
# sample's class (rarer class -> larger weight).
sample_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)
# A notebook cell only echoes its last expression, so the distinct sample
# weights have to be printed explicitly to be visible.
print(np.unique(sample_weights))

# One 'balanced' weight per class, in the order of np.unique(y_train).
class_weights = class_weight.compute_class_weight(y=y_train,classes = np.unique(y_train),class_weight='balanced')
class_weights

## Applying Multinomial Naive Bayes to our dataset

In [6]:
model = MultinomialNB()
model.fit(train, y_train)

# Predict once per split and reuse the result: the original called
# model.predict(train) twice, repeating the work on the full training matrix.
y_pred = model.predict(train)
y_pred_test = model.predict(test)

print("train f1 score:", metrics.f1_score(y_train, y_pred))
print("test f1 score:", metrics.f1_score(y_test, y_pred_test))
# Per-class precision / recall / F1 on the training split.
print(metrics.classification_report(y_train, y_pred))


## Applying Logistic Regression to our dataset

In [7]:
from sklearn.linear_model import LogisticRegression

# Hand-picked class weights: mistakes on class 1 are penalised more heavily
# than mistakes on class 0.
model = LogisticRegression(
    dual=False,
    class_weight={0: 0.9, 1: 2},
    max_iter=10000,
)
model.fit(train, y_train)

# Training-split predictions feed both the F1 score and the report below.
y_pred = model.predict(train)
print("train f1 score: ", f1_score(y_train, y_pred))
print("test f1 score: ", f1_score(y_test, model.predict(test)))

# Predictions for the matrix that is later written to the submission file.
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train, y_pred))
# print(model.predict_proba(train))

### Using GridSearchCV for hyperparameter tuning

In [None]:
# model1 = LogisticRegression(max_iter=1000)
# parameters = {'class_weight':[{0:0.1,1:0.2},{0:0.1,1:0.3},{0:0.1,1:0.5},{0:0.1, 1:1},{0:0.1,1:2},{0:0.1, 1:3},{0:0.1, 1:4}]}
# model=GridSearchCV(model1,parameters,verbose=10,cv=2,scoring='f1')
# model.fit(train,y_train)

### Finding the optimal threshold value for logistic regression

In [None]:
# Class probabilities for every training sample; column 1 is used below as
# the probability of the positive class.
pred_proba_df = pd.DataFrame(model.predict_proba(train))
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]

positive_proba = pred_proba_df[1]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    # Vectorised comparison replaces the per-element Python loop; a sample is
    # labelled 1 only when its probability is strictly above the threshold.
    Y_train_pred = (positive_proba > i).astype(int)

    # This is the F1 score on the TRAINING split (not an accuracy, and not
    # computed on the test split as the old variable names suggested).
    train_f1 = metrics.f1_score(y_train, Y_train_pred)
    print(train_f1)

In [None]:
# Class probabilities for every held-out sample; column 1 is used below as
# the probability of the positive class.
pred_proba_df = pd.DataFrame(model.predict_proba(test))

threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]

positive_proba = pred_proba_df[1]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    # Vectorised comparison replaces the per-element Python loop; a sample is
    # labelled 1 only when its probability is strictly above the threshold.
    Y_test_pred = (positive_proba > i).astype(int)
    # F1 score (not accuracy) of the thresholded predictions on the test split.
    test_f1 = metrics.f1_score(y_test, Y_test_pred)
    print(test_f1)

In [None]:
def custom_predict(X, threshold, estimator=None):
    """Label a sample 1 when its second-column probability exceeds ``threshold``.

    Parameters
    ----------
    X
        Feature matrix passed unchanged to ``predict_proba``.
    threshold : float
        Strict lower bound: a sample is labelled 1 only when
        ``predict_proba(X)[:, 1] > threshold``.
    estimator : optional
        Fitted classifier exposing ``predict_proba``. When omitted, the
        notebook-level ``model`` is looked up at call time (the original
        behaviour). Passing it explicitly avoids silently using whichever
        model a later cell has rebound to ``model``.

    Returns
    -------
    Integer array of 0/1 labels, one per row of ``X``.
    """
    clf = model if estimator is None else estimator
    probs = clf.predict_proba(X)
    return (probs[:, 1] > threshold).astype(int)
    
# Training-split labels at a 0.4 probability cut-off, and their F1 score.
Y_pred = custom_predict(X=train, threshold=0.4)
print("train f1 score: ", f1_score(y_train, Y_pred))

# Same cut-off applied to the matrix used for the submission file.
test_y_pred = custom_predict(test_df_matrix, 0.4)

### Applying XGBoost on our dataset

In [9]:
# Gradient-boosted trees: 100 estimators with a learning rate of 0.9.
model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.9)
model.fit(train, y_train)

# Training-split predictions feed both the F1 score and the report below.
y_pred = model.predict(train)
print("train f1 score: ", f1_score(y_train, y_pred))
print("test f1 score: ", f1_score(y_test, model.predict(test)))

# Predictions for the matrix that is later written to the submission file.
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train, y_pred))

In [10]:
# model

### Applying KNN to our dataset

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Brute-force neighbour search, with n_jobs=-1 for parallel execution.
model = KNeighborsClassifier(n_jobs=-1, algorithm='brute')
model.fit(train, y_train)

# Training-split predictions feed both the F1 score and the report below.
y_pred = model.predict(train)
print("train f1 score: ", f1_score(y_train, y_pred))
print("test f1 score: ", f1_score(y_test, model.predict(test)))

# Predictions for the matrix that is later written to the submission file.
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train, y_pred))

# Echo the fitted estimator as the cell output.
model

### Applying AdaBoost to our dataset (takes too long to run)

In [12]:
# AdaBoost with default settings; the 'balanced' per-sample weights are
# passed to fit to offset the class imbalance.
model = ensemble.AdaBoostClassifier()
model.fit(train, y_train, sample_weight=sample_weights)

# Training-split predictions feed both the F1 score and the report below.
y_pred = model.predict(train)
print("train f1 score: ", f1_score(y_train, y_pred))
print("test f1 score: ", f1_score(y_test, model.predict(test)))

# Predictions for the matrix that is later written to the submission file.
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train, y_pred))

In [13]:
# model

### Applying Perceptron to our dataset

In [14]:
from sklearn.linear_model import Perceptron

# class_weight='balanced' already re-weights the classes. The original also
# passed the 'balanced' sample_weights to fit(); scikit-learn multiplies
# sample_weight by class_weight, so the imbalance correction was applied
# twice. Only the constructor-level weighting is kept.
model = Perceptron(class_weight='balanced',n_jobs=-1,warm_start=True,tol=1e-10)
model.fit(train, y_train)

# Training-split predictions feed both the F1 score and the report below.
y_pred = model.predict(train)
print("train f1 score: ",f1_score(y_train,y_pred))
print("test f1 score: ",f1_score(y_test,model.predict(test)))

# Predictions for the matrix that is later written to the submission file.
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train,y_pred) )

### Applying SVM to our dataset

In [15]:
from sklearn.svm import LinearSVC

model = LinearSVC()
# Fit on the TRAINING split. The original fitted on (test, y_test), which
# leaks the held-out data into the model and makes the "test f1 score"
# below a training score.
model.fit(train, y_train)

# Training-split predictions feed both the F1 score and the report below.
y_pred = model.predict(train)
print("train f1 score: ",f1_score(y_train,y_pred))
print("test f1 score: ",f1_score(y_test,model.predict(test)))

# Predictions for the matrix that is later written to the submission file.
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train,y_pred) )

## Stacking Classifier

In [None]:
# These names were used below without ever being imported.
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

estimators = [
    # with_mean=False: StandardScaler raises on sparse input when asked to
    # centre the data. NOTE(review): assumes `train` is a scipy sparse matrix
    # (its 800000 x 7817595 shape strongly suggests so) - confirm.
    ('svc', make_pipeline(StandardScaler(with_mean=False),
                          LinearSVC(random_state=42))),
    # The comma after the 'svc' entry was missing, which made Python try to
    # call the first tuple with the second one (TypeError).
    ('lr', LogisticRegression(penalty='l2', max_iter=50000, solver='lbfgs', class_weight='balanced')),
]
# The base estimators' outputs are combined by a final logistic regression.
model = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
model.fit(train,y_train)

# Training-split predictions feed both the F1 score and the report below.
y_pred = model.predict(train)
print("train f1 score: ",f1_score(y_train,y_pred))
print("test f1 score: ",f1_score(y_test,model.predict(test)))

# Predictions for the matrix that is later written to the submission file.
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train,y_pred) )

### Creating sample_submission.csv

In [16]:
# test_y_pred holds the predictions of whichever model cell was run last.
# The inner join below would silently drop rows if the two inputs had
# different lengths, producing a truncated submission; fail loudly instead.
if len(test_y_pred) != len(test_df):
    raise ValueError(
        "prediction count ({}) does not match number of test rows ({})".format(
            len(test_y_pred), len(test_df)
        )
    )

Test_DF_TARGET = pd.DataFrame(test_y_pred,columns=['target'])
# Keep only the question-id column of the test frame.
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
# Column-wise concatenation aligned on the row index: qid | target.
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission.csv",index=False)