Importing Required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, roc_auc_score
import numpy as np

In [3]:
# Importing the data with features
dataset = pd.read_csv('./data/train_100k_random.csv')

In [14]:
dataset

Unnamed: 0,common_neighbour,res_alloc,jaccard,adar,pref_att,num_followers_s,num_followees_s,num_followers_d,num_followees_d,in_deg,out_deg,label
0,27,0.000657,0.000289,2.489652,2988992,1010,93387,32,0,32,93387,1
1,3,0.000080,0.000509,0.278029,211176,51,5860,36,0,36,5860,1
2,457,2.623641,0.010333,77.705596,68643963,2014,42851,154,1586,154,42851,1
3,169,0.012772,0.009615,16.959038,6938000,380,17343,400,0,400,17343,1
4,12,0.001054,0.007207,1.221553,90776,409,1612,56,0,56,1612,1
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0.000000,0.000000,0.000000,2,1,0,2,0,2,0,0
199996,1,0.000002,1.000000,0.077345,1,1,0,1,0,1,0,0
199997,0,0.000000,0.000000,0.000000,1,1,0,1,0,1,0,0
199998,0,0.000000,0.000000,0.000000,3,1,0,3,0,3,0,0


In [24]:
FEATURE_SIZE=11
X = dataset.iloc[:,0:FEATURE_SIZE-6].values
y = dataset.iloc[:, FEATURE_SIZE].values


In [25]:
dataset.iloc[:,0:FEATURE_SIZE-6]

Unnamed: 0,common_neighbour,res_alloc,jaccard,adar,pref_att
0,27,0.000657,0.000289,2.489652,2988992
1,3,0.000080,0.000509,0.278029,211176
2,457,2.623641,0.010333,77.705596,68643963
3,169,0.012772,0.009615,16.959038,6938000
4,12,0.001054,0.007207,1.221553,90776
...,...,...,...,...,...
199995,0,0.000000,0.000000,0.000000,2
199996,1,0.000002,1.000000,0.077345,1
199997,0,0.000000,0.000000,0.000000,1
199998,0,0.000000,0.000000,0.000000,3


In [26]:
X.shape

(200000, 5)

Splitting the data into train and test (Hold out set) and Training with Cross validation

In [27]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


clf = AdaBoostClassifier()

param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': np.logspace(-3, 0, 100)
}

auc_scorer = make_scorer(roc_auc_score, greater_is_better=True)

random_search = RandomizedSearchCV(estimator=clf, param_distributions=param_dist, n_iter=10, cv=5, scoring=auc_scorer)
random_search.fit(x_train_scaled, y_train)

print("Best Hyperparameters: ", random_search.best_params_)

cross_val_auc = cross_val_score(random_search.best_estimator_, x_train_scaled, y_train, cv=5, scoring=auc_scorer)
print("Cross-Validation AUC Scores: ", cross_val_auc)
print("Mean Cross-Validation AUC: ", cross_val_auc.mean())

y_pred = random_search.best_estimator_.predict(x_test_scaled)

test_auc = roc_auc_score(y_test, y_pred)

print("AUC on the test set: ", test_auc)


Best Hyperparameters:  {'n_estimators': 100, 'learning_rate': 0.24770763559917114}
Cross-Validation AUC Scores:  [0.98657424 0.98897588 0.98729243 0.98201727 0.98429702]
Mean Cross-Validation AUC:  0.9858313658963468
AUC on the test set:  0.9828090984551655


In [29]:
testdata = pd.read_csv('./data/test_100k_random.csv')
x_testing = testdata.iloc[:,0:FEATURE_SIZE-6].values
x_testing = scaler.transform(x_testing)
y_pred=random_search.best_estimator_.predict_proba(x_testing)[:,1]

In [65]:
y_pred

array([0.98150906, 0.88546123, 0.88546123, ..., 0.98150906, 0.88546123,
       0.88546123])

In [52]:
import csv
with open("./data/jupiter_adaboost_strategic.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Id","Predictions"])
    test_id=1
    for prediction in y_pred:
        writer.writerow([test_id,prediction])
        test_id+=1