Importing the required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures


Loading the data with generated features

In [3]:

# Importing the data with features
# For loading Strategic sampling  ---> /train_100k_str.csv
# For loading Random    sampling  ---> /train_100k_random.csv

dataset = pd.read_csv('./data/train_100k_random.csv')

In [4]:
dataset

Unnamed: 0,common_neighbour,res_alloc,jaccard,adar,pref_att,num_followers_s,num_followees_s,num_followers_d,num_followees_d,in_deg,out_deg,label
0,27,0.000657,0.000289,2.489652,2988992,1010,93387,32,0,32,93387,1
1,3,0.000080,0.000509,0.278029,211176,51,5860,36,0,36,5860,1
2,457,2.623641,0.010333,77.705596,68643963,2014,42851,154,1586,154,42851,1
3,169,0.012772,0.009615,16.959038,6938000,380,17343,400,0,400,17343,1
4,12,0.001054,0.007207,1.221553,90776,409,1612,56,0,56,1612,1
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0.000000,0.000000,0.000000,2,1,0,2,0,2,0,0
199996,1,0.000002,1.000000,0.077345,1,1,0,1,0,1,0,0
199997,0,0.000000,0.000000,0.000000,1,1,0,1,0,1,0,0
199998,0,0.000000,0.000000,0.000000,3,1,0,3,0,3,0,0


Experimenting by taking various no of samples 

In [72]:
num_samples = 100000 # no of positive samples

In [73]:
first_half = dataset.iloc[:100000, :]
second_half = dataset.iloc[100000:, :]

sampled_first_half = first_half.sample(n=num_samples, random_state=0)
sampled_second_half = second_half.sample(n=num_samples, random_state=0)

new_df = pd.concat([sampled_first_half, sampled_second_half])
new_df.reset_index(drop=True, inplace=True)

In [74]:
new_df

Unnamed: 0,common_neighbour,res_alloc,jaccard,adar,pref_att,num_followers_s,num_followees_s,num_followers_d,num_followees_d,in_deg,out_deg,label
0,3,0.000499,0.000572,0.342926,68016,183,5221,13,0,13,5221,1
1,8,0.009393,0.002973,1.163644,178908,85,2600,68,0,68,2600,1
2,41,0.044018,0.002408,5.466024,969684,930,16971,57,0,57,16971,1
3,63,0.002374,0.012761,5.975142,960000,194,4797,200,0,200,4797,1
4,0,0.000000,0.000000,0.000000,762657,4840,761793,1,0,1,761793,1
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0.000000,0.000000,0.000000,1,1,0,1,0,1,0,0
199996,0,0.000000,0.000000,0.000000,228,12,0,19,0,19,0,0
199997,0,0.000000,0.000000,0.000000,4,4,0,1,0,1,0,0
199998,0,0.000000,0.000000,0.000000,1,1,0,1,0,1,0,0


In [75]:
FEATURE_SIZE=11

X = new_df.iloc[:,0:FEATURE_SIZE].values
y = new_df.iloc[:, FEATURE_SIZE].values

In [76]:
new_df.iloc[:,0:FEATURE_SIZE]

Unnamed: 0,common_neighbour,res_alloc,jaccard,adar,pref_att,num_followers_s,num_followees_s,num_followers_d,num_followees_d,in_deg,out_deg
0,3,0.000499,0.000572,0.342926,68016,183,5221,13,0,13,5221
1,8,0.009393,0.002973,1.163644,178908,85,2600,68,0,68,2600
2,41,0.044018,0.002408,5.466024,969684,930,16971,57,0,57,16971
3,63,0.002374,0.012761,5.975142,960000,194,4797,200,0,200,4797
4,0,0.000000,0.000000,0.000000,762657,4840,761793,1,0,1,761793
...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0.000000,0.000000,0.000000,1,1,0,1,0,1,0
199996,0,0.000000,0.000000,0.000000,228,12,0,19,0,19,0
199997,0,0.000000,0.000000,0.000000,4,4,0,1,0,1,0
199998,0,0.000000,0.000000,0.000000,1,1,0,1,0,1,0


In [77]:
X.shape

(200000, 11)

In [78]:
y

array([1, 1, 1, ..., 0, 0, 0])

Training with Logistic regression with Cross Validation

In [79]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
lr = linear_model.LogisticRegression(C=1, penalty='l2',solver='liblinear')

k = 5 
auc_scores = cross_val_score(lr, x_train, y_train, cv=k, scoring='roc_auc')
print("AUC Scores for Each Fold:", auc_scores)

mean_auc = auc_scores.mean()
std_auc = auc_scores.std()
print(f"Mean AUC: {mean_auc:.4f}  Std AUC: {std_auc:.4f}")

lr.fit(x_train, y_train)


AUC Scores for Each Fold: [0.99666124 0.9966165  0.99666886 0.9965945  0.99617172]
Mean AUC: 0.9965  Std AUC: 0.0002


Calculating model performance on Known test data (Hold out set)

In [80]:
x_test = sc.fit_transform(x_test)
pre=lr.predict_proba(x_test)
# print(pre)
y_pre=[p[1] for p in pre]
acc=lr.score(x_test,y_test)
print("Accuracy - ",acc)
auc=roc_auc_score(y_test,y_pre)
print("AUC      - ",auc)


Accuracy -  0.953825
AUC      -  0.9967571673875


In [177]:
dataset.iloc[:, :4]

Unnamed: 0,common_neighbour,res_alloc,jaccard,adar
0,7,0.003140,0.024735,0.854683
1,91,0.017468,0.021677,10.474274
2,6,0.000761,0.001779,0.618650
3,0,0.000000,0.000000,0.000000
4,11,0.005973,0.046218,1.398022
...,...,...,...,...
199995,0,0.000000,0.000000,0.000000
199996,0,0.000000,0.000000,0.000000
199997,0,0.000000,0.000000,0.000000
199998,1,0.000001,0.066667,0.073830


Predictions for Test data which is to be submitted on Kaggle

In [184]:
testdata = pd.read_csv('./data/test_100k_random.csv')
x_testing = testdata.iloc[:,0:FEATURE_SIZE-7].values
x_testing = sc.transform(x_testing)
predictions=lr.predict_proba(x_testing)[:,1]

In [185]:
predictions

array([1.        , 0.76025351, 0.79264506, ..., 0.99804125, 0.74172875,
       0.76063547])

In [186]:
import csv
with open("./data/jupiter_logistic_strategic.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Id","Predictions"])
    test_id=1
    for prediction in predictions:
        writer.writerow([test_id,prediction])
        test_id+=1

Non linear Feature Generation 

In [81]:
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(dataset.iloc[:, :4])
n_original_features = dataset.iloc[:, :4].shape[1] 
poly_feature_names = [f'poly_{i}' for i in range(1, poly_features.shape[1] + 1)]
df_poly = pd.DataFrame(poly_features, columns=poly_feature_names)

In [82]:
df_poly

Unnamed: 0,poly_1,poly_2,poly_3,poly_4,poly_5,poly_6,poly_7,poly_8,poly_9,poly_10,poly_11,poly_12,poly_13,poly_14
0,27.0,0.000657,0.000289,2.489652,729.0,0.017749,0.007804,67.220615,4.321164e-07,1.900055e-07,1.636587e-03,8.354712e-08,0.000720,6.198369
1,3.0,0.000080,0.000509,0.278029,9.0,0.000241,0.001526,0.834086,6.465072e-09,4.089117e-08,2.235509e-05,2.586341e-07,0.000141,0.077300
2,457.0,2.623641,0.010333,77.705596,208849.0,1199.003892,4.722206,35511.457482,6.883492e+00,2.711022e-02,2.038716e+02,1.067720e-04,0.802936,6038.159687
3,169.0,0.012772,0.009615,16.959038,28561.0,2.158514,1.625000,2866.077381,1.631310e-04,1.228103e-04,2.166055e-01,9.245562e-05,0.163068,287.608962
4,12.0,0.001054,0.007207,1.221553,144.0,0.012648,0.086486,14.658641,1.110932e-06,7.596449e-06,1.287526e-03,5.194384e-05,0.008804,1.492193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000
199996,1.0,0.000002,1.000000,0.077345,1.0,0.000002,1.000000,0.077345,5.887794e-12,2.426478e-06,1.876761e-07,1.000000e+00,0.077345,0.005982
199997,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000
199998,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000


In [83]:
FEATURE_SIZE=14

X = df_poly.iloc[:,0:FEATURE_SIZE].values
y = dataset.iloc[:, -1].values

In [84]:
X.shape

(200000, 14)

In [85]:
y

array([1, 1, 1, ..., 0, 0, 0])

Training Logistic regression on non linear features

In [89]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
lr = linear_model.LogisticRegression(C=1e8, penalty='l2',solver='liblinear')

k = 5 
auc_scores = cross_val_score(lr, x_train, y_train, cv=k, scoring='roc_auc')
print("AUC Scores for Each Fold:", auc_scores)

mean_auc = auc_scores.mean()
std_auc = auc_scores.std()
print(f"Mean AUC: {mean_auc:.4f}  Std AUC: {std_auc:.4f}")

lr.fit(x_train, y_train)




AUC Scores for Each Fold: [0.90162792 0.89989488 0.90087015 0.89804764 0.90308137]
Mean AUC: 0.9007  Std AUC: 0.0017


Calculating model performance on Known test data (Hold out set)

In [90]:
x_test = sc.fit_transform(x_test)

pre=lr.predict_proba(x_test)
y_pre=[p[1] for p in pre]
acc=lr.score(x_test,y_test)
print("Accuracy - ",acc)
auc=roc_auc_score(y_test,y_pre)
print("AUC      - ",auc)


Accuracy -  0.5153
AUC      -  0.8958876536841268
