In [1]:
# To execute the code inside the cell, select that cell and press Ctrl + Enter (or click on the Run button above).
# To add a new cell under the existing cell, select the existing cell and press B (or click on the + button above).
# To save the notebook, press Ctrl + S (or click on the Save button above).
# You can find more information under Help menu.

import json
import pandas as pd

# Read the raw training records from disk.
# The encoding is passed explicitly so the load does not depend on the
# platform's default locale encoding (JSON files are conventionally UTF-8).
with open('training_data.json', 'r', encoding='utf-8') as ifile:
    data = json.load(ifile)


In [2]:
# Build the working frame from the loaded records. The `columns` argument
# fixes which fields are kept and the order in which they appear.
# NOTE(review): presumably `data` is a list of dict records -- confirm.
raw_df = pd.DataFrame(
    data,
    columns=[
        'h_cats',
        'h_dogs',
        'h_stocks',
        'sub_reddit',
        'click',
    ],
)
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   h_cats      5000 non-null   float64
 1   h_dogs      5000 non-null   float64
 2   h_stocks    5000 non-null   float64
 3   sub_reddit  5000 non-null   object 
 4   click       5000 non-null   int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 195.4+ KB


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for raw_df.
raw_df.describe()

Unnamed: 0,h_cats,h_dogs,h_stocks,click
count,5000.0,5000.0,5000.0,5000.0
mean,1.852538,1.857214,3.693698,0.4056
std,1.168584,1.146824,2.291457,0.491057
min,0.0,0.001,0.0,0.0
25%,0.881,0.888,1.78075,0.0
50%,1.766,1.7795,3.555,0.0
75%,2.72825,2.68425,5.345,1.0
max,4.913,4.967,9.761,1.0


In [4]:
# Cardinality and most frequent value of the categorical `sub_reddit` column.
raw_df.loc[:, 'sub_reddit'].describe()

count     5000
unique       3
top       dogs
freq      1686
Name: sub_reddit, dtype: object

In [5]:
# Class balance of the binary target column `click`.
raw_df.loc[:, 'click'].value_counts()

click
0    2972
1    2028
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical `sub_reddit` column.
# handle_unknown='ignore' lets the fitted encoder transform categories it has
# not seen during fit instead of raising.
oneHotEncoder = OneHotEncoder(handle_unknown='ignore')
oneHotEncoder.fit(raw_df[['sub_reddit']])

# The encoder returns a sparse matrix; densify it so it can be assigned
# as ordinary DataFrame columns.
sub_reddit_numeric = oneHotEncoder.transform(raw_df[['sub_reddit']]).toarray()

# Attach one indicator column per category (named `sub_reddit_<category>`).
# Note: this adds the columns to raw_df in place.
raw_df[oneHotEncoder.get_feature_names_out(['sub_reddit'])] = sub_reddit_numeric
raw_df.describe()


Unnamed: 0,h_cats,h_dogs,h_stocks,click,sub_reddit_cats,sub_reddit_dogs,sub_reddit_stocks
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,1.852538,1.857214,3.693698,0.4056,0.3332,0.3372,0.3296
std,1.168584,1.146824,2.291457,0.491057,0.471405,0.472801,0.470115
min,0.0,0.001,0.0,0.0,0.0,0.0,0.0
25%,0.881,0.888,1.78075,0.0,0.0,0.0,0.0
50%,1.766,1.7795,3.555,0.0,0.0,0.0,0.0
75%,2.72825,2.68425,5.345,1.0,1.0,1.0,1.0
max,4.913,4.967,9.761,1.0,1.0,1.0,1.0


In [7]:
# Drop unwanted features
# The original string column is redundant once its one-hot indicator
# columns have been added, so it is excluded from the modelling frame.
ignore_features = ['sub_reddit']

train_data = raw_df.drop(columns=ignore_features)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   h_cats             5000 non-null   float64
 1   h_dogs             5000 non-null   float64
 2   h_stocks           5000 non-null   float64
 3   click              5000 non-null   int64  
 4   sub_reddit_cats    5000 non-null   float64
 5   sub_reddit_dogs    5000 non-null   float64
 6   sub_reddit_stocks  5000 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 273.6 KB


In [8]:
from sklearn.model_selection import train_test_split
# Split configuration: fixed seed for reproducibility, 20% held out for testing.
RANDOM_STATE = 7
TEST_SIZE = 0.2

train, test = train_test_split(
    train_data,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

train.shape, test.shape

((4000, 7), (1000, 7))

In [9]:
# Separate the feature matrix from the `click` target for both partitions.
X_train, Y_train = train.drop(columns=['click']), train['click']
X_test, Y_test = test.drop(columns=['click']), test['click']

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((4000, 6), (4000,), (1000, 6), (1000,))

In [35]:
# `auc` is required by the PR-AUC helpers below; it was previously missing
# from this import, which raised NameError on a fresh kernel.
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)


def pr_auc_score(y_true, y_proba):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : array-like
        Scores for the positive class (callers pass the positive-class
        column of ``predict_proba``).

    Returns
    -------
    float
        Trapezoidal area under the curve with recall on the x-axis and
        precision on the y-axis.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    return auc(recall, precision)


def metrics(Y_test, Y_pred, Y_pred_proba):
    """Print a classification report for one set of predictions.

    Prints the confusion matrix, accuracy, precision, recall and PR-AUC.
    Nothing is returned.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth binary labels.
    Y_pred : array-like
        Hard class predictions, used for the threshold-dependent metrics.
    Y_pred_proba : array-like
        Positive-class scores, used only for the PR-AUC.
    """
    print(f"Confusion Matrix: \n{confusion_matrix(Y_test, Y_pred)}")
    print(f"Accuracy: {accuracy_score(Y_test, Y_pred)}")
    print(f"Precision: {precision_score(Y_test, Y_pred)}")
    print(f"Recall: {recall_score(Y_test, Y_pred)}")

    # Reuse the shared helper so the report and the CV scorer agree by construction.
    print(f"PR-AUC Score: {pr_auc_score(Y_test, Y_pred_proba)}")

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression on the hold-out split.
model = LogisticRegression(max_iter=200, random_state=RANDOM_STATE)
model.fit(X_train, Y_train)

# Hard labels feed the threshold metrics; the second predict_proba column
# (positive class) feeds the PR-AUC.
Y_pred = model.predict(X_test)
Y_pred_proba = model.predict_proba(X_test)[:, 1]

metrics(Y_test, Y_pred, Y_pred_proba)

Confusion Matrix: 
[[502  84]
 [202 212]]
Accuracy: 0.714
Precision: 0.7162162162162162
Recall: 0.5120772946859904
PR-AUC Score: 0.6374608739445999


In [24]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the same hold-out split; depth and leaf size are capped
# to limit overfitting.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=10,
    random_state=RANDOM_STATE,
).fit(X_train, Y_train)  # fit() returns the fitted estimator itself

# Hard labels feed the threshold metrics; the second predict_proba column
# (positive class) feeds the PR-AUC.
Y_pred = model.predict(X_test)
Y_pred_proba = model.predict_proba(X_test)[:, 1]

metrics(Y_test, Y_pred, Y_pred_proba)

Confusion Matrix: 
[[553  33]
 [ 23 391]]
Accuracy: 0.944
Precision: 0.9221698113207547
Recall: 0.9444444444444444
PR-AUC Score: 0.9765491489771837


In [31]:
from sklearn.model_selection import GroupKFold

# Cross-validate on the full dataset (features X, target Y).
X = train_data.drop(columns=['click'])
Y = train_data['click']

# Every row gets its own group id, so GroupKFold simply partitions rows
# into 5 folds with no row shared between train and test.
groups = list(range(len(X)))
gkf = GroupKFold(n_splits=5)

for train_idx, test_idx in gkf.split(X, Y, groups):
    # Fold-local names: do NOT reuse X_train / X_test / Y_train / Y_test here,
    # otherwise the hold-out split created earlier is silently overwritten
    # and re-running an earlier model cell would evaluate on the last fold.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_fold_train, Y_fold_test = Y.iloc[train_idx], Y.iloc[test_idx]

    dtc = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=10,
        min_samples_leaf=10,
        random_state=RANDOM_STATE,
    )

    dtc.fit(X_fold_train, Y_fold_train)
    y_proba = dtc.predict_proba(X_fold_test)
    print(y_proba[:, 1].shape)
    # Score on the positive-class probability column.
    pr_auc = pr_auc_score(Y_fold_test, y_proba[:, 1])

    print(f"PR-AUC: {pr_auc}")


(1000,)
PR-AUC: 0.9662139543309475
(1000,)
PR-AUC: 0.9723389922789749
(1000,)
PR-AUC: 0.9654703131144331
(1000,)
PR-AUC: 0.9666539833306538
(1000,)
PR-AUC: 0.9770027278660958


In [39]:
import inspect

from sklearn.model_selection import GridSearchCV

# Build a scorer that feeds predict_proba output to pr_auc_score.
# scikit-learn 1.4 added `response_method` to make_scorer and deprecated
# `needs_proba`; once `needs_proba` is removed the keyword is forwarded to the
# metric function instead and scoring fails. Choose the spelling that the
# installed version actually supports.
if 'response_method' in inspect.signature(make_scorer).parameters:
    pr_auc_scorer = make_scorer(pr_auc_score, response_method='predict_proba')
else:
    pr_auc_scorer = make_scorer(pr_auc_score, needs_proba=True)

param_grid = {
    # DecisionTree Hyperparameters
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20],

    # GradientBoostedTrees Hyperparameters (not used with the estimator below)
    # 'n_estimators': [100, 200],
    # 'criterion': ["friedman_mse"],
    # "learning_rate": [0.01, 0.05],
    # 'max_depth': [3, 5],

    # Common Hyperparameters
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    # NOTE(review): this seed differs from RANDOM_STATE used by the other
    # cells -- confirm whether that is intentional.
    'random_state': [42]
}


dtc = DecisionTreeClassifier()

grid_search = GridSearchCV(
    estimator=dtc,
    param_grid=param_grid,
    cv=gkf,  # Reuse the 5-split GroupKFold defined for the manual CV loop
    scoring=pr_auc_scorer,  # PR AUC
    n_jobs=1  # 1 = serial execution; use -1 to run on all available cores
)


# `groups` is forwarded to gkf.split for every candidate.
grid_search.fit(X, Y, groups=groups)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best PR-AUC Score:", grid_search.best_score_)


Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 42}
Best PR-AUC Score: 0.9574421676984164


In [40]:
# Pair each feature name with its importance from the best grid-search
# estimator, ordered from most to least important.
features = sorted(
    zip(X.columns, grid_search.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\n --- Top Features ------ ")
for feature_name, importance in features:
    print(f"{feature_name}:\t{round(importance, 4)}")


 --- Top Features ------ 
h_stocks:	0.2884
sub_reddit_stocks:	0.2682
h_dogs:	0.2267
h_cats:	0.216
sub_reddit_dogs:	0.0004
sub_reddit_cats:	0.0003
