In [1]:
# base modules
import os
import sys
import copy

# for manipulating data
import numpy as np
import pandas as pd
import math
import dill
import gc

# for Machine Learning
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics


# for visualization
from IPython.display import display
from matplotlib import pyplot as plt

In [2]:
# for package auto reload
%load_ext autoreload
%autoreload 2

# for better rendering of plots in jupyter notebook
%matplotlib inline

In [3]:
# path to repo
path_to_repo = os.path.dirname(os.getcwd())
path_to_repo

'/Users/jenny/Desktop/Course/machine_learning/MLP_Project'

In [4]:
sys.path.insert(0, os.path.join(path_to_repo, 'src'))

# custom module
from mlpcourse.utils import *

ModuleNotFoundError: No module named 'mlpcourse'

### Select Data 

In [None]:
# read the data in chunk

chunk_size = 100000
train_chunks = pd.read_csv('train.csv', chunksize=chunk_size)

for chunk in train_chunks:
    print(chunk.info(memory_usage='deep'))
    print(chunk.head())
    break

In [None]:
# random select
sample_ratio = 0.1  # select 10% each time

# sampling
sampled_data = pd.DataFrame()
for chunk in pd.read_csv('train.csv', chunksize=chunk_size):
    sampled_chunk = chunk.sample(frac=sample_ratio, random_state=42)
    sampled_data = pd.concat([sampled_data, sampled_chunk], ignore_index=True)

print(f"the size of sampled data: {sampled_data.shape}")


In [None]:
# optimization

def optimize_memory(df):
    for col in df.columns:
        if df[col].dtype == 'int64' or df[col].dtype == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='unsigned')
        elif df[col].dtype == 'object':
            df[col] = df[col].astype('category')
    return df

sampled_data = optimize_memory(sampled_data)

print(sampled_data.info(memory_usage='deep'))


In [None]:
# save as a csv file
sampled_data.to_csv('train_sample.csv', index=False, compression='gzip')

### Data Processing

In [5]:
df = pd.read_csv('train_sample.csv', compression='gzip')
df

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,3926754196077507072,1,14102100,1005,0,f84e52b6,d7e2f29b,28905ebd,ecad2386,7801e8d9,...,1,0,20346,300,250,2331,2,39,-1,23
1,4544181857823990784,1,14102100,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,...,1,0,17037,320,50,1934,2,39,-1,16
2,12744445434029488128,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15699,320,50,1722,0,35,100083,79
3,4063799378381284352,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15699,320,50,1722,0,35,-1,79
4,6357775859397618688,1,14102100,1005,0,5b08c53b,7687a86e,3e814130,ecad2386,7801e8d9,...,1,0,17654,300,250,1994,2,39,100083,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4042892,638841397737549056,0,14103023,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,...,1,0,17573,320,50,1991,2,39,-1,33
4042893,9979815024338341888,0,14103023,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,22261,320,50,2545,0,431,100084,221
4042894,6821580975305070592,0,14103023,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,2347f47a,...,1,2,24041,320,50,2756,3,299,100112,61
4042895,8199834664434430976,0,14103023,1005,0,85f751fd,c4e18dd6,50e219e0,ce183bbd,ae637522,...,1,0,23866,320,50,2736,0,33,100171,246


In [6]:
test_data = pd.read_csv('test.csv')

In [7]:
# get information of time 

df['hour'] = pd.to_datetime(df['hour'].astype(str), format='%y%m%d%H')  

df['day'] = df['hour'].dt.day  

df['weekday'] = df['hour'].dt.weekday
df['is_weekend'] = df['hour'].dt.isocalendar().day >= 6  

df['hour'] = df['hour'].dt.hour  
df['is_morning'] = df['hour'].between(6, 12)
df['is_afternoon'] = df['hour'].between(12, 18)
df['is_evening'] = df['hour'].between(18, 24)

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

In [8]:
# category

categorical_columns = [
    'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type'
]

label_encoders = {}

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])


In [9]:
missing_summary = df.isnull().sum()
print("missing：\n", missing_summary[missing_summary > 0])

df.fillna(-1, inplace=True)


missing：
 Series([], dtype: int64)


In [10]:
for col in ['C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])


In [11]:
df['site_app_combination'] = df['site_category'] * 1000 + df['app_category']  
df['device_interaction'] = df['device_type'] * 10 + df['device_conn_type']   


In [12]:
df['site_category_count'] = df.groupby('site_category')['click'].transform('count')  
df['app_category_mean_click'] = df.groupby('app_category')['click'].transform('mean')  


In [13]:
df['site_device_combination'] = df['site_id'] * 100 + df['device_type']
df['app_conn_combination'] = df['app_category'] * 10 + df['device_conn_type']
df['site_app_device'] = df['site_category'] * 1000 + df['app_category'] * 10 + df['device_type']


In [14]:
df['site_click_rate'] = df.groupby('site_id')['click'].transform('mean')
df['app_click_rate'] = df.groupby('app_id')['click'].transform('mean')

df['site_count'] = df.groupby('site_id')['click'].transform('count')
df['app_count'] = df.groupby('app_id')['click'].transform('count')


In [15]:
df['device_type_conn_interaction'] = df['device_type'] * df['device_conn_type']
df['C14_C17_product'] = df['C14'] * df['C17']


In [16]:
df['device_click_rate'] = df.groupby('device_id')['click'].transform('mean')
df['device_count'] = df.groupby('device_id')['click'].transform('count')


In [17]:
df['site_app_click_rate'] = df.groupby(['site_category', 'app_category'])['click'].transform('mean')


In [18]:
df['combined_feature'] = df['C14'] * 100 + df['C17']


In [19]:
df['C14_log'] = np.log1p(df['C14'])  
df['C15_C16_ratio'] = df['C15'] / (df['C16'] + 1)  


In [20]:
print(f"the number of column after processing: {df.shape[1]}")
print(df.head())

the number of column after processing: 51
                     id  click  hour    C1  banner_pos  site_id  site_domain  \
0   3926754196077507072      1     0  1005           0     3301         3586   
1   4544181857823990784      1     0  1005           1     2993         2107   
2  12744445434029488128      0     0  1005           0      422         3991   
3   4063799378381284352      0     0  1005           0      422         3991   
4   6357775859397618688      1     0  1005           0     1256         1985   

   site_category  app_id  app_domain  ...  site_count  app_count  \
0              2    4534         140  ...        2504    2583215   
1             22    4534         140  ...      264164    2583215   
2              2    4534         140  ...      648433    2583215   
3              2    4534         140  ...      648433    2583215   
4              4    4534         140  ...       90832    2583215   

   device_type_conn_interaction  C14_C17_product  device_click_rate 

### Data modeling

In [26]:
X = df.drop(columns=['click', 'id']) 
y = df['click']  


In [28]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"the size of train data: {X_train.shape}")
print(f"the size of valid data: {X_valid.shape}")

the size of train data: (3234317, 49)
the size of valid data: (808580, 49)


In [29]:
def precision_recall_curve_with_threshold(targets, probas, threshold = 0.5):
    
    def sort_pair(X, Y):
        rangee = [(x, y) for x, y in sorted(zip(X, Y), key = lambda pair: pair[0])]
        return [pair[0] for pair in rangee], [pair[1] for pair in rangee]
    
    # --- main
    targets, probas = sort_pair(targets, probas)
    precision, recall, thresholds = metrics.precision_recall_curve(targets, probas)
    
    # calculate precision-recall AUC
    auc_val = metrics.auc(recall, precision)
    print('Area Under Curve (AUC) : {:.3f}'.format(auc_val))
    # plot no skill
    if threshold is not None : 
        index = len([y for y in thresholds if y <= threshold]) # thresholds are sorted

        print('Recall with threshold = {}: {:.2f}%'.format(threshold, recall[index]*100))
        print('Precision with threshold = {} : {:.2f}%'.format(threshold, precision[index]*100))

        plt.plot([0, 1], [precision[index], precision[index]], linestyle='--', color = 'grey')
        plt.plot([recall[index], recall[index]], [0, 1], linestyle='--', color = 'grey')
        
        # plot the precision-recall curve for the model
        plt.plot(recall[:index], precision[:index], color = 'navy', marker='.')
        plt.plot(recall[index:], precision[index:], color = 'orange', marker='.')
        
        # show the plot
        plt.title('Precision-Recall curve with threshold = {}'.format(threshold))
        plt.xlabel('Recall $P(\, \widehat{1}\, | 1)$')
        plt.ylabel('Precision $P(\, 1\, | \widehat{1})$                                ', rotation = 'horizontal')
    else :
        # plot the precision-recall curve for the model
        plt.plot(recall, precision, color = 'orange', marker='.')
        
        # show the plot
        plt.title('Precision-Recall curve')
        plt.xlabel('Recall $P(\, \widehat{1}\, | 1)$')
        plt.ylabel('Precision $P(\, 1\, | \widehat{1})$                                ', rotation = 'horizontal')

    plt.show()
    return

In [None]:
classifier = RandomForestClassifier(
    n_estimators = 300, 
    class_weight = 'balanced', 
    criterion = 'gini', 
    max_depth = 10, 
    min_samples_split = 10, 
    min_samples_leaf = 5, 
    min_weight_fraction_leaf = 0.0, 
    max_features = 'log2', 
    max_leaf_nodes = None, 
    min_impurity_decrease = 0.0, 
    ccp_alpha = 0.0, 
    random_state = 42, 
    bootstrap = True, 
    oob_score = True, 
    max_samples = None,
    warm_start = False, 
    n_jobs = -1, 
    verbose = 0, 
)

%time classifier.fit(X_train, y_train)

print(classifier.score(X_train, y_train)) # classification accuracy on training set
print(classifier.score(X_valid, y_valid)) # classification accuracy on validation set
print(classifier.oob_score_)                    # classification accuracy on oob samples



In [None]:
def specificity_score(y_true, y_predict):
    '''
    Computes the proba that a label is 0 knowing that the prediction is 0
    '''
    y_predict_0  = [(pred, true) for pred, true in zip(y_predict, y_true) if true == 0]
    y_predict_00 = [(pred, true) for pred, true in y_predict_0 if pred == 0]
    specificity  = (0 if len(y_predict_0) == 0 else len(y_predict_00)/len(y_predict_0))
    return specificity

y_predict_valid = classifier.predict(X_valid)

acc = metrics.accuracy_score(y_valid, y_predict_valid)
rec = metrics.recall_score(y_valid, y_predict_valid)
prc = metrics.precision_score(y_valid, y_predict_valid)
spe = specificity_score(y_valid, y_predict_valid)
f1  = metrics.f1_score(y_valid, y_predict_valid)

print('Accuracy : {:.2f}%'.format(acc*100))
print('Recall : {:.2f}%'.format(rec*100))
print('Precision : {:.2f}%'.format(prc*100))
print('Specificity : {:.2f}%'.format(spe*100))
print('F1-score : {:.2f}%'.format(f1*100))

In [None]:
probas = classifier.predict_proba(X_valid)[:, 1]

precision_recall_curve_with_threshold(y_valid, probas, threshold = 0.5)