# Import Library

In [1]:
import os
import math
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import xgboost as xgb
import joblib

# Functions

In [2]:
def get_resource_utilize():
    cpu_count = os.cpu_count()
    if cpu_count > 3:
        resource_util = cpu_count - 2
    else:
        resource_util = 1

    return resource_util

In [3]:
def process_training(df):
    df['feat_avg_contribution'] = df['feat_avg_contribution'].fillna(0)

    for col in ['feat_is_blocked_recently', 'feat_is_unblocked_recently', 'feat_is_rename_user_recently']:
        df[col] = df[col].fillna(0)
        df[col] = df[col].astype(int)

    df['feat_count_comment_bad_user'] = df['feat_count_comment_bad_user'].fillna(0)
    df['feat_count_comment_bad_user'] = df['feat_count_comment_bad_user'].astype(int)

    if 'log_timestamp' in df.columns:
        df['log_timestamp'] = pd.to_datetime(df['log_timestamp'])

In [4]:
def accuracy_report(y_true, y_pred):
    str_confusion_matrix = str(confusion_matrix(y_true, y_pred))\
                                .replace('[[', ' ' * 5 + '[[')\
                                .replace('\n ', '\n' + ' ' * 6)
    
    print("Confusion Matrix : \n{}".format(str_confusion_matrix))
    print("Accuracy Score : {0:.2f}".format(accuracy_score(y_true, y_pred)))
    print("Precision Score : {0:.2f}".format(precision_score(y_true, y_pred)))
    print("Recall Score : {0:.2f}".format(recall_score(y_true, y_pred)))
    print("F1 Score : {0:.2f}".format(f1_score(y_true, y_pred)))

In [5]:
def grid_search_cv(est, grid_params, X_train, y_train, 
                   X_valid, y_valid, scoring, n_jobs=None, 
                   **kwargs):
    X = pd.concat([X_train, X_valid])
    y = np.concatenate([y_train, y_valid])

    n_row_train = X_train.shape[0]
    n_row_valid = X_valid.shape[0]
    list_indices = [-1] * n_row_train + [0] * n_row_valid

    ps = PredefinedSplit(test_fold=list_indices)

    search_params = GridSearchCV(est, param_grid=grid_params, scoring=scoring, refit=False,
                                 cv=ps, n_jobs=n_jobs, return_train_score=True)

    search_params.fit(X, y, **kwargs)

    df_cv_results = pd.DataFrame(search_params.cv_results_)
    
    return df_cv_results.sort_values(by="rank_test_score")

# Experiment Set-Up

- <u>Algorithm</u> : Tree-based model (as it is best-fit with the distribution of data)
  - Decision Tree (baseline model)
  - Random Forest
  - XGBoost
- <u>Training</u>
  - Baseline Performance (Decision Tree model)
  - Model tuning & selecion (Validation Set)
  - Final Evalution (Testing Set)
  - Class Weight (imbalanced learning) + Sample Weight (focus on most recent data)
- <u>Evaluation</u>
  - Precision -> Since we are going to use the model for an automate suspension, we need to prioritize the Precision value to confidently block bad users before causing any trouble to the content on Website

In [6]:
path = Path(os.getcwd())

project_dir = path.parent

process_data_dir = project_dir / 'data' / 'preprocess'
model_dump_dir = project_dir / 'model'

In [7]:
id_cols = ['log_id', 'user_id']
target_col = 'is_block_next_period'

# Import Data

In [8]:
# import preprocessed data
df_train = pd.read_csv(process_data_dir / "train_data.csv")
df_test = pd.read_csv(process_data_dir / "test_data.csv")

df_train.set_index(id_cols, inplace=True)
df_test.set_index(id_cols, inplace=True)

# Initial Data Prep

In [11]:
# initial pre-process
process_training(df_train)

In [12]:
df_train.sort_values(by='log_timestamp', inplace=True)

# train-dev test split
filter_valid = df_train['log_timestamp'].ge('2024-02-01')

df_valid = df_train.loc[filter_valid, :].copy()
df_train = df_train.loc[~filter_valid, :].copy()

In [13]:
print("Training shape : {}".format(df_train.shape))
print("Validation shape : {}".format(df_valid.shape))

Training shape : (798323, 15)
Validation shape : (38324, 15)


# Model Training

In [14]:
random_state = 99

In [15]:
class_vals = np.unique(df_train[target_col])
class_weights = compute_class_weight(class_weight="balanced", classes=class_vals, y=df_train[target_col])
dict_class_weight = dict(zip(class_vals, class_weights))

In [16]:
print("Initial class weight: {}".format(dict_class_weight))

Initial class weight: {np.int64(0): np.float64(0.5270050104632203), np.int64(1): np.float64(9.757541312212771)}


In [17]:
X_train = df_train.filter(regex='^feat', axis=1)
y_train = df_train[target_col]

X_valid = df_valid.filter(regex='^feat', axis=1)
y_valid = df_valid[target_col]

## Version 0: DT Classifier (baseline)

<i>--> Current baseline "Precision" score :</i> <b>0.586</b>

In [18]:
model_v0 = DecisionTreeClassifier(max_depth=7, class_weight=dict_class_weight, random_state=random_state)
model_v0.fit(X_train, y_train)

In [19]:
y_train_pred = model_v0.predict(X_train)
y_valid_pred = model_v0.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Validation set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.381
Precision score on Training set : 0.586 **


In [20]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[36326   822]
      [   11  1165]]
Accuracy Score : 0.98
Precision Score : 0.59
Recall Score : 0.99
F1 Score : 0.74


## Version 1: DT Classifier

<i>--> best "Precision" score :</i> <b>0.64</b>

<u>Tuning</u>:
 - class weight ==> {0:1, 1:7}
 - sample weight ==> more weights on most recent information
 - grid search : learning params
 - grid search : overfit prevention (if any)

<u>Parameter Space</u>:
 - criterion = ["gini", "entropy", "log_loss"] => either "entropy" or "log_loss"
 - max_depth = [7, 9, 11, 13]
 - min_samples_leaf = [3000, 6000, 9000] => min_samples_leaf = 6000
 - min_weight_fraction_leaf = [0, 0.1, 0.2, 0.5]
 - max_features = ['sqrt', 'log2', None]

In [21]:
n_jobs = get_resource_utilize()

print("CPU availables: {}".format(n_jobs))

CPU availables: 14


In [22]:
# new class weight
class_weight = {0: 1, 1: 7}

# compute sample weight
last_timestamp = df_train['log_timestamp'].min()
days_from_first_txn = df_train['log_timestamp'] - last_timestamp
mth_from_first_txn = days_from_first_txn.dt.days / 30

sample_weights = list(map(lambda x: max(1, math.ceil(x)), mth_from_first_txn.tolist()))

In [23]:
# selected parameter:
# - min_samples_leaf: 6000
# - criterion: "entropy"

param_grid = {
    # "criterion": ["gini", "entropy", "log_loss"],
    # "max_depth": [7, 9, 11, 13],
    # "min_samples_leaf": [3000, 6000, 9000],
    "min_weight_fraction_leaf": [0, 0.1, 0.2, 0.5],
    "max_features": ['sqrt', 'log2', None]
}
initial_model = DecisionTreeClassifier(
    class_weight=class_weight, 
    random_state=random_state, 
    min_samples_leaf=6000, 
    criterion="entropy"
)

df_cv_results = grid_search_cv(
    initial_model, 
    param_grid, 
    X_train=X_train, y_train=y_train, 
    X_valid=X_valid, y_valid=y_valid, 
    scoring="precision", 
    sample_weight=sample_weights,
    n_jobs=n_jobs
)

In [24]:
df_cv_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_weight_fraction_leaf,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
8,4.732216,0.0,0.015995,0.0,,0.0,"{'max_features': None, 'min_weight_fraction_le...",0.637466,0.637466,0.0,1,0.382138,0.382138,0.0
9,3.921941,0.0,0.013128,0.0,,0.1,"{'max_features': None, 'min_weight_fraction_le...",0.580119,0.580119,0.0,2,0.323377,0.323377,0.0
10,3.623502,0.0,0.01096,0.0,,0.2,"{'max_features': None, 'min_weight_fraction_le...",0.580119,0.580119,0.0,2,0.323377,0.323377,0.0
0,3.284936,0.0,0.012078,0.0,sqrt,0.0,"{'max_features': 'sqrt', 'min_weight_fraction_...",0.537125,0.537125,0.0,4,0.372899,0.372899,0.0
4,3.27045,0.0,0.01265,0.0,log2,0.0,"{'max_features': 'log2', 'min_weight_fraction_...",0.537125,0.537125,0.0,4,0.372899,0.372899,0.0


In [184]:
# optional : save grid search results
save_dir = model_dump_dir / "model_v1"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")
df_cv_results.to_csv(save_dir / "grid_search_{}.csv".format(time_prefix), index=False)

Model Performance Evaluation

In [25]:
model_v1 = DecisionTreeClassifier(class_weight=class_weight, random_state=random_state, min_samples_leaf=6000, criterion="entropy")
model_v1.fit(X_train, y_train, sample_weight=sample_weights)

In [26]:
y_train_pred = model_v1.predict(X_train)
y_valid_pred = model_v1.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Validation set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.382
Precision score on Training set : 0.637 **


In [28]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[36490   658]
      [   19  1157]]
Accuracy Score : 0.98
Precision Score : 0.64
Recall Score : 0.98
F1 Score : 0.77


## Version 2: Random Forest Classifier

<i>--> best "Precision" score :</i> <b>0.654</b>

<u>Tuning</u>:
 - class weight ==> {0:1, 1:7}
 - sample weight ==> more weights on most recent information
 - grid search : learning params
 - grid search : overfit prevention (if any)

<u>Parameter Space</u>:
 - n_estimators = [50, 100, 200, 300] => 100
 - criterion = ["gini", "entropy", "log_loss"] => either "entropy" or "log_loss"
 - <s>min_samples_split = [6000, 9000, 12000]</s> (not affected)
 - min_samples_leaf = [3000, 6000, 9000] => 10000
 - <s>min_impurity_decrease = [0, 0.1, 0.2] => .1</s> (worsen performance)
 - max_features = ['sqrt', 'log2', None] => 'sqrt'
 - <s>max_samples = [0.6, 0.8, 1]</s> (worsen performance)

In [45]:
# selected parameter:
# - criterion: "entropy"
# - n_estimators: 100
# - min_samples_split: 10000
# - max_features: "sqrt"

param_grid = {
    # "criterion": ["entropy", "log_loss"],
    # "n_estimators": [50, 100],
    # "min_samples_leaf": [9000, 10000, 12000],
    # "min_samples_split": [6000, 9000, 12000],
    # "min_impurity_decrease": [.1, .2, .05, 0],
    "max_features": [None, 'sqrt', 'log2'],
    "max_samples": [.6, .8, 1.]
}
initial_model = RandomForestClassifier(
    class_weight=class_weight, 
    random_state=random_state, 
    n_jobs=n_jobs,
    criterion="entropy",
    n_estimators=100,
    min_samples_leaf=10000,

)

df_cv_results = grid_search_cv(
    initial_model, 
    param_grid, 
    X_train=X_train, y_train=y_train, 
    X_valid=X_valid, y_valid=y_valid, 
    scoring="precision", 
    sample_weight=sample_weights,
)

In [46]:
df_cv_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_max_samples,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
5,11.016629,0.0,0.045257,0.0,sqrt,1.0,"{'max_features': 'sqrt', 'max_samples': 1.0}",0.653911,0.653911,0.0,1,0.361493,0.361493,0.0
8,13.394829,0.0,0.045119,0.0,log2,1.0,"{'max_features': 'log2', 'max_samples': 1.0}",0.653911,0.653911,0.0,1,0.361493,0.361493,0.0
7,10.118208,0.0,0.046455,0.0,log2,0.8,"{'max_features': 'log2', 'max_samples': 0.8}",0.651402,0.651402,0.0,3,0.360982,0.360982,0.0
4,10.086172,0.0,0.043092,0.0,sqrt,0.8,"{'max_features': 'sqrt', 'max_samples': 0.8}",0.651402,0.651402,0.0,3,0.360982,0.360982,0.0
6,9.54774,0.0,0.044241,0.0,log2,0.6,"{'max_features': 'log2', 'max_samples': 0.6}",0.651323,0.651323,0.0,5,0.359314,0.359314,0.0


In [190]:
# optional : save grid search results
save_dir = model_dump_dir / "model_v2"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")
df_cv_results.to_csv(save_dir / "grid_search_{}.csv".format(time_prefix), index=False)

Model Performance Evaluation

In [53]:
model_v2 = RandomForestClassifier(
    class_weight=class_weight, 
    random_state=random_state, 
    n_jobs=n_jobs,
    criterion="entropy",
    n_estimators=100,
    min_samples_leaf=10000,
    max_features="sqrt"
)

model_v2.fit(X_train, y_train, sample_weight=sample_weights)

In [54]:
y_train_pred = model_v2.predict(X_train)
y_valid_pred = model_v2.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Validation set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.361
Precision score on Training set : 0.654 **


In [55]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[36533   615]
      [   14  1162]]
Accuracy Score : 0.98
Precision Score : 0.65
Recall Score : 0.99
F1 Score : 0.79


## Version 3: XGBoost Classifier

<i>--> best "Precision" score :</i> <b>0.66</b>

<u>Tuning</u>:
 - scale_pos_weight ==> 7
 - sample weight ==> more weights on most recent information
 - grid search : learning params
 - grid search : overfit prevention (if any)

<u>Parameter Space</u>:
 - model complexity params
    - n_estimators = [50, 100, 150, 200] => 50
    - max_leaves = [30, 50, 70] => 50
    - learning_rate = [.001, .005, .01, .03, .1, .3] => 0.01
    - grow_policy" = ['depthwise', 'lossguide'] => 'depthwise'
 - loss update params
    - <s>gamma = [0, 0.1, 0.5, 1, 2, 5, 10, 100]</s> (skip since shallow tree will not be affected by)
    - <s>min_child_weight = [3, 5, 7, 9]</s> not affected
    - <s>max_delta_step</s> not affected
 - overfit prevention params
    - reg_alpha = [.01, .1, 1, 2, 5, 10] => 5
    - reg_lambda = [.01, .1, 1, 2, 5, 10] => 10
 - sampling params
    - <s>subsample = [.6, .7, .8, .9, 1.]</s> (worsen performance)
    - <s>colsample_bytree = [.4, .6, .8, 1.]</s> use colsample_bynode instead
    - <s>colsample_bynode = [.4, .6, .8, 1.]</s> (worsen performance)

In [119]:
# selected parameter:
# - learning_rate: 0.01
# - n_estimators: 50
# - max_leaves: 50
# - grow_policy: 'depthwise'
# - reg_alpha: 5
# - reg_lambda: 10

param_grid = {
    # "n_estimators": [50, 100, 150, 200],
    # "max_leaves": [30, 50, 70, 90],
    # "learning_rate": [.01, .03, .1, .3],
    # "grow_policy": ['depthwise', 'lossguide'],
    # "min_child_weight": [0, 2, 3, 5],
    # "max_delta_step": [0, 3, 5, 6],
    # "reg_alpha": [.01, .1, 1, 2, 5, 10],
    # "reg_lambda": [.01, .1, 1, 2, 5, 10],
    "subsample": [.7, .8, .9, 1.],
    "colsample_bynode": [.4, .6, .8]
}
initial_model = xgb.XGBClassifier(
    scale_pos_weight=7, 
    random_state=random_state, 
    n_jobs=n_jobs,
    learning_rate=0.01,
    n_estimators=50,
    max_leaves=50,
    grow_policy="depthwise",
    reg_alpha=5,
    reg_lambda=10
)

df_cv_results = grid_search_cv(
    initial_model, 
    param_grid, 
    X_train=X_train, y_train=y_train, 
    X_valid=X_valid, y_valid=y_valid, 
    scoring="precision", 
    sample_weight=sample_weights,
)

In [132]:
df_cv_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bynode,param_subsample,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
2,1.703079,0.0,0.019113,0.0,0.4,0.9,"{'colsample_bynode': 0.4, 'subsample': 0.9}",0.714118,0.714118,0.0,1,0.599475,0.599475,0.0
4,1.568898,0.0,0.018275,0.0,0.6,0.7,"{'colsample_bynode': 0.6, 'subsample': 0.7}",0.710376,0.710376,0.0,2,0.588073,0.588073,0.0
8,1.510413,0.0,0.014472,0.0,0.8,0.7,"{'colsample_bynode': 0.8, 'subsample': 0.7}",0.708661,0.708661,0.0,3,0.586592,0.586592,0.0
6,1.462856,0.0,0.015011,0.0,0.6,0.9,"{'colsample_bynode': 0.6, 'subsample': 0.9}",0.708625,0.708625,0.0,4,0.591203,0.591203,0.0
11,1.331899,0.0,0.014272,0.0,0.8,1.0,"{'colsample_bynode': 0.8, 'subsample': 1.0}",0.707094,0.707094,0.0,5,0.579738,0.579738,0.0


In [None]:
# optional : save grid search results
save_dir = model_dump_dir / "model_v3"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")
df_cv_results.to_csv(save_dir / "grid_search_{}.csv".format(time_prefix), index=False)

Model Performance Evaluation

In [133]:
model_v3 = xgb.XGBClassifier(
    scale_pos_weight=7, 
    random_state=random_state, 
    n_jobs=n_jobs,
    learning_rate=0.01,
    n_estimators=50,
    max_leaves=50,
    grow_policy="depthwise",
    reg_alpha=5,
    reg_lambda=10,
)

model_v3.fit(X_train, y_train, sample_weight=sample_weights)

In [134]:
y_train_pred = model_v3.predict(X_train)
y_valid_pred = model_v3.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Validation set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.578
Precision score on Training set : 0.708 **


In [135]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[36889   259]
      [  549   627]]
Accuracy Score : 0.98
Precision Score : 0.71
Recall Score : 0.53
F1 Score : 0.61


## Model Development Summary

From all the iterations of ML model development and parameter fine-tuning, we can conclude all model results, in other words, algorithm used along with its performance metrics, as follow.

|           Model          |       Accuracy     |      Precision    |       Recall      |         F1        |
|:------------------------:|:------------------:|:-----------------:|:-----------------:|:-----------------:|
| DT Classifier (baseline) |          0.95      |        0.59       |        0.99       |       0.74        |
|        DT Classifier     |          0.98      |        0.64       |        0.98       |       0.77        |
|<mark>Random Forest Classifier</mark>|          0.98      |  <i>0.65**</i>   |  <i>0.99**</i>   |       0.79        |
|XGBoost Classifier|          0.98      |<b><u>0.71</u></b>|        0.53       |       0.61        |

From the table, we can see that XGBoost Classifier yields the best result on Precision (as underline). However, if we look across other metrics in XGBoost, we can see that <b>overall performance is unstable</b> and <b>too much Recall was sacrafised</b> just to gain additional 6% of Precision (compared to the second-best algorithm)

Hence, we are going to choose <b><u>Random Forest Classifier</u></b> as our final prediction model for real usage.

# Final Model Evaluation

In this step, we are going to make a final evaluation of our prediction model on Hold-out Testing Set as an unbias approximation of model performance on real-world data

In [136]:
# combine all data for final training
df_train_final = pd.concat([df_train, df_valid])

X = df_train_final.filter(regex='^feat', axis=1)
y = df_train_final[target_col]

In [137]:
class_weight = {0: 1, 1: 7}

# compute sample weight
last_timestamp = df_train_final['log_timestamp'].min()
days_from_first_txn = df_train_final['log_timestamp'] - last_timestamp
mth_from_first_txn = days_from_first_txn.dt.days / 30

sample_weights = list(map(lambda x: max(1, math.ceil(x)), mth_from_first_txn.tolist()))

In [142]:
model = RandomForestClassifier(
    class_weight=class_weight, 
    random_state=random_state, 
    n_jobs=n_jobs,
    criterion="entropy",
    n_estimators=100,
    min_samples_leaf=10000,
    max_features="sqrt"
)

model.fit(X, y, sample_weight=sample_weights)

In [143]:
# initial pre-process on Hold-out Testing Set
process_training(df_test)

X_test = df_test.filter(regex='^feat', axis=1)
y_test = df_test[target_col]

y_test_pred = model.predict(X_test)

In [144]:
precision_test = precision_score(y_true=y_test, y_pred=y_test_pred)

print("Final Precision score on Testing set : {0:.3f} **".format(precision_test))

Final Precision score on Testing set : 0.720 **


In [145]:
print("--- Final Accuracy report on Hold-out Testing Set ---")
accuracy_report(y_true=y_test, y_pred=y_test_pred)

--- Final Accuracy report on Hold-out Testing Set ---
Confusion Matrix : 
     [[37115  1549]
      [   30  3985]]
Accuracy Score : 0.96
Precision Score : 0.72
Recall Score : 0.99
F1 Score : 0.83


In [146]:
y_pred = model.predict(X)

print("--- Final Accuracy report on Training Set ---")
accuracy_report(y_true=y, y_pred=y_pred)

--- Final Accuracy report on Training Set ---
Confusion Matrix : 
     [[724314  70249]
      [  1657  40427]]
Accuracy Score : 0.91
Precision Score : 0.37
Recall Score : 0.96
F1 Score : 0.53


From the final evaluation, we can conclude the <b><u>final accuracy</u></b> (Precision) of our predictive model at <mark><b>72 %</b></mark> .

In [147]:
# optional : model saving
save_dir = model_dump_dir / "final"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")

joblib.dump(model, save_dir / 'model_{}.joblib'.format(time_prefix))

['c:\\Users\\11413929\\repos\\wikimedia\\model\\final\\model_2024-09-29 17-27.joblib']

In [150]:
# save prediction results
prediction_result_dir = project_dir / 'data' / 'prediction'
prediction_result_dir.mkdir(parents=True, exist_ok=True)

df_prediction = df_test[['log_timestamp', target_col]]
df_prediction['y_pred'] = model.predict(X_test)

df_prediction.reset_index().to_csv(prediction_result_dir / 'prediction.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction['y_pred'] = model.predict(X_test)
