# Import Library

In [208]:
import os
import math
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import xgboost as xgb
import joblib

# Functions

In [131]:
def get_resource_utilize():
    cpu_count = os.cpu_count()
    if cpu_count > 3:
        resource_util = cpu_count - 2
    else:
        resource_util = 1

    return resource_util

In [63]:
def process_training(df):
    df['feat_avg_contribution'] = df['feat_avg_contribution'].fillna(0)

    for col in ['feat_is_blocked_recently', 'feat_is_unblocked_recently', 'feat_is_rename_user_recently']:
        df[col] = df[col].fillna(0)
        df[col] = df[col].astype(int)

    df['feat_count_comment_bad_user'] = df['feat_count_comment_bad_user'].fillna(0)
    df['feat_count_comment_bad_user'] = df['feat_count_comment_bad_user'].astype(int)

    if 'log_timestamp' in df.columns:
        df['log_timestamp'] = pd.to_datetime(df['log_timestamp'])

In [64]:
def accuracy_report(y_true, y_pred):
    str_confusion_matrix = str(confusion_matrix(y_true, y_pred))\
                                .replace('[[', ' ' * 5 + '[[')\
                                .replace('\n ', '\n' + ' ' * 6)
    
    print("Confusion Matrix : \n{}".format(str_confusion_matrix))
    print("Accuracy Score : {0:.2f}".format(accuracy_score(y_true, y_pred)))
    print("Precision Score : {0:.2f}".format(precision_score(y_true, y_pred)))
    print("Recall Score : {0:.2f}".format(recall_score(y_true, y_pred)))
    print("F1 Score : {0:.2f}".format(f1_score(y_true, y_pred)))

In [137]:
def grid_search_cv(est, grid_params, X_train, y_train, 
                   X_valid, y_valid, scoring, n_jobs=None, 
                   **kwargs):
    X = pd.concat([X_train, X_valid])
    y = np.concatenate([y_train, y_valid])

    n_row_train = X_train.shape[0]
    n_row_valid = X_valid.shape[0]
    list_indices = [-1] * n_row_train + [0] * n_row_valid

    ps = PredefinedSplit(test_fold=list_indices)

    search_params = GridSearchCV(est, param_grid=grid_params, scoring=scoring, refit=False,
                                 cv=ps, n_jobs=n_jobs, return_train_score=True)

    search_params.fit(X, y, **kwargs)

    df_cv_results = pd.DataFrame(search_params.cv_results_)
    
    return df_cv_results.sort_values(by="rank_test_score")

# Experiment Set-Up

- <u>Algorithm</u> : Tree-based model (as it is best-fit with the distribution of data)
  - Decision Tree (baseline model)
  - Random Forest
  - XGBoost
- <u>Training</u>
  - Baseline Performance (Decision Tree model)
  - Model tuning & selecion (Validation Set)
  - Final Evalution (Testing Set)
  - Class Weight (imbalanced learning) + Sample Weight (focus on most recent data)
- <u>Evaluation</u>
  - Precision -> Since we are going to use the model for an automate suspension, we need to prioritize the Precision value to confidently block bad users before causing any trouble to the content on Website

In [59]:
path = Path(os.getcwd())

project_dir = path.parent

process_data_dir = project_dir / 'data' / 'preprocess'
model_dump_dir = project_dir / 'model'

In [60]:
id_cols = ['log_id', 'user_id']
target_col = 'is_block_next_period'

# Import Data

In [61]:
# import preprocessed data
df_train = pd.read_csv(process_data_dir / "train_data.csv")
df_test = pd.read_csv(process_data_dir / "test_data.csv")

df_train.set_index(id_cols, inplace=True)
df_test.set_index(id_cols, inplace=True)

# Initial Data Prep

In [None]:
# initial pre-process
process_training(df_train)

In [75]:
df_train.sort_values(by='log_timestamp', inplace=True)

# train-dev test split
filter_valid = df_train['log_timestamp'].ge('2024-02-01')

df_valid = df_train.loc[filter_valid, :].copy()
df_train = df_train.loc[~filter_valid, :].copy()

In [76]:
print("Training shape : {}".format(df_train.shape))
print("Validation shape : {}".format(df_valid.shape))

Training shape : (798323, 11)
Validation shape : (38324, 11)


# Model Training

In [77]:
random_state = 99

In [78]:
class_vals = np.unique(df_train[target_col])
class_weights = compute_class_weight(class_weight="balanced", classes=class_vals, y=df_train[target_col])
dict_class_weight = dict(zip(class_vals, class_weights))

In [80]:
print("Initial class weight: {}".format(dict_class_weight))

Initial class weight: {np.int64(0): np.float64(0.5270050104632203), np.int64(1): np.float64(9.757541312212771)}


In [81]:
X_train = df_train.filter(regex='^feat', axis=1)
y_train = df_train[target_col]

X_valid = df_valid.filter(regex='^feat', axis=1)
y_valid = df_valid[target_col]

## Version 0: DT Classifier (baseline)

<i>--> Current baseline "Precision" score :</i> <b>0.380</b>

In [82]:
model_v0 = DecisionTreeClassifier(max_depth=7, class_weight=dict_class_weight, random_state=random_state)
model_v0.fit(X_train, y_train)

In [83]:
y_train_pred = model_v0.predict(X_train)
y_valid_pred = model_v0.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Training set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.316
Precision score on Training set : 0.383 **


In [84]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[35508  1640]
      [  157  1019]]
Accuracy Score : 0.95
Precision Score : 0.38
Recall Score : 0.87
F1 Score : 0.53


## Version 1: DT Classifier

<i>--> best "Precision" score :</i> <b>0.462</b>

<u>Tuning</u>:
 - class weight ==> {0:1, 1:7}
 - sample weight ==> more weights on most recent information
 - grid search : learning params
 - grid search : overfit prevention (if any)

<u>Parameter Space</u>:
 - criterion = ["gini", "entropy", "log_loss"] => either "entropy" or "log_loss"
 - max_depth = [7, 9, 11, 13]
 - min_samples_leaf = [3000, 6000, 9000] => min_samples_leaf = 6000
 - min_weight_fraction_leaf = [0, 0.1, 0.2, 0.5]
 - max_features = ['sqrt', 'log2', None]

In [135]:
n_jobs = get_resource_utilize()

print("CPU availables: {}".format(n_jobs))

CPU availables: 14


In [119]:
# new class weight
class_weight = {0: 1, 1: 7}

# compute sample weight
last_timestamp = df_train['log_timestamp'].min()
days_from_first_txn = df_train['log_timestamp'] - last_timestamp
mth_from_first_txn = days_from_first_txn.dt.days / 30

sample_weights = list(map(lambda x: max(1, math.ceil(x)), mth_from_first_txn.tolist()))

In [144]:
# selected parameter:
# - min_samples_leaf: 6000
# - criterion: "entropy"

param_grid = {
    # "criterion": ["gini", "entropy", "log_loss"],
    # "max_depth": [7, 9, 11, 13],
    # "min_samples_leaf": [3000, 6000, 9000],
    "min_weight_fraction_leaf": [0, 0.1, 0.2, 0.5],
    "max_features": ['sqrt', 'log2', None]
}
initial_model = DecisionTreeClassifier(
    class_weight=class_weight, 
    random_state=random_state, 
    min_samples_leaf=6000, 
    criterion="entropy"
)

df_cv_results = grid_search_cv(
    initial_model, 
    param_grid, 
    X_train=X_train, y_train=y_train, 
    X_valid=X_valid, y_valid=y_valid, 
    scoring="precision", 
    sample_weight=sample_weights,
    n_jobs=n_jobs
)

In [147]:
df_cv_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_weight_fraction_leaf,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
8,2.722018,0.0,0.010003,0.0,,0.0,"{'max_features': None, 'min_weight_fraction_le...",0.461874,0.461874,0.0,1,0.325884,0.325884,0.0
1,1.63584,0.0,0.04031,0.0,sqrt,0.1,"{'max_features': 'sqrt', 'min_weight_fraction_...",0.424649,0.424649,0.0,2,0.281954,0.281954,0.0
5,1.636604,0.0,0.044384,0.0,log2,0.1,"{'max_features': 'log2', 'min_weight_fraction_...",0.424649,0.424649,0.0,2,0.281954,0.281954,0.0
4,2.106997,0.0,0.03414,0.0,log2,0.0,"{'max_features': 'log2', 'min_weight_fraction_...",0.339958,0.339958,0.0,4,0.282627,0.282627,0.0
0,2.109776,0.0,0.037972,0.0,sqrt,0.0,"{'max_features': 'sqrt', 'min_weight_fraction_...",0.339958,0.339958,0.0,4,0.282627,0.282627,0.0


In [184]:
# optional : save grid search results
save_dir = model_dump_dir / "model_v1"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")
df_cv_results.to_csv(save_dir / "grid_search_{}.csv".format(time_prefix), index=False)

Model Performance Evaluation

In [148]:
model_v1 = DecisionTreeClassifier(class_weight=class_weight, random_state=random_state, min_samples_leaf=6000, criterion="entropy")
model_v1.fit(X_train, y_train, sample_weight=sample_weights)

In [149]:
y_train_pred = model_v1.predict(X_train)
y_valid_pred = model_v1.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Training set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.326
Precision score on Training set : 0.462 **


In [150]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[36160   988]
      [  328   848]]
Accuracy Score : 0.97
Precision Score : 0.46
Recall Score : 0.72
F1 Score : 0.56


## Version 2: Random Forest Classifier

<i>--> best "Precision" score :</i> <b>0.68</b>

<u>Tuning</u>:
 - class weight ==> {0:1, 1:7}
 - sample weight ==> more weights on most recent information
 - grid search : learning params
 - grid search : overfit prevention (if any)

<u>Parameter Space</u>:
 - n_estimators = [50, 100, 200, 300] => 50
 - criterion = ["gini", "entropy", "log_loss"] => gini
 - <s>min_samples_split = [6000, 9000, 12000]</s> (not affected)
 - min_samples_leaf = [3000, 6000, 9000] => 6000
 - min_impurity_decrease = [0, 0.1, 0.2] => .1
 - max_features = ['sqrt', 'log2', None] => 'sqrt'
 - <s>max_samples = [0.6, 0.8, 1]</s> (not affected)

In [201]:
# selected parameter:
# - criterion: "gini"
# - n_estimators: 50
# - min_samples_split: 6000
# - min_impurity_decrease: .1
# - max_features: "sqrt"

param_grid = {
    # "criterion": ["gini", "entropy", "log_loss"],
    # "n_estimators": [50, 100, 150, 200],
    # "min_samples_leaf": [3000, 6000, 9000],
    # "min_samples_split": [6000, 9000, 12000],
    # "min_impurity_decrease": [.1, .12, .05, 0],
    "max_features": [None, 'sqrt', 'log2'],
    "max_samples": [.6, .8, 1.]
}
initial_model = RandomForestClassifier(
    class_weight=class_weight, 
    random_state=random_state, 
    n_jobs=n_jobs,
    criterion="gini",
    n_estimators=50,
    min_samples_split=6000,
    min_impurity_decrease=0.1
)

df_cv_results = grid_search_cv(
    initial_model, 
    param_grid, 
    X_train=X_train, y_train=y_train, 
    X_valid=X_valid, y_valid=y_valid, 
    scoring="precision", 
    sample_weight=sample_weights,
)

In [203]:
df_cv_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_max_samples,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
3,3.640441,0.0,0.029999,0.0,sqrt,0.6,"{'max_features': 'sqrt', 'max_samples': 0.6}",0.67753,0.67753,0.0,1,0.338243,0.338243,0.0
6,3.660843,0.0,0.031008,0.0,log2,0.6,"{'max_features': 'log2', 'max_samples': 0.6}",0.67753,0.67753,0.0,1,0.338243,0.338243,0.0
5,3.866396,0.0,0.031006,0.0,sqrt,1.0,"{'max_features': 'sqrt', 'max_samples': 1.0}",0.67753,0.67753,0.0,1,0.338239,0.338239,0.0
4,3.483783,0.0,0.032,0.0,sqrt,0.8,"{'max_features': 'sqrt', 'max_samples': 0.8}",0.67753,0.67753,0.0,1,0.338243,0.338243,0.0
7,3.924525,0.0,0.032495,0.0,log2,0.8,"{'max_features': 'log2', 'max_samples': 0.8}",0.67753,0.67753,0.0,1,0.338243,0.338243,0.0


In [190]:
# optional : save grid search results
save_dir = model_dump_dir / "model_v2"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")
df_cv_results.to_csv(save_dir / "grid_search_{}.csv".format(time_prefix), index=False)

Model Performance Evaluation

In [204]:
model_v2 = RandomForestClassifier(
    class_weight=class_weight, 
    random_state=random_state, 
    n_jobs=n_jobs,
    criterion="gini",
    n_estimators=50,
    min_samples_split=6000,
    min_impurity_decrease=0.1,
    max_features="sqrt"
)

model_v2.fit(X_train, y_train, sample_weight=sample_weights)

In [205]:
y_train_pred = model_v2.predict(X_train)
y_valid_pred = model_v2.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Training set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.338
Precision score on Training set : 0.678 **


In [206]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[36960   188]
      [  781   395]]
Accuracy Score : 0.97
Precision Score : 0.68
Recall Score : 0.34
F1 Score : 0.45


## Version 3: XGBoost Classifier

<i>--> best "Precision" score :</i> <b>0.66</b>

<u>Tuning</u>:
 - scale_pos_weight ==> 7
 - sample weight ==> more weights on most recent information
 - grid search : learning params
 - grid search : overfit prevention (if any)

<u>Parameter Space</u>:
 - model complexity params
    - n_estimators = [50, 100, 150, 200] => 50
    - max_leaves = [30, 50, 70] => 40
    - learning_rate = [.001, .005, .01, .03, .1, .3] => 0.01
    - <s>grow_policy" = ['depthwise', 'lossguide']</s>
 - loss update params
    - <s>gamma = [0, 0.1, 0.5, 1, 2, 5, 10, 100]</s> (skip since shallow tree will not be affected by)
    - <s>min_child_weight = [3, 5, 7, 9]</s> not affected
    - <s>max_delta_step</s> not affected
 - overfit prevention params
    - <s>reg_alpha = [.01, .1, 1, 2, 5, 10]</s> not affected
    - <s>reg_lambda = [.01, .1, 1, 2, 5, 10]</s> not affected
 - sampling params
    - subsample = [.6, .7, .8, .9, 1.] => 1
    - <s>colsample_bytree = [.4, .6, .8, 1.]</s> use colsample_bynode instead
    - colsample_bynode = [.4, .6, .8, 1.] => 0.6

In [277]:
# selected parameter:
# - learning_rate: 0.01
# - n_estimators: 50
# - max_leaves: 40
# - colsample_bytree: 0.7
# - subsample: 1
# - colsample_bynode: 0.6

param_grid = {
    # "n_estimators": [50, 75, 100],
    # "max_leaves": [30, 40, 50],
    # "learning_rate": [.01, .03, .1],
    # "grow_policy": ['depthwise', 'lossguide'],
    # "min_child_weight": [0, 50, 100, 200],
    # "max_delta_step": [3, 5, 6, 10],
    # "reg_alpha": [0, 100],
    # "reg_lambda": [0, 100],
    "subsample": [.7, .8, .9, 1.],
    "colsample_bynode": [.4, .6, .8]
}
initial_model = xgb.XGBClassifier(
    scale_pos_weight=7, 
    random_state=random_state, 
    n_jobs=n_jobs,
    learning_rate=0.01,
    n_estimators=50,
    max_leaves=40
)

df_cv_results = grid_search_cv(
    initial_model, 
    param_grid, 
    X_train=X_train, y_train=y_train, 
    X_valid=X_valid, y_valid=y_valid, 
    scoring="precision", 
    sample_weight=sample_weights,
)

In [279]:
df_cv_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bynode,param_subsample,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
6,1.310611,0.0,0.015328,0.0,0.6,0.9,"{'colsample_bynode': 0.6, 'subsample': 0.9}",0.662651,0.662651,0.0,1,0.524684,0.524684,0.0
7,1.157099,0.0,0.014259,0.0,0.6,1.0,"{'colsample_bynode': 0.6, 'subsample': 1.0}",0.660714,0.660714,0.0,2,0.521116,0.521116,0.0
8,1.295574,0.0,0.020771,0.0,0.8,0.7,"{'colsample_bynode': 0.8, 'subsample': 0.7}",0.650602,0.650602,0.0,3,0.484779,0.484779,0.0
5,1.430987,0.0,0.015585,0.0,0.6,0.8,"{'colsample_bynode': 0.6, 'subsample': 0.8}",0.64557,0.64557,0.0,4,0.515169,0.515169,0.0
1,1.44277,0.0,0.01517,0.0,0.4,0.8,"{'colsample_bynode': 0.4, 'subsample': 0.8}",0.643678,0.643678,0.0,5,0.521716,0.521716,0.0


In [None]:
# optional : save grid search results
save_dir = model_dump_dir / "model_v3"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")
df_cv_results.to_csv(save_dir / "grid_search_{}.csv".format(time_prefix), index=False)

Model Performance Evaluation

In [282]:
model_v3 = xgb.XGBClassifier(
    scale_pos_weight=7, 
    random_state=random_state, 
    n_jobs=n_jobs,
    learning_rate=0.01,
    n_estimators=50,
    max_leaves=40,
    subsample=1,
    colsample_bynode=0.6
)

model_v3.fit(X_train, y_train, sample_weight=sample_weights)

In [283]:
y_train_pred = model_v3.predict(X_train)
y_valid_pred = model_v3.predict(X_valid)

precision_train = precision_score(y_true=y_train, y_pred=y_train_pred)
precision_valid = precision_score(y_true=y_valid, y_pred=y_valid_pred)

print("Precision score on Training set : {0:.3f}".format(precision_train))
print("Precision score on Training set : {0:.3f} **".format(precision_valid))

Precision score on Training set : 0.521
Precision score on Training set : 0.661 **


In [284]:
accuracy_report(y_true=y_valid, y_pred=y_valid_pred)

Confusion Matrix : 
     [[37091    57]
      [ 1065   111]]
Accuracy Score : 0.97
Precision Score : 0.66
Recall Score : 0.09
F1 Score : 0.17


## Model Development Summary

From all the iterations of ML model development and parameter fine-tuning, we can conclude all model results, in other words, algorithm used along with its performance metrics, as follow.

|           Model          |       Accuracy     |      Precision    |       Recall      |         F1        |
|:------------------------:|:------------------:|:-----------------:|:-----------------:|:-----------------:|
| DT Classifier (baseline) |          0.95      |        0.38       |        0.87       |       0.53        |
|        DT Classifier     |          0.97      |        0.46       |        0.72       |       0.56        |
| <mark>Random Forest Classifier</mark> |          0.97      |        <b><u>0.68</u></b>       |        0.34       |       0.45        |
|    XGBoost Classifier    |          0.97      |        0.66       |        0.09       |       0.17        |

From the table, It can be seen that the 2nd model version (<b>Random Forest Classifier</b>) yields the best performance in terms of <b>"Precision"</b> on Validation Set

# Final Model Evaluation

In this step, we are going to make a final evaluation of our prediction model on Hold-out Testing Set as an unbias approximation of model performance on real-world data

In [288]:
# combine all data for final training
df_train_final = pd.concat([df_train, df_valid])

X = df_train_final.filter(regex='^feat', axis=1)
y = df_train_final[target_col]

In [290]:
class_weight = {0: 1, 1: 7}

# compute sample weight
last_timestamp = df_train_final['log_timestamp'].min()
days_from_first_txn = df_train_final['log_timestamp'] - last_timestamp
mth_from_first_txn = days_from_first_txn.dt.days / 30

sample_weights = list(map(lambda x: max(1, math.ceil(x)), mth_from_first_txn.tolist()))

In [348]:
model = RandomForestClassifier(
    class_weight=class_weight, 
    n_jobs=n_jobs,
    criterion="gini",
    n_estimators=50,
    min_samples_split=6000,
    min_impurity_decrease=0.02, # lower the threshold as the prediction only generates 0
    max_features="sqrt"
)

model.fit(X, y, sample_weight=sample_weights)

In [349]:
# initial pre-process on Hold-out Testing Set
process_training(df_test)

X_test = df_test.filter(regex='^feat', axis=1)
y_test = df_test[target_col]

y_test_pred = model.predict(X_test)

In [350]:
precision_test = precision_score(y_true=y_test, y_pred=y_test_pred)

print("Final Precision score on Testing set : {0:.3f} **".format(precision_test))

Final Precision score on Testing set : 0.795 **


In [351]:
print("--- Final Accuracy report on Hold-out Testing Set ---")
accuracy_report(y_true=y_test, y_pred=y_test_pred)

--- Final Accuracy report on Hold-out Testing Set ---
Confusion Matrix : 
     [[37866   798]
      [  928  3087]]
Accuracy Score : 0.96
Precision Score : 0.79
Recall Score : 0.77
F1 Score : 0.78


In [356]:
y_pred = model.predict(X)

print("--- Final Accuracy report on Training Set ---")
accuracy_report(y_true=y, y_pred=y_pred)

--- Final Accuracy report on Training Set ---
Confusion Matrix : 
     [[722647  71916]
      [  6297  35787]]
Accuracy Score : 0.91
Precision Score : 0.33
Recall Score : 0.85
F1 Score : 0.48


From the final evaluationt, we can conclude the model's <b><u>Precision</u></b> on classifying the possibility of user blocking within next 2 days at <mark><b>79.5 %</b></mark> .

In [359]:
# optional : model saving
save_dir = model_dump_dir / "final"
save_dir.mkdir(parents=True, exist_ok=True)

current_timestamp = pd.Timestamp.now()

time_prefix = current_timestamp.strftime(r"%Y-%m-%d %H-%M")

joblib.dump(model, save_dir / 'model_{}.joblib'.format(time_prefix))

['c:\\Users\\11413929\\repos\\wikimedia\\model\\final\\model_2024-09-29 13-46.joblib']

In [363]:
# save prediction results
prediction_result_dir = project_dir / 'data' / 'prediction'
prediction_result_dir.mkdir(parents=True, exist_ok=True)

df_prediction = df_test[['log_timestamp', target_col]]
df_prediction['y_pred'] = model.predict(X_test)

df_prediction.reset_index().to_csv(prediction_result_dir / 'prediction.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction['y_pred'] = model.predict(X_test)
