## Model Training and Tracking
##### working flow
* load the dataset 
* split in train and test
* model training 
* tune the hyperparameter
* experiment tracking with mlflow

##### Import libraries

In [11]:
import pandas as pd
import sys
sys.path.append("..")

In [12]:
from src.model_training import ModelTrainer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

#### Load the Dataset

In [13]:
df=pd.read_csv("../data/processed/final_data.csv")
df.head()

Unnamed: 0,num__total_amount,num__avg_amount,num__transaction_count,num__std_amount,cat__ProviderId_ProviderId_1,cat__ProviderId_ProviderId_2,cat__ProviderId_ProviderId_3,cat__ProviderId_ProviderId_4,cat__ProviderId_ProviderId_5,cat__ProviderId_ProviderId_6,...,cat__ProductId_ProductId_5,cat__ProductId_ProductId_6,cat__ProductId_ProductId_7,cat__ProductId_ProductId_8,cat__ProductId_ProductId_9,cat__ChannelId_ChannelId_1,cat__ChannelId_ChannelId_2,cat__ChannelId_ChannelId_3,cat__ChannelId_ChannelId_5,is_high_risk
0,0.170118,-0.067623,-0.311831,-0.167524,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.170118,-0.067623,-0.311831,-0.167524,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.165122,-0.072568,-0.444993,-0.201719,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.175567,-0.008155,-0.40402,-0.008737,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.175567,-0.008155,-0.40402,-0.008737,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
df =df.dropna(subset=["is_high_risk"])
df["is_high_risk"].isnull().sum()


np.int64(0)

#### Intialize Model Trainer

In [15]:
# intialize model trainer class object
trainer=ModelTrainer(df=df,target_col="is_high_risk")
# prepare the data with ModelTrainer class model
trainer.prepare_data()
# add models we want to train to ModelTrainer class
trainer.add_model("logreg",LogisticRegression(random_state=42,max_iter=1000))
trainer.add_model("rf",RandomForestClassifier(random_state=42))
trainer.add_model("gb",GradientBoostingClassifier(random_state=42))
# Train all models and track experiments
results = trainer.train_all()
print(results)




{'logreg': {'accuracy': 0.6181575433911882, 'precision': 0.46153846153846156, 'recall': 0.021052631578947368, 'f1_score': 0.040268456375838924, 'roc_auc': 0.5135700241984271}, 'rf': {'accuracy': 0.5714285714285714, 'precision': 0.4010989010989011, 'recall': 0.256140350877193, 'f1_score': 0.31263383297644537, 'roc_auc': 0.4815600423472474}, 'gb': {'accuracy': 0.595460614152203, 'precision': 0.29545454545454547, 'recall': 0.0456140350877193, 'f1_score': 0.0790273556231003, 'roc_auc': 0.5014443436176649}}


##### Hyperparameter tuning (Random Forest)

In [16]:
param_grid={
    "n_estimators":[100,200],
    "max_depth":[5,10],
    "min_samples_split":[2,5]
}
# Run Grid Search
best_params,best_score=trainer.hyperparameter_tuning("rf",param_grid=param_grid)
print("Best Params",best_params)
print("Best Score",best_score)

Best Params {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
Best Score 0.5212788153593021


##### Hyperparameter tuning(Logistic Regression)

In [18]:
par_grid={
    "C":[0.01,0.1,1,10,1000],
    "penalty":["l1","l2"],
    "solver":["liblinear"]
}
best_params,best_score=trainer.hyperparameter_tuning("logreg",param_grid=par_grid)
print("Best Params",best_params)
print("Best Score",best_score)

Best Params {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score 0.5129299319626635


