In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

In [2]:
# Path to the CSV file
file_path = '/home/moraa/Documents/10_academy/Week-8/artifacts/df_merged.csv'

# Load the CSV file into a DataFrame
df_merged = pd.read_csv(file_path)

# Display the first few rows of the loaded DataFrame to verify
df_merged.head()

Unnamed: 0,id,Trip ID,driver_id,driver_action,lat,lng,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,...,Trip Duration Category,Origin-Destination,Origin-Destination Count,Origin Distance to City Center,Destination Distance to City Center,Acceptance Rate,Driver Experience,Avg Trip Distance,Avg Trip Duration,Avg Speed
0,17,392005,171165,rejected,0.467252,-0.165609,"6.565087699999999,3.3844415","6.499696300000001,3.3509075",2021-07-01 10:53:36,2021-07-01 11:27:51,...,Short,"6.565087699999999,3.3844415 to 6.4996963000000...",51,54.83083,73.36347,0.000875,5715,-0.027184,0.026039,197344.050231
1,18,392005,243172,rejected,0.573114,0.332253,"6.565087699999999,3.3844415","6.499696300000001,3.3509075",2021-07-01 10:53:36,2021-07-01 11:27:51,...,Short,"6.565087699999999,3.3844415 to 6.4996963000000...",51,54.83083,73.36347,0.051209,703,0.162084,0.035407,255134.934336
2,19,392005,244078,rejected,0.686601,-0.14447,"6.565087699999999,3.3844415","6.499696300000001,3.3509075",2021-07-01 10:53:36,2021-07-01 11:27:51,...,Short,"6.565087699999999,3.3844415 to 6.4996963000000...",51,54.83083,73.36347,0.010111,3066,-0.111655,-0.082415,199564.036556
3,21,392005,243392,rejected,0.502039,-0.232383,"6.565087699999999,3.3844415","6.499696300000001,3.3509075",2021-07-01 10:53:36,2021-07-01 11:27:51,...,Short,"6.565087699999999,3.3844415 to 6.4996963000000...",51,54.83083,73.36347,0.007194,3475,-0.076518,0.012019,192644.565309
4,23,392005,171165,rejected,0.467252,-0.165609,"6.565087699999999,3.3844415","6.499696300000001,3.3509075",2021-07-01 10:53:36,2021-07-01 11:27:51,...,Short,"6.565087699999999,3.3844415 to 6.4996963000000...",51,54.83083,73.36347,0.000875,5715,-0.027184,0.026039,197344.050231


In [3]:
# 1. Transform 'driver_action' column using LabelEncoder and retain the original column
label_encoder_action = LabelEncoder()
df_merged['driver_action_encoded'] = label_encoder_action.fit_transform(df_merged['driver_action'])

# 2. 'Trip Origin' and 'Trip Destination' columns remain unchanged

# 3. Convert 'Trip Start Time' and 'Trip End Time' to datetime if needed
df_merged['Trip Start Time'] = pd.to_datetime(df_merged['Trip Start Time'])
df_merged['Trip End Time'] = pd.to_datetime(df_merged['Trip End Time'])

# 4. Transform 'Day of Week' column
day_mapping = {
    'Sunday': 1, 'Monday': 2, 'Tuesday': 3, 'Wednesday': 4,
    'Thursday': 5, 'Friday': 6, 'Saturday': 7
}
df_merged['Day of Week Encoded'] = df_merged['Day of Week'].map(day_mapping)

# 5. Transform 'Time of Day' column using LabelEncoder
encoder = LabelEncoder()
df_merged['Time of Day Encoded'] = encoder.fit_transform(df_merged['Time of Day'])

# 6. Transform 'Is Holiday' column (assuming it's boolean)
df_merged['Is Holiday'] = df_merged['Is Holiday'].astype(int)

# 7. Convert 'Previous Trip End Time' to useful time components
df_merged['Previous Trip End Time'] = pd.to_datetime(df_merged['Previous Trip End Time'])
df_merged['Previous Trip End Hour'] = df_merged['Previous Trip End Time'].dt.hour
df_merged['Previous Trip End Day of Week'] = df_merged['Previous Trip End Time'].dt.dayofweek
df_merged['Previous Trip End Month'] = df_merged['Previous Trip End Time'].dt.month

# 8. Label Encoding for 'Trip Duration Category'
label_encoder_duration = LabelEncoder()
df_merged['Trip Duration Category Encoded'] = label_encoder_duration.fit_transform(df_merged['Trip Duration Category'])

# 9. Frequency Encoding for 'Origin-Destination'
origin_destination_counts = df_merged['Origin-Destination'].value_counts().to_dict()
df_merged['Origin-Destination Encoded'] = df_merged['Origin-Destination'].map(origin_destination_counts)

# 10. Drop original non-numerical columns if they are no longer needed
original_non_numerical_columns = ['driver_action', 'Trip Origin', 'Trip Destination', 'Trip Start Time', 
                                  'Trip End Time', 'Day of Week', 'Time of Day', 'Is Holiday', 
                                  'Previous Trip End Time', 'Trip Duration Category', 'Origin-Destination']
df_merged = df_merged.drop(columns=original_non_numerical_columns)

# Display the DataFrame to verify the changes
df_merged.head()

Unnamed: 0,id,Trip ID,driver_id,lat,lng,Hour,Origin Lat,Origin Lng,Destination Lat,Destination Lng,...,Avg Trip Duration,Avg Speed,driver_action_encoded,Day of Week Encoded,Time of Day Encoded,Previous Trip End Hour,Previous Trip End Day of Week,Previous Trip End Month,Trip Duration Category Encoded,Origin-Destination Encoded
0,17,392005,171165,0.467252,-0.165609,10,0.48886,0.082547,-0.497597,-0.435928,...,0.026039,197344.050231,1,5,2,11,3,7,0,33
1,18,392005,243172,0.573114,0.332253,10,0.48886,0.082547,-0.497597,-0.435928,...,0.035407,255134.934336,1,5,2,11,3,7,0,33
2,19,392005,244078,0.686601,-0.14447,10,0.48886,0.082547,-0.497597,-0.435928,...,-0.082415,199564.036556,1,5,2,11,3,7,0,33
3,21,392005,243392,0.502039,-0.232383,10,0.48886,0.082547,-0.497597,-0.435928,...,0.012019,192644.565309,1,5,2,11,3,7,0,33
4,23,392005,171165,0.467252,-0.165609,10,0.48886,0.082547,-0.497597,-0.435928,...,0.026039,197344.050231,1,5,2,11,3,7,0,33


# Set the number of rows you want in your subset
subset_size = 100000  # Adjust this based on your computational resources

# Randomly sample from the DataFrame
df_merged = df_merged.sample(n=subset_size, random_state=1)

In [4]:
# Function to replace spaces with underscores in column names
def replace_spaces_with_underscores(df):
    df.columns = [col.replace(' ', '_') for col in df.columns]
    return df

# Applying the function to your DataFrame
df_merged = replace_spaces_with_underscores(df_merged)

df_merged

Unnamed: 0,id,Trip_ID,driver_id,lat,lng,Hour,Origin_Lat,Origin_Lng,Destination_Lat,Destination_Lng,...,Avg_Trip_Duration,Avg_Speed,driver_action_encoded,Day_of_Week_Encoded,Time_of_Day_Encoded,Previous_Trip_End_Hour,Previous_Trip_End_Day_of_Week,Previous_Trip_End_Month,Trip_Duration_Category_Encoded,Origin-Destination_Encoded
0,17,392005,171165,0.467252,-0.165609,10,0.488860,0.082547,-0.497597,-0.435928,...,0.026039,197344.050231,1,5,2,11,3,7,0,33
1,18,392005,243172,0.573114,0.332253,10,0.488860,0.082547,-0.497597,-0.435928,...,0.035407,255134.934336,1,5,2,11,3,7,0,33
2,19,392005,244078,0.686601,-0.144470,10,0.488860,0.082547,-0.497597,-0.435928,...,-0.082415,199564.036556,1,5,2,11,3,7,0,33
3,21,392005,243392,0.502039,-0.232383,10,0.488860,0.082547,-0.497597,-0.435928,...,0.012019,192644.565309,1,5,2,11,3,7,0,33
4,23,392005,171165,0.467252,-0.165609,10,0.488860,0.082547,-0.497597,-0.435928,...,0.026039,197344.050231,1,5,2,11,3,7,0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1554500,1557736,517948,243774,-1.116936,3.524931,11,-1.529919,3.262599,-0.485125,2.866731,...,0.019337,239364.383468,1,6,2,13,4,9,0,59
1554501,1557737,517948,245447,-1.843909,2.783971,11,-1.529919,3.262599,-0.485125,2.866731,...,0.037682,201045.736898,1,6,2,13,4,9,0,59
1554502,1557738,517948,239866,-1.600805,2.746854,11,-1.529919,3.262599,-0.485125,2.866731,...,-0.284521,214846.010819,1,6,2,13,4,9,0,59
1554503,1557739,517948,243774,-1.116936,3.524931,11,-1.529919,3.262599,-0.485125,2.866731,...,0.019337,239364.383468,1,6,2,13,4,9,0,59


In [5]:
# Separate predictors (X) and target variable (y)
X = df_merged.drop(columns=['driver_action_encoded'])
y = df_merged['driver_action_encoded']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1243604, 34)
y_train shape: (1243604,)
X_test shape: (310901, 34)
y_test shape: (310901,)


## XGBoost Model 

All Variables

In [6]:
# Base XGBoost classifier
base_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss'
)

# Parameter grid
param_grid = {
    'max_depth': [6, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 5, 10],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Randomized Search Cross-Validation
random_search = RandomizedSearchCV(
    estimator=base_classifier,
    param_distributions=param_grid,
    n_iter=10,  # adjust as needed
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the random search model
random_search.fit(X_train, y_train)



Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, reg_alpha=0.5, reg_lambda=1.5, subsample=1.0; total time= 1.7min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, reg_alpha=0.5, reg_lambda=1.5, subsample=1.0; total time= 1.9min
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, max_depth=10, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.8; total time= 2.0min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, reg_alpha=0.5, reg_lambda=1.5, subsample=1.0; total time= 2.1min
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, max_depth=10, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.8; total time= 2.1min
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, ma

Evaluate Model

In [7]:
# Best parameters found
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Evaluate on test set
y_pred = random_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.2f}')

# Classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Confusion matrix
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))



Best Parameters: {'subsample': 1.0, 'reg_lambda': 1.5, 'reg_alpha': 0.5, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Test Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.07      0.12      5026
           1       0.98      1.00      0.99    305875

    accuracy                           0.98    310901
   macro avg       0.84      0.53      0.56    310901
weighted avg       0.98      0.98      0.98    310901


Confusion Matrix:
[[   336   4690]
 [   146 305729]]


Selected Variables

In [11]:
# Specify the feature columns
feature_columns = ['id', 'Trip_ID', 'driver_id', 'Hour', 'Start_Hour', 'Geodesic_Distance',
                   'Haversine_Distance', 'Average_Speed', 'Time_Since_Last_Trip',
                   'Origin-Destination_Count', 'Origin_Distance_to_City_Center',
                   'Destination_Distance_to_City_Center', 'Driver_Experience', 'Avg_Speed',
                   'Day_of_Week_Encoded', 'Previous_Trip_End_Hour',
                   'Previous_Trip_End_Day_of_Week', 'Previous_Trip_End_Month', 'Origin-Destination_Encoded']

# Separate predictors (X) and target variable (y)
X = df_merged[feature_columns]
y = df_merged['driver_action_encoded']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1243604, 19)
y_train shape: (1243604,)
X_test shape: (310901, 19)
y_test shape: (310901,)


In [12]:
# Base XGBoost classifier
base_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    tree_method='hist'  # Use histogram-based algorithm to reduce memory usage
)


In [13]:
# Parameter grid
param_grid = {
    'max_depth': [6, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 5, 10],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}


In [14]:
# Randomized Search Cross-Validation
random_search = RandomizedSearchCV(
    estimator=base_classifier,
    param_distributions=param_grid,
    n_iter=10,  # Adjust as needed
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the random search model
random_search.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, max_depth=10, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.8; total time= 1.8min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, reg_alpha=0.5, reg_lambda=1.5, subsample=1.0; total time= 1.9min
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, max_depth=10, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.8; total time= 1.9min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, reg_alpha=0.5, reg_lambda=1.5, subsample=1.0; total time= 2.0min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, reg_alpha=0.5, reg_lambda=1.5, subsample=1.0; total time= 2.0min
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, ma

In [15]:
# Best parameters found
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Evaluate on test set
y_pred = random_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.2f}')

# Classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Confusion matrix
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Best Parameters: {'subsample': 1.0, 'reg_lambda': 1.5, 'reg_alpha': 0.5, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Test Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.05      0.10      5026
           1       0.98      1.00      0.99    305875

    accuracy                           0.98    310901
   macro avg       0.81      0.53      0.54    310901
weighted avg       0.98      0.98      0.98    310901


Confusion Matrix:
[[   267   4759]
 [   158 305717]]


# Random Forest

All Variables

In [9]:
# Base Random Forest classifier
base_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Parameter grid for Randomized Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Randomized Search Cross-Validation
random_search_rf = RandomizedSearchCV(
    estimator=base_rf,
    param_distributions=param_grid,
    n_iter=10,  # adjust as needed
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the random search model
random_search_rf.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   2.4s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_esti



[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=82.7min
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=84.1min
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=84.6min
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=49.7min
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=20.6min
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=53.8min
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=54.5min
[CV] END bootstrap=True, max_depth=3

9 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.10/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.1

Evaluation

In [10]:
# Best parameters found
best_params_rf = random_search_rf.best_params_
print("Best Parameters for Random Forest:", best_params_rf)

# Evaluate on test set
y_pred_rf = random_search_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Test Accuracy for Random Forest: {accuracy_rf:.2f}')

# Classification report
print('\nClassification Report for Random Forest:')
print(classification_report(y_test, y_pred_rf))

# Confusion matrix
print('\nConfusion Matrix for Random Forest:')
print(confusion_matrix(y_test, y_pred_rf))


Best Parameters for Random Forest: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
Test Accuracy for Random Forest: 0.98

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.85      0.05      0.10      5026
           1       0.98      1.00      0.99    305875

    accuracy                           0.98    310901
   macro avg       0.92      0.53      0.55    310901
weighted avg       0.98      0.98      0.98    310901


Confusion Matrix for Random Forest:
[[   263   4763]
 [    47 305828]]


Selected Variables

In [16]:
# Specify the feature columns
feature_columns = ['id', 'Trip_ID', 'driver_id', 'Hour', 'Start_Hour', 'Geodesic_Distance',
                   'Haversine_Distance', 'Average_Speed', 'Time_Since_Last_Trip',
                   'Origin-Destination_Count', 'Origin_Distance_to_City_Center',
                   'Destination_Distance_to_City_Center', 'Driver_Experience', 'Avg_Speed',
                   'Day_of_Week_Encoded', 'Previous_Trip_End_Hour',
                   'Previous_Trip_End_Day_of_Week', 'Previous_Trip_End_Month', 'Origin-Destination_Encoded']

# Separate predictors (X) and target variable (y)
X = df_merged[feature_columns]
y = df_merged['driver_action_encoded']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1243604, 19)
y_train shape: (1243604,)
X_test shape: (310901, 19)
y_test shape: (310901,)


In [17]:
# Base Random Forest classifier
base_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Parameter grid for Randomized Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


In [18]:
# Randomized Search Cross-Validation
random_search_rf = RandomizedSearchCV(
    estimator=base_rf,
    param_distributions=param_grid,
    n_iter=10,  # Adjust as needed
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the random search model
random_search_rf.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   0.5s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_esti



[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=29.4min
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=29.6min
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   1.8s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   1.8s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=30.1min
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=36.9min
[CV] END bootstrap=False, max_depth=20, max_fe

9 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.10/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/moraa/Documents/10_academy/Week-8/venv/lib/python3.1

In [19]:
# Best parameters found
best_params_rf = random_search_rf.best_params_
print("Best Parameters for Random Forest:", best_params_rf)

# Evaluate on test set
y_pred_rf = random_search_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Test Accuracy for Random Forest: {accuracy_rf:.2f}')

# Classification report
print('\nClassification Report for Random Forest:')
print(classification_report(y_test, y_pred_rf))

# Confusion matrix
print('\nConfusion Matrix for Random Forest:')
print(confusion_matrix(y_test, y_pred_rf))


Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': True}
Test Accuracy for Random Forest: 0.98

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.87      0.05      0.09      5026
           1       0.98      1.00      0.99    305875

    accuracy                           0.98    310901
   macro avg       0.93      0.52      0.54    310901
weighted avg       0.98      0.98      0.98    310901


Confusion Matrix for Random Forest:
[[   245   4781]
 [    36 305839]]
