### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

#from lazypredict.Supervised import LazyClassifier
import joblib

### Load Dataset

In [2]:
df = pd.read_csv("synthetic_combined_dataset_lift_predictive.csv")
print(df.shape)
print(df.head())

(10000, 4)
   ball_bearing  vibration   humidity risk_level
0     53.287651  31.511211  74.755610     Medium
1     54.138179  23.779652  73.539365     Medium
2     82.554662   8.822824  74.516011        Low
3     53.451727  26.665425  73.719980     Medium
4     60.749528  33.679418  73.780989     Medium


### Preprocess data

#### Check for missing values

In [3]:
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
ball_bearing    0
vibration       0
humidity        0
risk_level      0
dtype: int64


#### Ensure numeric columns are of proper type

In [4]:
numeric_columns = ['ball_bearing', 'vibration', 'humidity']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

#### Drop any rows with missing values (if any)

In [5]:
df.dropna(inplace=True)

#### Adjust noise_factor based on the expected sensor precision.

In [6]:
noise_factor_ball = 5.0  # Adjust these factors based on expected sensor error
noise_factor_vib = 5.0
noise_factor_hum = 0.2

df['ball_bearing'] = df['ball_bearing'] + np.random.normal(0, noise_factor_ball, df.shape[0])
df['vibration'] = df['vibration'] + np.random.normal(0, noise_factor_vib, df.shape[0])
df['humidity'] = df['humidity'] + np.random.normal(0, noise_factor_hum, df.shape[0])

#### Separate features and target variable

In [7]:
X = df[["ball_bearing", "vibration", "humidity"]]
y = df["risk_level"]

### Encode target variable

In [8]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("\nRisk Level Classes:", label_encoder.classes_)


Risk Level Classes: ['High' 'Low' 'Medium']


### Split data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

### Using LazyPredict to find simple model

In [155]:
lazy_clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy_clf.fit(X_train, X_test, y_train, y_test)

print("\nModel Comparison:")
print(models)

NameError: name 'LazyClassifier' is not defined

### Finding best parameters for Hyperparametric fine tuning

In [154]:
# Define a parameter grid to search over
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2', None]
}

# Instantiate the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Use GridSearchCV to search for the best parameters using 5-fold CV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best CV Score: 0.9977142857142857


### Cross Validation

In [None]:
# Using the best estimator from grid search
best_rf = grid_search.best_estimator_

# Perform 5-fold cross-validation
cv_scores = cross_val_score(best_rf, X, y_encoded, cv=5)
print("CV Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

In [None]:
fine_param_grid = {
    'max_depth': [8, 10, 12],
    'min_samples_split': [8, 10, 12],
    'min_samples_leaf': [3, 5, 7],
}

fine_rf = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'],
                                 max_features=grid_search.best_params_['max_features'],
                                 random_state=42)

fine_grid_search = GridSearchCV(estimator=fine_rf, param_grid=fine_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
fine_grid_search.fit(X_train, y_train)

print("Fine-tuned Best Parameters:", fine_grid_search.best_params_)
print("Fine-tuned Best CV Score:", fine_grid_search.best_score_)

### Train model using Hyperparametric fine tuning

##### Reduce the complexity of the Random Forest by adjusting hyperparameters such as:

max_depth: Limit the maximum depth of the trees.

min_samples_split / min_samples_leaf: Increase the minimum number of samples required to split a node or be at a leaf.

max_features: Limit the number of features to consider when splitting.

In [10]:
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=12,         # restrict depth to reduce complexity
    min_samples_split=8, # require at least 10 samples to split
    min_samples_leaf=3,   # require at least 5 samples at a leaf
    random_state=42
)
clf.fit(X_train, y_train)

### Model Evaluation

In [11]:
y_pred = clf.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

        High       0.99      0.98      0.98       722
         Low       0.99      0.97      0.98      1244
      Medium       0.96      0.98      0.97      1034

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Confusion Matrix:
[[ 707    0   15]
 [   0 1212   32]
 [   8   13 1013]]


### Trying with logistic regression

In [12]:
# L2 regularization with C (inverse regularization strength) set to a lower value
log_reg = LogisticRegression(penalty='l2', C=0.1, solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)
print("Logistic Regression CV Score:", np.mean(cross_val_score(log_reg, X, y_encoded, cv=5)))

Logistic Regression CV Score: 0.9685


In [13]:
y_pred = log_reg.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

        High       0.98      0.98      0.98       722
         Low       0.97      0.98      0.97      1244
      Medium       0.95      0.95      0.95      1034

    accuracy                           0.97      3000
   macro avg       0.97      0.97      0.97      3000
weighted avg       0.97      0.97      0.97      3000

Confusion Matrix:
[[ 706    0   16]
 [   0 1213   31]
 [  14   35  985]]


### Save model

In [14]:
joblib.dump(clf, 'synthetic_risk_model.pkl')
print("\nModel saved as 'synthetic_risk_model.pkl'")


Model saved as 'synthetic_risk_model.pkl'
