In [1]:
import pandas as pd
import os
from imblearn.over_sampling import SMOTE

In [2]:
# Get the current working directory
current_dir = os.getcwd()

# The notebook is expected to start one level below the project root (the
# folder that holds `src/`, imported in the next cell). Only step up when
# `src/` is not already visible from here, so that re-running this cell does
# not climb one more level each time.
if os.path.isdir(os.path.join(current_dir, "src")):
    parent_dir = current_dir
else:
    parent_dir = os.path.dirname(current_dir)

# Change the working directory to the project root (not the data folder itself);
# later cells use paths relative to it, e.g. "data/data_clean.xlsx".
os.chdir(parent_dir)

In [3]:
from src import utils as hf

In [4]:
# Load the cleaned dataset; the path is relative to the current working directory.
file_path = "data/data_clean.xlsx"
data_clean = pd.read_excel(file_path)

In [5]:
# (rows, columns) of the loaded dataset
data_clean.shape

(291442, 28)

In [7]:
# Column names, used below to pick the feature and target columns
data_clean.columns

Index(['Unnamed: 0', 'THICKNESS', 'WIDTH', 'YS', 'UTS', 'EL', 'C', 'MN', 'S',
       'P', 'SI', 'AL', 'N', 'TI', 'B', 'CR', 'V', 'NB', 'MO', 'CR TDC',
       'Application_Automotive Internal',
       'Application_Drum,Bareels,Containers', 'Application_Export',
       'Application_Furnitures and Panels', 'Application_General Engineering',
       'Application_Other', 'Application_Tubes', 'Application_White Goods'],
      dtype='object')

In [6]:
# Feature columns: everything in data_clean except the 'Unnamed: 0' column
# and the target 'CR TDC'.
X_Columns = [
    'THICKNESS',
    'WIDTH',
    'YS',
    'UTS',
    'EL',
    'C',
    'MN',
    'S',
    'P',
    'SI',
    'AL',
    'N',
    'TI',
    'B',
    'CR',
    'V',
    'NB',
    'MO',
    # one-hot application indicator columns
    'Application_Automotive Internal',
    'Application_Drum,Bareels,Containers',
    'Application_Export',
    'Application_Furnitures and Panels',
    'Application_General Engineering',
    'Application_Other',
    'Application_Tubes',
    'Application_White Goods',
]

# Target column, kept as a list so that data_clean[Y_Column] yields a DataFrame.
Y_Column = ['CR TDC']

## Model building without scaling

In [None]:
# Feature matrix: the columns listed in X_Columns
X = data_clean[X_Columns]

In [None]:
# Target selected with a list of column names, so y is a one-column DataFrame (not a Series)
y = data_clean[Y_Column]

In [None]:
# Train/test split through the project helper; how the split is made is defined in src.utils (not shown here)
X_train, X_test, y_train, y_test = hf.split_data(X,y)

In [None]:
# Baseline run on the un-resampled data using the helper from src.utils (see that module for what it trains and reports)
results = hf.train_and_evaluate_models(X_train, X_test, y_train, y_test)

## Upscaling (random oversampling) to handle the imbalanced dataset

In [None]:
from sklearn.utils import resample
import traceback

def upscale_dataframe_with_random_oversampling(df, target_column, random_state=42):
    """
    Balance a DataFrame by randomly oversampling every under-represented class.

    Each class found in ``target_column`` is brought up to the size of the
    largest class. All original rows are kept; only the shortfall of a class is
    drawn (with replacement) from the rows of that class, so no observation is
    lost and multi-class targets end up with equal class counts.

    Parameters:
        df (DataFrame): Input DataFrame.
        target_column (str): Name of the target column containing class labels.
        random_state (int): Seed used for every draw, for reproducible results.

    Returns:
        DataFrame: Upscaled DataFrame. The original rows come first, followed by
        the sampled duplicates. Index labels are preserved, so a duplicated row
        carries the index label of the row it was copied from. Rows whose
        target is missing are carried over unchanged. An empty or already
        balanced input is returned as an unchanged copy.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``. Errors are
        raised instead of being printed and turned into a ``None`` result.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in DataFrame")

    # Rows per class (missing targets are not counted as a class).
    class_sizes = df[target_column].value_counts()
    if class_sizes.empty:
        return df.copy()
    target_size = int(class_sizes.max())

    # Start from the untouched data and append only what each class is missing.
    pieces = [df]
    for label, size in class_sizes.items():
        shortfall = target_size - int(size)
        if shortfall <= 0:
            continue  # already as large as the largest class
        class_rows = df[df[target_column] == label]
        pieces.append(
            class_rows.sample(n=shortfall, replace=True, random_state=random_state)
        )

    df_upsampled = pd.concat(pieces)

    print("DataFrame upscaled successfully using random oversampling!")
    return df_upsampled

# Example usage:
# Assuming df is your DataFrame and 'target_column' is the name of your target column
# df_upscaled = upscale_dataframe_with_random_oversampling(df, 'target_column')


In [None]:
import pickle

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
def _as_1d_target(y):
    """Return ``y`` flattened to 1-D when it is a single-column 2-D object, else unchanged."""
    shape = getattr(y, "shape", None)
    if shape is not None and len(shape) == 2 and shape[1] == 1:
        return y.to_numpy().ravel() if hasattr(y, "to_numpy") else y.ravel()
    return y


def train_and_evaluate_models(X_train, X_test, y_train, y_test, app):
    """
    Fit each candidate classifier, save it to ``models/<app>_model.pkl`` and
    print its classification report for the test split.

    Parameters:
        X_train, X_test: Feature matrices used for fitting and for evaluation.
        y_train, y_test: Matching targets. A single-column DataFrame / 2-D array
            is flattened to 1-D before use, which avoids scikit-learn's
            column-vector warning.
        app (str): Name used to build the pickle file name.

    Returns:
        dict: Maps each model name to its text classification report.
    """
    models = {
        # Seeded so that repeated runs give the same forest.
        'Random Forest': RandomForestClassifier(random_state=42),
    }

    y_train = _as_1d_target(y_train)
    y_test = _as_1d_target(y_test)

    # The output folder may not exist on a fresh checkout.
    os.makedirs("models", exist_ok=True)

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)

        # NOTE: the file name depends only on `app`, so with more than one entry
        # in `models` each model would overwrite the previous pickle.
        with open(f"models/{app}_model.pkl", "wb") as f:
            pickle.dump(model, f)

        # Predict after the file handle is closed; prediction does not need it.
        y_pred = model.predict(X_test)
        cr = classification_report(y_test, y_pred)

        print("_"*30)
        print("Model Name: ",name)
        print("CLassification Report",cr)

        results[name]=cr

    return results

In [None]:
# Application indicator columns; dropped from each per-application subset below.
one_hot_encoded_cols = [
    'Application_Automotive Internal',
    'Application_Drum,Bareels,Containers',
    'Application_Export',
    'Application_Furnitures and Panels',
    'Application_General Engineering',
    'Application_Other',
    'Application_Tubes',
    'Application_White Goods',
]

In [None]:
# Feature columns for the per-application models: the feature list without the
# application indicator columns.
X_Columns_updated = [
    'THICKNESS', 'WIDTH',
    'YS', 'UTS', 'EL',
    'C', 'MN', 'S', 'P', 'SI', 'AL', 'N', 'TI', 'B', 'CR', 'V', 'NB', 'MO',
]

In [None]:
# Preview the first rows of the loaded dataset
data_clean.head()

In [None]:
# Applications to build one model each for.
# NOTE(review): 'Application_Automotive Exposed-OEM' is not among the columns
# printed by `data_clean.columns` earlier in this notebook; selecting it from
# data_clean raises KeyError unless that column exists - confirm.
applications_list = [
    'Application_Furnitures and Panels',
    'Application_Automotive Internal',
    'Application_Export',
    'Application_Automotive Exposed-OEM',
    'Application_White Goods',
    'Application_General Engineering',
    'Application_Tubes',
    'Application_Drum,Bareels,Containers',
    'Application_Other',
]

In [None]:
# Train one model per application and keep every report (previously each
# iteration overwrote the result of the one before).
results_by_app = {}
for app in applications_list:
    # An application without an indicator column cannot be selected; skip it
    # instead of failing the whole loop with a KeyError.
    if app not in data_clean.columns:
        print(f"Skipping {app}: no such column in data_clean")
        continue

    data_AU = data_clean[data_clean[app]==1].drop(one_hot_encoded_cols,axis=1)
    if data_AU.empty:
        print(f"Skipping {app}: no rows for this application")
        continue

    X = data_AU[X_Columns_updated]
    y = data_AU[Y_Column]
    X_train, X_test, y_train, y_test = hf.split_data(X,y)

    # Oversample the TRAINING part only, after the split: duplicating rows
    # before splitting would put copies of the same row in both train and test
    # and inflate the scores. The frame is rebuilt so this works whether the
    # helper returns pandas objects or plain arrays.
    train_df = pd.DataFrame(X_train, columns=X_Columns_updated).reset_index(drop=True)
    train_df[Y_Column[0]] = pd.DataFrame(y_train).to_numpy().ravel()
    data_clean_us = upscale_dataframe_with_random_oversampling(train_df, Y_Column[0])
    if data_clean_us is not None:  # fall back to the un-resampled split if oversampling failed
        X_train = data_clean_us[X_Columns_updated]
        y_train = data_clean_us[Y_Column]

    results_AU = train_and_evaluate_models(X_train, X_test, y_train, y_test,app)
    results_by_app[app] = results_AU

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_multiclass_roc_auc(y_test, y_score, n_classes):
    """
    Plot ROC_AUC curve for a multiclass classification problem.

    Draws one ROC curve per class together with the micro-averaged and
    macro-averaged curves, then shows the figure.

    Parameters:
    y_test (array-like): True labels for the test set as a 2-D indicator
        matrix with one column per class (column ``i`` is read for class ``i``).
    y_score (array-like): Predicted probabilities for the test set, same shape
        as ``y_test``.
    n_classes (int): Number of classes in the classification problem.

    Raises:
    ValueError: If the inputs are not 2-D with identical shapes, or if
        ``n_classes`` does not fit the number of columns.
    """
    # Column slicing below needs ndarray indexing, so accept DataFrames/lists too.
    y_test = np.asarray(y_test)
    y_score = np.asarray(y_score)
    if y_test.ndim != 2 or y_test.shape != y_score.shape:
        raise ValueError(
            "y_test and y_score must be 2-D with identical shapes "
            f"(got {y_test.shape} and {y_score.shape}); binarize the labels first"
        )
    if not 0 < n_classes <= y_test.shape[1]:
        raise ValueError(
            f"n_classes must be between 1 and {y_test.shape[1]}, got {n_classes}"
        )

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area (all class decisions pooled)
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves onto that common grid
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot ROC curve
    plt.figure(figsize=(10, 7))
    lw = 2

    # Plot micro-average ROC curve
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    # Plot macro-average ROC curve
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    # Colours are reused cyclically so that every class is drawn; pairing the
    # classes with this short list via zip() silently dropped classes 3 and up.
    colors = ['aqua', 'darkorange', 'cornflowerblue']
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i], color=colors[i % len(colors)], lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    # Chance-level diagonal for reference
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for multiclass classification')
    plt.legend(loc="lower right")
    plt.show()


In [None]:
# Single-application run, mirroring one iteration of the per-application loop.
app = 'Application_Other'
data_AU = data_clean[data_clean[app]==1].drop(one_hot_encoded_cols,axis=1)

X = data_AU[X_Columns_updated]
y = data_AU[Y_Column]
X_train, X_test, y_train, y_test = hf.split_data(X,y)

# Oversample the training part only, after the split, so duplicated rows cannot
# end up in the test set. The frame is rebuilt so this works whether the helper
# returns pandas objects or plain arrays.
train_df = pd.DataFrame(X_train, columns=X_Columns_updated).reset_index(drop=True)
train_df[Y_Column[0]] = pd.DataFrame(y_train).to_numpy().ravel()
data_clean_us = upscale_dataframe_with_random_oversampling(train_df, Y_Column[0])
if data_clean_us is not None:  # fall back to the un-resampled split if oversampling failed
    X_train = data_clean_us[X_Columns_updated]
    y_train = data_clean_us[Y_Column]

# Number of distinct target classes in this subset. `y_test.shape[1]` is the
# number of target columns (1), not the number of classes.
n_classes = data_AU[Y_Column[0]].nunique()

# train_and_evaluate_models takes five arguments; passing n_classes as a sixth
# raised a TypeError.
results_AU = train_and_evaluate_models(X_train, X_test, y_train, y_test,app)

In [None]:
# Oversample the current application subset (data_AU) on its target column, for inspection in the cells below
data_clean_us = upscale_dataframe_with_random_oversampling(data_AU,'CR TDC')

In [None]:
# Inspect the oversampled frame
data_clean_us

In [None]:
# (rows, columns) of the application subset before oversampling
data_AU.shape

In [None]:
# Number of distinct 'CR TDC' values in the application subset
len(data_AU['CR TDC'].unique())

In [None]:
# Scratch arithmetic - presumably (number of classes) x (rows in the largest class),
# i.e. the row count expected after balancing every class; TODO confirm and
# compute it from the data instead of using literals.
67*22336

In [None]:
# Rows per 'CR TDC' class in the application subset (shows how imbalanced the target is)
data_AU['CR TDC'].value_counts()