In [5]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.pyfunc
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [6]:
# Read the CSV file into a DataFrame, specifying the file path using a raw string literal to handle backslashes
data = pd.read_csv(r"C:\Users\gagan\Downloads\Data\Winter 23-24\Capstone\transformed.csv")

In [7]:
data

Unnamed: 0.1,Unnamed: 0,Sender_Country,Sender_Sector,Bene_Country,USD_amount,Label,Transaction_Type,Sender_Type,Bene_Type,Time,Year,Month,Day
0,1,USA,15287.0,CANADA,622.78,0,QUICK-PAYMENT,JPMC-CLIENT,CLIENT,37440,2022,3,15
1,3,USA,38145.0,USA,989.09,0,PAY-CHECK,JPMC-CLIENT,JPMC-CLIENT,37440,2022,3,15
2,5,USA,14645.0,OTHERS,543.87,0,PAY-CHECK,JPMC-CLIENT,JPMC-COMPANY,37440,2022,3,15
3,8,OTHERS,25040.0,USA,336.55,0,MAKE-PAYMENT,CLIENT,JPMC-CLIENT,37440,2022,3,15
4,9,USA,34576.0,OTHERS,265.66,0,QUICK-PAYMENT,JPMC-CLIENT,COMPANY,37440,2022,3,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063393,1498172,USA,24701.0,USA,159.64,0,MAKE-PAYMENT,JPMC-CLIENT,BILL-COMPANY,47844,2054,9,14
1063394,1498173,USA,4953.0,USA,120.07,0,QUICK-PAYMENT,JPMC-CLIENT,CLIENT,3444,2054,9,15
1063395,1498174,USA,34784.0,USA,25.32,0,MAKE-PAYMENT,JPMC-CLIENT,BILL-COMPANY,4644,2054,9,15
1063396,1498175,USA,23822.0,USA,610.91,0,QUICK-PAYMENT,JPMC-CLIENT,CLIENT,46644,2054,9,15


In [3]:
# Define the pipeline
transformed_pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('standard', StandardScaler(), ["Time"]),
            ('minmax', MinMaxScaler(), ['Year', 'Month', 'Day']),
            ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Sender_Country', 'Bene_Country', 'Transaction_Type', 'Sender_Type', 'Bene_Type'])
        ],
        remainder='drop'  # Drop columns not explicitly transformed
    )),
    ('concatenation', FunctionTransformer(lambda X: pd.concat([pd.DataFrame(X), data[['Sender_Sector', 'USD_amount']]], axis=1)))
])

# Fit and transform the transformed columns
transformed_data = transformed_pipeline.fit_transform(data)


In [4]:
# Display the transformed DataFrame
data = pd.concat([data['Label','Transaction_id'],transformed_data],axis=1)
data.columns = data.columns.astype(str)

KeyError: ('Label', 'Transaction_id')

# Supervised Classification

In [5]:
def balance_df(df):
    # Assuming your data is in a DataFrame called 'data'
    X = df.drop('Label', axis=1)
    y = df['Label']

    # Using SMOTE to oversample the minority class
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Creating a new balanced DataFrame
    df = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=['Label'])], axis=1)

    return df

# Apply the 'balance_df' function to balance the data (explained in the comments of the 'balance_df' function)
data = balance_df(data)

In [6]:
# Split the data into features (X) and target variable (y)
X = data.drop('Label', axis=1)  # Features
y = data['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the DecisionTreeClassifier model
dt_model_regularized = DecisionTreeClassifier(max_depth=7, min_samples_split=5, min_samples_leaf=2, random_state=42)

# Fit the DecisionTreeClassifier model to the training data
dt_model_regularized.fit(X_train, y_train)

# Log preprocessing pipeline as an artifact
mlflow.sklearn.log_model(transformed_pipeline, "preprocessing_pipeline")

# Log trained model as an artifact without specifying the flavor
mlflow.sklearn.log_model(dt_model_regularized, "trained_model", registered_model_name="Regularized_DecisionTree")

# Get the run ID
run_id = mlflow.active_run().info.run_id

# Construct the model URI
model_uri = f"runs:/{run_id}/trained_model"
# preprocessing_pipeline = f"runs:/{run_id}/preprocessing_pipeline"

print("Model URI:", model_uri)
# print("Preprocessing Pipeline URI:", preprocessing_pipeline)




Model URI: runs:/c388a373054149939a5b98747ad94b7c/trained_model


Registered model 'Regularized_DecisionTree' already exists. Creating a new version of this model...
Created version '5' of model 'Regularized_DecisionTree'.


In [6]:
# Split the data into features (X) and target variable (y)
X = data.drop('Label', axis=1)  # Features
y = data['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define a PyFunc wrapper for the DecisionTreeClassifier model
class DecisionTreePyFunc(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        # Load the DecisionTreeClassifier model
        self.model = mlflow.sklearn.load_model(context.artifacts["trained_model"])
        
    def predict(self, context, model_input):
        # Use the loaded model to make predictions
        return self.model.predict(model_input)

# Define the DecisionTreeClassifier model
dt_model_regularized = DecisionTreeClassifier(max_depth=7, min_samples_split=5, min_samples_leaf=2, random_state=42)

# Fit the DecisionTreeClassifier model to the training data
dt_model_regularized.fit(X_train, y_train)

# Log preprocessing pipeline as an artifact
mlflow.sklearn.log_model(transformed_pipeline, "preprocessing_pipeline")

# Log trained model as an artifact without specifying the flavor
mlflow.sklearn.log_model(dt_model_regularized, "trained_model", registered_model_name="Regularized_DecisionTree")

# Get the run ID
run_id = mlflow.active_run().info.run_id

# Construct the model URI
model_uri = f"runs:/{run_id}/trained_model"
preprocessing_pipeline = f"runs:/{run_id}/preprocessing_pipeline"

print("Model URI:", model_uri)
print("preprocessing_pipeline URI:", preprocessing_pipeline)




Model URI: runs:/3a88aa504884461fa8df8c2526f1244e/trained_model
preprocessing_pipeline URI: runs:/3a88aa504884461fa8df8c2526f1244e/preprocessing_pipeline


Registered model 'Regularized_DecisionTree' already exists. Creating a new version of this model...
Created version '4' of model 'Regularized_DecisionTree'.


In [None]:
# Split the data into features (X) and target variable (y)
X = data.drop('Label', axis=1)  # Features
y = data['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define a PyFunc wrapper for the DecisionTreeClassifier model
class DecisionTreePyFunc(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        # Load the DecisionTreeClassifier model
        self.model = mlflow.sklearn.load_model(context.artifacts["trained_model"])
        
    def predict(self, context, model_input):
        # Use the loaded model to make predictions
        return self.model.predict(model_input)

# Define the DecisionTreeClassifier model
dt_model_regularized = DecisionTreeClassifier(max_depth=7, min_samples_split=5, min_samples_leaf=2, random_state=42)

# Fit the DecisionTreeClassifier model to the training data
dt_model_regularized.fit(X_train, y_train)

# Log preprocessing pipeline as an artifact
mlflow.sklearn.log_model(transformed_pipeline, "preprocessing_pipeline")

# Log trained model as an artifact without specifying the flavor
mlflow.sklearn.log_model(dt_model_regularized, "trained_model", registered_model_name="Regularized_DecisionTree")

# Get the run ID
run_id = mlflow.active_run().info.run_id

# Construct the model URI
model_uri = f"runs:/{run_id}/trained_model"

print("Model URI:", model_uri)


# Decision Tree (Regularized)

In [None]:
# Split the data into features (X) and target variable (y)
X = data.drop('Label', axis=1)  # Features
y = data['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train decision tree model with regularization
dt_model_regularized = DecisionTreeClassifier(max_depth=7,min_samples_split=5, min_samples_leaf=2, random_state=42)
dt_model_regularized.fit(X_train, y_train)

class DecisionTreePyFunc(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        # Load the DecisionTreeClassifier model
        self.model = mlflow.sklearn.load_model(context.artifacts["trained_model"])
        
    def predict(self, context, model_input):
        # Use the loaded model to make predictions
        return self.model.predict(model_input)

# Define a predict function for the model
def predict(X):
    return dt_model_regularized.predict(X)

# Assign the predict function to the model object
dt_model_regularized.predict = predict


In [None]:
with mlflow.start_run():
    # Log preprocessing pipeline as an artifact
    mlflow.sklearn.log_model(transformed_pipeline, "preprocessing_pipeline")

    # Log trained model as an artifact
    mlflow.sklearn.log_model(dt_model_regularized, "trained_model")

    # Get the run ID
    run_id = mlflow.active_run().info.run_id

    # Construct the model URI
    model_uri = f"runs:/{run_id}/model"

print("Model URI:", model_uri)