In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import io
from ipywidgets import FileUpload

# Step 1: Create a file upload button
# NOTE(review): Step 3 below reads `upload.value` in this same cell, i.e.
# right after the widget is constructed, so no file can have been selected
# yet. The recorded output of this cell ("No file uploaded or file is empty.")
# is consistent with that. Consider running Steps 2+ from a separate cell
# once a file has been chosen.
upload = FileUpload(accept='.csv', multiple=False)  # Accept only .csv files
# NOTE(review): presumably Jupyter only renders a bare expression when it is
# the last statement of a cell; more code follows here, so the widget may
# never be shown -- confirm, and switch to an explicit display call if so.
upload

# Step 2: Function to load the uploaded CSV file into a DataFrame
def load_file(uploaded_file):
    """Read the first uploaded CSV from a FileUpload-like widget.

    Args:
        uploaded_file: Object whose ``value`` attribute holds the upload(s).
            Two layouts are accepted, because the widget's value format
            differs between ipywidgets releases:
            a mapping ``{filename: {'content': <bytes-like>, ...}}`` and a
            sequence of records ``{'name': ..., 'content': <bytes-like>}``.

    Returns:
        A DataFrame parsed from the first upload, or None when nothing was
        uploaded or the uploaded file contains no data.
    """
    uploads = uploaded_file.value
    if not uploads:
        return None  # Nothing has been uploaded yet

    if isinstance(uploads, dict):
        # Mapping layout: the key is the file name.
        filename, record = next(iter(uploads.items()))
    else:
        # Sequence layout: the file name lives inside each record.
        record = uploads[0]
        filename = record['name']

    try:
        # BytesIO accepts any bytes-like object (bytes or memoryview).
        df = pd.read_csv(io.BytesIO(record['content']))
    except pd.errors.EmptyDataError:
        # A zero-byte / header-less file is reported the same way as
        # "no upload" so the caller's None check covers it.
        print(f"{filename} contains no data.")
        return None

    print(f"Loaded {filename} successfully.")
    return df

# Step 3: Load the file and display the DataFrame
# load_file returns None when there is nothing to read, so handle that
# case first and show the first rows otherwise.
df = load_file(upload)
if df is None:
    print("No file uploaded or file is empty.")
else:
    print("DataFrame Preview:")
    print(df.head())

# Step 4: Check for missing values
if df is not None:
    # Report per-column null counts; nothing is imputed or dropped here.
    print("Missing values in each column:\n", df.isnull().sum())

    # Step 5: Handle categorical data
    # One-hot encode 'type', dropping the first level so the dummies are not
    # linearly dependent. dtype=int matters: recent pandas versions produce
    # boolean dummy columns by default, and the later
    # select_dtypes(include=[np.number]) calls (heatmap and scaling) would
    # silently discard boolean columns, removing 'type' from the features.
    if 'type' in df.columns:
        df = pd.get_dummies(df, columns=['type'], drop_first=True, dtype=int)
    else:
        print("Column 'type' not found in dataset.")
    # Step 6: Outlier detection and removal using Interquartile Range (IQR)
    def remove_outliers(df, column):
        """Keep only the rows whose ``column`` value lies inside the IQR fences.

        The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, both inclusive.
        Rows where the value is missing fail the comparison and are
        therefore dropped as well.

        Args:
            df: Input DataFrame (not modified).
            column: Name of the numeric column to filter on.

        Returns:
            A filtered DataFrame that keeps the original index labels.
        """
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        # Series.between is inclusive on both ends by default.
        inside_fences = values.between(q1 - 1.5 * spread, q3 + 1.5 * spread)
        return df[inside_fences]

    # Remove outliers for "amount"
    if 'amount' not in df.columns:
        print("Column 'amount' not found in dataset.")
    else:
        df = remove_outliers(df, 'amount')

    # Step 7: Ensure the dataset contains the required columns
    required_columns = [
        'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
        'oldbalanceDest', 'newbalanceDest', 'isFraud',
    ]
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        # Fail fast: nothing below can run without the full schema.
        raise ValueError("Dataset does not contain all required columns.")

    # 'isFraud' is the target; every other remaining column is a feature.
    y = df['isFraud']
    X = df.drop(columns=['isFraud'])

    # Step 8: Detect multi-collinearity with a heatmap
    if df.empty:
        print("DataFrame is empty after removing outliers.")
    else:
        # Correlation is only defined for numeric columns.
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.empty:
            print("No numeric data available for correlation.")
        else:
            corr_matrix = numeric_df.corr()
            plt.figure(figsize=(10, 8))
            sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
            plt.title("Correlation Matrix for Feature Selection")
            plt.show()

    # Step 9: Split data into training and test sets
    # Stratify on the target so both splits keep the same class ratio; an
    # unstratified split of a rare positive class can leave the test set with
    # very few (or no) positives, which makes precision/recall/ROC-AUC
    # meaningless. Stratification needs at least two rows per class, so fall
    # back to a plain random split when that does not hold.
    class_counts = y.value_counts()
    can_stratify = bool(class_counts.min() >= 2)  # NaN (empty y) compares False
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y if can_stratify else None,
    )

    # Step 10: Standardize the features
    # Only numeric columns are scaled and used for modelling; any remaining
    # non-numeric columns are dropped here. The scaler is fitted on the
    # training split only and then applied to the test split, so no test-set
    # statistics leak into training.
    scaler = StandardScaler()
    X_train_numeric = X_train.select_dtypes(include=[np.number])
    X_test_numeric = X_test.select_dtypes(include=[np.number])
    X_train_scaled = scaler.fit_transform(X_train_numeric)
    X_test_scaled = scaler.transform(X_test_numeric)

    # Step 11: Define machine learning models
    # The train/test split is seeded with random_state=42; seed every
    # estimator that accepts random_state the same way so that repeated runs
    # of this cell produce the same scores.
    models = {
        'Logistic Regression': LogisticRegression(),
        'K-Nearest Neighbors': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        # probability=True enables predict_proba, which the ROC-AUC metric uses.
        'Support Vector Machine': SVC(probability=True, random_state=42),
        'MLP Classifier': MLPClassifier(max_iter=300, random_state=42),
    }

    # Step 12: Function to evaluate models
    def evaluate_model(model, X_train, X_test, y_train, y_test):
        """Fit ``model`` on the training split and score it on the test split.

        Args:
            model: Estimator exposing ``fit`` and ``predict``; ``predict_proba``
                or ``decision_function`` is used for ROC-AUC when available.
            X_train: Training features.
            X_test: Test features.
            y_train: Training labels.
            y_test: Test labels (binary).

        Returns:
            dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1 Score'
            and 'ROC-AUC'. 'ROC-AUC' is the string "N/A" when the model
            offers no continuous score or when ``y_test`` holds fewer than
            two classes (the metric is undefined in that case).
        """
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Continuous scores for ROC-AUC: prefer the positive-class
        # probability, fall back to the raw decision values.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = None

        # roc_auc_score raises when only one class is present in y_test;
        # report "N/A" instead of losing every other metric for this model.
        if y_score is not None and len(np.unique(y_test)) > 1:
            roc_auc = roc_auc_score(y_test, y_score)
        else:
            roc_auc = "N/A"

        # zero_division=0 keeps the score at 0 (the value the default setting
        # also returns) without emitting a warning when a model predicts no
        # positives or the test split contains none.
        return {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1 Score': f1_score(y_test, y_pred, zero_division=0),
            'ROC-AUC': roc_auc,
        }

    # Step 13: Evaluate all models and store results
    results = {}
    for model_name in models:
        model = models[model_name]
        try:
            # Use scaled features for training and testing
            metrics = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
        except Exception as e:
            # Best effort: report the failure and continue with the next model.
            print(f"Error with model {model_name}: {e}")
        else:
            results[model_name] = metrics

    # Step 14: Display results in a DataFrame for easy comparison
    # Transpose so that each row is a model and each column a metric.
    results_df = pd.DataFrame(results).T
    print("Model Performance Comparison:\n", results_df)

    # Step 15: Plot model performance
    plt.figure(figsize=(12, 6))
    sns.barplot(x=results_df.index, y=results_df['Accuracy'], palette='viridis')
    plt.title("Model Accuracy Comparison")
    plt.ylabel("Accuracy")
    plt.xticks(rotation=45)
    plt.show()

else:
    print("Data loading failed. Please upload a valid CSV file.")


No file uploaded or file is empty.
Data loading failed. Please upload a valid CSV file.
