<a href="https://colab.research.google.com/github/HugoPfeffer/rfb-poc/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install numpy
!pip install scikit-learn



In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
# NOTE(review): with no `category`/`module` argument this silences every
# warning for the rest of the session, including any that scikit-learn emits
# while fitting candidates during the hyperparameter search. Consider
# narrowing the filter so fit failures stay visible.
warnings.filterwarnings('ignore')

In [16]:
class FraudDetectionModel:
    """Random-forest regression pipeline that predicts ``compliance_score``.

    Intended call order: ``load_data`` -> ``tune_hyperparameters`` ->
    ``train_model`` -> ``evaluate_model``.

    Attributes:
        model: The fitted ``RandomForestRegressor``; ``None`` until
            ``train_model`` has run.
        best_params: Hyperparameters selected by ``tune_hyperparameters``;
            ``None`` until tuning has run.
    """

    def __init__(self):
        self.model = None
        self.best_params = None

    def load_data(self, train_path='datasets/train_data.csv',
                  test_path='datasets/test_data.csv'):
        """Load the train/test CSVs and build aligned feature matrices.

        Args:
            train_path: Path of the training CSV.
            test_path: Path of the test CSV.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` on success, where the
            categorical columns are one-hot encoded and ``X_test`` has exactly
            the columns of ``X_train`` in the same order. ``None`` if anything
            goes wrong (the error is printed), e.g. a missing file or column.
        """
        try:
            train_data = pd.read_csv(train_path)
            test_data = pd.read_csv(test_path)

            # Define features and target
            features = ['income', 'filing_status', 'state', 'deductions', 'tax_paid', 'refund_claimed']
            categorical = ['filing_status', 'state']
            target = 'compliance_score'

            # Split into features and target
            y_train = train_data[target]
            y_test = test_data[target]

            # One-hot encode the categorical columns of each split separately
            X_train = pd.get_dummies(train_data[features], columns=categorical)
            X_test = pd.get_dummies(test_data[features], columns=categorical)

            # Align the test matrix to the training columns: categories seen
            # only in training become all-zero columns, categories seen only
            # in test are dropped, and the column order matches X_train.
            X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

            return X_train, X_test, y_train, y_test
        except Exception as e:
            # Best-effort loader: callers check for None instead of catching.
            print(f"Error loading data: {str(e)}")
            return None

    def tune_hyperparameters(self, X_train, y_train):
        """Search random-forest hyperparameters with ``RandomizedSearchCV``.

        Runs 20 sampled candidates with 3-fold cross-validation, scored by
        negative mean squared error, and stores the winning parameter set in
        ``self.best_params``.

        Args:
            X_train: Training feature matrix.
            y_train: Training target values.

        Returns:
            The best estimator found by the search.
        """
        # Define the parameter grid for regression
        param_grid = {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [10, 20, 30, 40, 50, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # 1.0 means "use all features". It replaces the former 'auto'
            # option, which was deprecated in scikit-learn 1.1 and removed in
            # 1.3; on current versions every candidate that sampled 'auto'
            # failed to fit.
            'max_features': [1.0, 'sqrt']
        }

        # Initialize Random Forest Regressor
        rf = RandomForestRegressor(random_state=42)

        # Perform RandomizedSearchCV
        random_search = RandomizedSearchCV(
            estimator=rf,
            param_distributions=param_grid,
            n_iter=20,
            cv=3,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            random_state=42
        )

        # Fit the random search
        random_search.fit(X_train, y_train)
        self.best_params = random_search.best_params_
        return random_search.best_estimator_

    def train_model(self, X_train, y_train):
        """Fit a ``RandomForestRegressor`` and store it in ``self.model``.

        Uses ``self.best_params`` when tuning has been run; otherwise falls
        back to the estimator's default hyperparameters instead of failing on
        ``**None``.

        Args:
            X_train: Training feature matrix.
            y_train: Training target values.
        """
        if self.best_params is None:
            print("No tuned parameters available; training with default hyperparameters.")
            params = {}
        else:
            print("Training model with best parameters:", self.best_params)
            params = dict(self.best_params)

        # Fixed seed for reproducibility, unless the parameters already set one
        params.setdefault('random_state', 42)
        self.model = RandomForestRegressor(**params)
        self.model.fit(X_train, y_train)

    def evaluate_model(self, X_test, y_test):
        """Print and return regression metrics for the trained model.

        Args:
            X_test: Test feature matrix.
            y_test: True target values for the test set.

        Returns:
            ``(mse, rmse, mae, r2)``, or ``None`` if no model has been trained
            yet (a message is printed in that case).
        """
        if self.model is None:
            print("Model not trained yet!")
            return

        # Make predictions
        y_pred = self.model.predict(X_test)

        # Calculate regression metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Print summary metrics
        print("\nRegression Metrics:")
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"Root Mean Squared Error: {rmse:.4f}")
        print(f"Mean Absolute Error: {mae:.4f}")
        print(f"R² Score: {r2:.4f}")

        return mse, rmse, mae, r2


In [15]:
def main():
    """Run the pipeline end to end: load, tune, train, then evaluate."""
    detector = FraudDetectionModel()

    print("Loading data...")
    splits = detector.load_data()
    if splits is None:
        # load_data returns None (after printing the error) when loading fails
        return
    X_train, X_test, y_train, y_test = splits

    print("\nTuning hyperparameters...")
    # The returned estimator is not needed here: train_model builds a fresh
    # model from the best_params that tuning stores on the instance.
    detector.tune_hyperparameters(X_train, y_train)

    print("\nTraining model...")
    detector.train_model(X_train, y_train)

    print("\nEvaluating model...")
    detector.evaluate_model(X_test, y_test)


In [17]:
# Entry point: run the whole pipeline when this cell/script executes as the
# main module (the recorded output below comes from this call).
if __name__ == "__main__":
    main()

Loading data...

Tuning hyperparameters...

Training model...
Training model with best parameters: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}

Evaluating model...

Regression Metrics:
Mean Squared Error: 0.9844
Root Mean Squared Error: 0.9922
Mean Absolute Error: 0.7401
R² Score: -0.0380
