In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# from pandas_profiling import ProfileReport


In [2]:
df = pd.read_csv("RaceCar.csv")

In [3]:
df.shape

(1940, 22)

In [4]:
df.head()

Unnamed: 0,Date,raceId,driverId,Race,Driver Name,constructorId,Racing Team,Race Results,grid,points,...,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,Status,Win,Top 10 Grid,Finished Race,Pit Stop less than 30 Seconds
0,9/17/2023,1113,832,Singapore Grand Prix,sainz,6,Ferrari,1,1,25.0,...,47,7,0.00113,182.089,1,Finished,1,1,1,1
1,9/17/2023,1113,846,Singapore Grand Prix,norris,1,McLaren,2,4,18.0,...,46,8,0.001135,181.384,1,Finished,0,1,1,1
2,9/17/2023,1113,1,Singapore Grand Prix,hamilton,131,Mercedes,3,5,16.0,...,47,1,0.00111,185.507,1,Finished,0,1,1,0
3,9/17/2023,1113,844,Singapore Grand Prix,leclerc,6,Ferrari,4,3,12.0,...,46,11,0.001137,180.961,1,Finished,0,1,1,0
4,9/17/2023,1113,830,Singapore Grand Prix,max_verstappen,9,Red Bull,5,11,10.0,...,61,4,0.001118,184.147,1,Finished,0,0,1,1


In [5]:
def build_classification_model(data, target_column, model_choice):
    """
    Build a classification model based on the user's choice of model and print out the top 3 important features.
    
    Parameters:
        data (DataFrame): The input DataFrame.
        target_column (str): The name of the target column.
        model_choice (str): The chosen classification model 
                            ('Random Forest', 'Decision Tree', 'Logistic Regression', 'SVM', 'Naive Bayes', or 'XGBoost').
    
    Returns:
        dict: A dictionary containing train score, test score, confusion matrix, and predictions.
    """
    # Extract features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize the chosen classification model
    if model_choice == 'Random Forest':
        model = RandomForestClassifier()
    elif model_choice == 'Decision Tree':
        model = DecisionTreeClassifier()
    elif model_choice == 'Logistic Regression':
        model = LogisticRegression()
    elif model_choice == 'SVM':
        model = SVC()
    elif model_choice == 'Naive Bayes':
        model = GaussianNB()
    elif model_choice == 'XGBoost':
        model = XGBClassifier()
    elif model_choice == 'KNN':
        model = KNeighborsClassifier()
    elif model_choice == 'AdaBoost':
        model = AdaBoostClassifier()
    elif model_choice == 'Gradient Boosting':
        model = GradientBoostingClassifier()
    elif model_choice == 'Neural Network':
        model = MLPClassifier()
    else:
        raise ValueError("Invalid model choice. Choose from 'Random Forest', 'Decision Tree', 'Logistic Regression', 'SVM', 'Naive Bayes', 'XGBoost', 'KNN', 'AdaBoost', 'Gradient Boosting', or 'Neural Network'.")
   
    # Train the model
    model.fit(X_train, y_train)
    
    # Get feature importance (if applicable)
    if hasattr(model, 'feature_importances_'):
        feature_importance = model.feature_importances_
        if len(feature_importance) >= 3:
            top_features_idx = np.argsort(feature_importance)[::-1][:3]
            top_features = X.columns[top_features_idx]
            print(f"Top 3 important features affecting {target_column}:")
            for i, feature in enumerate(top_features):
                print(f"{i+1}. {feature}")
        else:
            print("Feature importance is not available for this model.")
    else:
        print("Feature importance is not available for this model.")
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate train and test scores
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Create dictionary to store results
    results = {
        'train_score': train_score,
        'test_score': test_score,
        'confusion_matrix': conf_matrix,
        'predictions': y_pred
    }
    
    return results

In [8]:
def create_dummy_data(data):
    """
    Generate dummy data for all categorical columns in the DataFrame.
    
    Parameters:
        data (DataFrame): The input DataFrame.
    
    Returns:
        DataFrame: A new DataFrame with dummy variables for all categorical columns.
    """
    # Get list of categorical columns
    categorical_columns = data.select_dtypes(include=['object']).columns
    
    # Create dummy variables for categorical columns
    dummy_data = pd.get_dummies(data, columns=categorical_columns)
    
    return dummy_data

In [9]:
df = create_dummy_data(df)

In [10]:
results = build_classification_model(df, "Win", "Random Forest")
print("Train Score:", results['train_score'])
print("Test Score:", results['test_score'])
print("Confusion Matrix:")
print(results['confusion_matrix'])
print("Predictions:")
print(results['predictions'])

Top 3 important features affecting Win:
1. points
2. Race Results
3. grid
Train Score: 1.0
Test Score: 1.0
Confusion Matrix:
[[372   0]
 [  0  16]]
Predictions:
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
