In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score

# Loading your dataset
df = pd.read_excel('data_report4.xlsx')

# Preprocessing data: Encode categorical variables
le_race = LabelEncoder()
df['race_encoded'] = le_race.fit_transform(df['race'])
le_sex = LabelEncoder()
df['sex_encoded'] = le_sex.fit_transform(df['sex'])

# Splitting data into features and target
X = df[['age', 'race_encoded', 'sex_encoded']]
y = df['frisked']

# Binary encode the target
le_frisked = LabelEncoder()
y = le_frisked.fit_transform(y)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creating classification models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=10, n_jobs=-1),
    'Support Vector Machine': SVC(kernel='linear', max_iter=1000, probability=True)  # Enable probability
}

# Empty dict to hold model performances
model_performance = {}

# Evaluating each model
for name, model in models.items():
    # Fitting the model on the training data
    model.fit(X_train_scaled, y_train)
    # Making predictions
    y_pred = model.predict(X_test_scaled)
    # Generating classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    # Checking if the model has predict_proba method
    if hasattr(model, "predict_proba"):
        roc_auc = roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1])
    else:
        # If predict_proba is not available, set AUC-ROC to None
        roc_auc = None
    # Performing cross-validation
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    # Aggregating the results
    model_performance[name] = {
        'Accuracy': report['accuracy'],
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1 Score': report['weighted avg']['f1-score'],
        'AUC-ROC': roc_auc,
        'Cross-Validation Scores': cv_scores,
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std()
    }

# Displaying model performances
for model, performance in model_performance.items():
    print(f"Model: {model}")
    for metric, value in performance.items():
        if isinstance(value, float):
            print(f"{metric}: {value:.4f}")
        elif value is not None:
            print(f"{metric}: {value}")
    print("\n")




Model: Logistic Regression
Accuracy: 0.6011
Precision: 0.6115
Recall: 0.6011
F1 Score: 0.5500
AUC-ROC: 0.6011
Cross-Validation Scores: [0.60321816 0.60391012 0.6046138  0.6030024  0.60603999]
CV Mean: 0.6042
CV Std: 0.0011


Model: Random Forest
Accuracy: 0.6104
Precision: 0.6075
Recall: 0.6104
F1 Score: 0.5894
AUC-ROC: 0.6185
Cross-Validation Scores: [0.61011423 0.60820257 0.61019633 0.60860846 0.61345218]
CV Mean: 0.6101
CV Std: 0.0018


Model: Support Vector Machine
Accuracy: 0.5574
Precision: 0.5812
Recall: 0.5574
F1 Score: 0.3993
AUC-ROC: 0.4419
Cross-Validation Scores: [0.55800671 0.5579598  0.43181338 0.55796634 0.44202193]
CV Mean: 0.5096
CV Std: 0.0594


