In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

# Deliberately not setting random seed to ensure variation between runs

In [10]:
# Create an imbalanced dataset with correlated features
def create_dataset(n_samples=1000):
    X_majority = np.random.normal(0.7, 0.3, (int(0.8 * n_samples), 5))
    y_majority = np.zeros(int(0.8 * n_samples))
    
    X_minority = np.random.normal(0.3, 0.3, (int(0.2 * n_samples), 5))
    y_minority = np.ones(int(0.2 * n_samples))
    
    X = np.vstack([X_majority, X_minority])
    y = np.hstack([y_majority, y_minority])
    
    # Add random noise
    X += np.random.normal(0, 0.2, X.shape)
    
    feature_names = ['income', 'education', 'age', 'experience', 'skill_level']
    X = pd.DataFrame(X, columns=feature_names)
    
    return X, y

X, y = create_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Display class distribution (will be different each run)
distribution = pd.Series(y_train).value_counts()
distribution

0.0    637
1.0    163
Name: count, dtype: int64

In [11]:
# Train initial model with random undersampling
minority_indices = np.where(y_train == 1)[0]
majority_indices = np.where(y_train == 0)[0]
sampled_majority = np.random.choice(majority_indices, size=len(minority_indices) * 2, replace=False)

# majority size == minority size
balanced_indices = np.concatenate([minority_indices, sampled_majority])
X_balanced = X_train.iloc[balanced_indices]
y_balanced = y_train[balanced_indices]

# Using a small number of trees and limited depth for more variance
model = RandomForestClassifier(n_estimators=20, max_depth=3)
model.fit(X_balanced, y_balanced)

# Get predictions and probabilities
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Store results as cell output
initial_results = {
    'accuracy': accuracy_score(y_test, predictions),
    'mean_prob_class0': probabilities[:, 0].mean(),
    'mean_prob_class1': probabilities[:, 1].mean(),
    'predictions_distribution': pd.Series(predictions).value_counts().to_dict()
}
initial_results

{'accuracy': 0.89,
 'mean_prob_class0': 0.7445828498818731,
 'mean_prob_class1': 0.255417150118127,
 'predictions_distribution': {0.0: 169, 1.0: 31}}