In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

In [32]:
# Create an imbalanced dataset with correlated features
def create_dataset(n_samples=1000):
    # Create biased feature for majority class
    X_majority = np.random.normal(0.7, 0.3, (int(0.8 * n_samples), 5))
    y_majority = np.zeros(int(0.8 * n_samples))
    
    # Create biased feature for minority class
    X_minority = np.random.normal(0.3, 0.3, (int(0.2 * n_samples), 5))
    y_minority = np.ones(int(0.2 * n_samples))
    
    # Combine classes
    X = np.vstack([X_majority, X_minority])
    y = np.hstack([y_majority, y_minority])
    
    # Add noise to make it more realistic
    X += np.random.normal(0, 0.1, X.shape)
    
    # Create meaningful feature names
    feature_names = ['income', 'education', 'age', 'experience', 'skill_level']
    X = pd.DataFrame(X, columns=feature_names)
    
    return X, y

# Create dataset
X, y = create_dataset()

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Class distribution in training set:", Counter(y_train))

Class distribution in training set: Counter({0.0: 643, 1.0: 157})


In [33]:
# Create and train model with random undersampling
def train_model_with_undersampling(X_train, y_train):
    # Find minority class samples
    minority_indices = np.where(y_train == 1)[0]
    n_minority = len(minority_indices)
    
    # Randomly sample majority class
    majority_indices = np.where(y_train == 0)[0]
    majority_indices = np.random.choice(majority_indices, size=n_minority * 2, replace=False)
    
    # Combine indices and create balanced dataset
    balanced_indices = np.concatenate([minority_indices, majority_indices])
    X_balanced = X_train.iloc[balanced_indices]
    y_balanced = y_train[balanced_indices]
    
    # Train model
    model = RandomForestClassifier(n_estimators=50, max_depth=5)
    model.fit(X_balanced, y_balanced)
    
    return model

# Train model
model = train_model_with_undersampling(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))

Model Accuracy: 0.92
