# In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import train_test_split

# Load the raw HELOC table from disk and peek at its first rows
df = pd.read_csv("heloc_dataset_v1.csv")
df.head(3)

# Keep only the rows in which no column holds -9, the code this script
# treats as a missing value
rows_to_keep = ~df.isin([-9]).any(axis=1)
df = df[rows_to_keep]

# Per-class mean of every numeric column, computed with the -7 sentinel masked
# out so the sentinel itself does not distort the averages.
# numeric_only=True keeps this consistent with the imputation below, which
# also ignores non-numeric columns; without it, pandas >= 2.0 raises a
# TypeError as soon as a text column other than the group key is present.
#
# NOTE(review): the fill values are conditioned on 'RiskPerformance', which is
# used as the prediction target further down, and they are computed on the
# whole data set before any train/test split. Confirm this is intended, since
# it lets label information flow into the features.
group_means = (
    df.replace(-7, np.nan)
    .groupby('RiskPerformance')
    .mean(numeric_only=True)
)

# Numeric feature columns eligible for imputation. Resolved once here instead
# of re-checking each column's dtype for every row.
numeric_feature_cols = [
    col for col in df.columns
    if col != 'RiskPerformance' and np.issubdtype(df[col].dtype, np.number)
]


def impute_with_group_mean(row):
    """Return a copy of ``row`` with -7 entries replaced by the class mean.

    Args:
        row: A Series holding one record of ``df``, including its
            'RiskPerformance' label.

    Returns:
        A new Series in which every numeric feature equal to -7 has been
        replaced by ``group_means`` for the row's 'RiskPerformance' group.
        The input row is left untouched, because mutating the object handed
        over by ``DataFrame.apply`` is not supported by pandas.

    Raises:
        KeyError: If the row's 'RiskPerformance' value is not a group in
            ``group_means``.
    """
    imputed = row.copy()
    label = row['RiskPerformance']
    for col in numeric_feature_cols:
        if row[col] == -7:
            imputed[col] = group_means.loc[label, col]
    return imputed


# Impute column by column instead of calling the function once per row: the
# replacement values are identical, but the work is done by pandas rather
# than by a Python loop over every cell. Columns that contain no -7 are left
# exactly as they are.
df = df.copy()  # detach from the filtered frame before assigning columns
class_labels = df['RiskPerformance']
for col in numeric_feature_cols:
    is_sentinel = df[col] == -7
    if is_sentinel.any():
        # Look up each row's class mean, then substitute it where -7 occurs
        class_mean_per_row = class_labels.map(group_means[col])
        df[col] = df[col].mask(is_sentinel, class_mean_per_row)



# Flag every row that exactly repeats an earlier row; the mask, the count and
# the flagged rows are kept for inspection
duplicates = df.duplicated()
duplicate_rows = df[duplicates]
num_duplicates = duplicates.sum()
#print(f"Number of duplicate rows: {num_duplicates}")
#print("Duplicate rows:\n", duplicate_rows)

# Retain only the first occurrence of each distinct row
df = df[~duplicates]


label_encoder = LabelEncoder()

# Separate the feature matrix from the integer-encoded target
X = df.drop(columns=['RiskPerformance'])
y = label_encoder.fit_transform(df['RiskPerformance'])

# Standardise the features so the L1 penalty acts on comparable scales
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso regression with 5-fold cross-validation
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_scaled, y)

# Keep the features whose coefficient is non-zero and rank them by absolute
# magnitude. Sorting on the signed value would push strongly negative (and
# therefore influential) coefficients to the bottom of the list.
lasso_importance = pd.Series(lasso.coef_, index=X.columns)
important_features = lasso_importance[lasso_importance != 0].sort_values(
    key=lambda coefs: coefs.abs(),
    ascending=False,
)

# Print up to top_n of the ranked features (signed coefficients are shown)
top_n = 24
print(f"Top {top_n} Important Features Based on Lasso Regression:")
print(important_features.head(top_n))

# Class balance of the target before any resampling
counts = df["RiskPerformance"].value_counts()
print("Counts of 'Bad' vs 'Good':", counts, sep="\n")

# Hold out 20% of the rows for testing; the remaining 80% form the training set.
# NOTE(review): the split uses the unscaled X (not X_scaled) and is not
# stratified on y. Confirm both are intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Resample the training portion only with SMOTE; the test set is not touched
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Class distribution after resampling
resampled_distribution = Counter(y_resampled)
print("Resampled class distribution:", resampled_distribution)