In [46]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# Load the dataset: both splits are read from CSV files in the working directory
train_path, test_path = 'train.csv', 'test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Report the (rows, columns) shape of each split
print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Per-column count of missing entries in the training split
missing_per_column = train_data.isna().sum()
print("Missing values in training data:", missing_per_column, sep="\n")

# Column dtypes of the training split as they stand right after loading
print("Data types in training data:", train_data.dtypes, sep="\n")

# Clean numeric columns
# Every character that cannot be part of a number is stripped, then the
# remainder is parsed; anything still unparsable is coerced to NaN.
columns_to_clean = ['Age', 'Income_Annual', 'Current_Debt_Outstanding', 'Monthly_Investment',
                    'Monthly_Balance', 'Total_Current_Loans', 'Total_Delayed_Payments', 'Credit_Limit']
# '-' is kept on purpose: stripping it would silently turn negative values
# into positive ones. A misplaced '-' (e.g. '12-3') fails to parse and
# therefore ends up as NaN instead of a made-up number.
non_numeric_chars = r'[^0-9.\-]'
for frame in (train_data, test_data):
    for col in columns_to_clean:
        stripped = frame[col].replace(non_numeric_chars, '', regex=True)
        frame[col] = pd.to_numeric(stripped, errors='coerce')

# Drop irrelevant columns (same set removed in place from both splits)
irrelevant_columns = ['Customer_ID', 'Name', 'Number']
for frame in (train_data, test_data):
    frame.drop(columns=irrelevant_columns, inplace=True)

# Replace placeholders with NaN
# Only cells equal to exactly '_' are replaced (no regex matching is used).
# NOTE(review): confirm '_' is the full placeholder text in both columns;
# longer runs of underscores would be left untouched by this exact match.
for column in ('Profession', 'Credit_Mix'):
    for frame in (train_data, test_data):
        frame[column] = frame[column].replace('_', np.nan)

# Convert credit history age to months
def Month_Converter(val):
    """Convert a credit-history-age string into a total number of months.

    The string is split on single spaces; token 0 is read as the number of
    years and token 3 as the number of months (e.g. "22 Years and 1 Months").
    Missing values (as judged by ``pd.notnull``) are returned unchanged.
    """
    if not pd.notnull(val):
        return val
    tokens = val.split(' ')
    return int(tokens[0]) * 12 + int(tokens[3])

# Turn Credit_History_Age into a float month count in both splits;
# missing entries pass through Month_Converter unchanged and stay missing.
history_column = 'Credit_History_Age'
for frame in (train_data, test_data):
    months = frame[history_column].apply(Month_Converter)
    frame[history_column] = months.astype(float)

# Handle categorical variables
# The literal '!@9#%8' in Payment_Behaviour is treated as a missing value.
for frame in (train_data, test_data):
    frame['Payment_Behaviour'] = frame['Payment_Behaviour'].replace('!@9#%8', np.nan)

# Encode the target as integers (Poor=0, Standard=1, Good=2).
# `map` is used rather than `replace`: replacing strings with ints relies on
# pandas' deprecated implicit downcasting of the resulting object column,
# whereas `map` yields a numeric column directly. Labels that are not in the
# mapping become NaN instead of lingering as strings.
credit_score_codes = {'Poor': 0, 'Standard': 1, 'Good': 2}
train_data['Credit_Score'] = train_data['Credit_Score'].map(credit_score_codes)

# Impute missing values in categorical columns based on mode within groups
columns_to_impute_mode = ['Profession', 'Payment_Behaviour']


def fill_missing_with_group_mode(df, groupby, column):
    """Fill missing entries of ``column`` with the mode of their group, in place.

    Args:
        df: DataFrame to modify; its ``column`` is reassigned.
        groupby: Column name (or anything ``DataFrame.groupby`` accepts) that
            defines the groups.
        column: Name of the column whose missing values are filled.

    Returns:
        None. A group with no non-missing value has no mode, so its entries
        stay missing.
    """
    def _first_mode(values):
        # mode() ignores NaN and may return several ties; take the first one.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    mode_per_group = df.groupby(groupby)[column].transform(_first_mode)
    # Assign back instead of `df[column].fillna(..., inplace=True)`: the
    # chained in-place form acts on a temporary and does not update `df`
    # when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(mode_per_group)

# Fill each listed column from the mode of the rows that share the same 'ID'.
# NOTE(review): if 'ID' is unique per row, every group holds one row and a
# missing value has no group mode to borrow, making this a no-op. Confirm
# 'ID' is the intended key ('Customer_ID' is dropped earlier in this file).
for col in columns_to_impute_mode:
    for frame in (train_data, test_data):
        fill_missing_with_group_mode(frame, 'ID', col)

# Convert Loan_Type to binary columns
from sklearn.preprocessing import MultiLabelBinarizer


def _attach_loan_type_lists(frame):
    """Normalise ``Loan_Type`` in place and add ``Loan_Type_List``.

    Missing entries become 'Not Specified', the word 'and ' is removed,
    surrounding whitespace is stripped, and the text is split on ', '.
    """
    normalised = frame['Loan_Type'].fillna('Not Specified')
    normalised = normalised.str.replace(r'\band \b', '', regex=True)
    frame['Loan_Type'] = normalised.str.strip()
    frame['Loan_Type_List'] = frame['Loan_Type'].str.split(', ')


_loan_source_columns = ['Loan_Type', 'Loan_Type_List']

# Training split: learn the set of loan types and build one indicator column each
_attach_loan_type_lists(train_data)
mlb = MultiLabelBinarizer()
loan_type_encoded_train = mlb.fit_transform(train_data['Loan_Type_List'])
loan_type_df_train = pd.DataFrame(
    loan_type_encoded_train,
    columns=mlb.classes_,
    index=train_data.index,
)
train_data = pd.concat([train_data, loan_type_df_train], axis=1).drop(columns=_loan_source_columns)

# Apply similar transformation to test data, reusing the classes learned on train
_attach_loan_type_lists(test_data)
loan_type_encoded_test = mlb.transform(test_data['Loan_Type_List'])
loan_type_df_test = pd.DataFrame(
    loan_type_encoded_test,
    columns=mlb.classes_,
    index=test_data.index,
)
test_data = pd.concat([test_data, loan_type_df_test], axis=1).drop(columns=_loan_source_columns)
# NOTE(review): the indicator columns are appended after the existing columns,
# while the feature slices later in this file end at 'Monthly_Balance'.
# Check whether these columns are actually meant to reach the models.

# Encode other categorical features
categorical_columns = ['Month', 'Profession', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
# One encoder per column, kept in a dict so the fitted mappings stay available
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    # Codes are learned from the training split and reused on the test split
    train_data[column] = encoder.fit_transform(train_data[column])
    test_data[column] = encoder.transform(test_data[column])

# Impute missing values in numeric columns
imputer = KNNImputer(n_neighbors=5)
# Label-based slice: every column positioned from 'Age' through 'Monthly_Balance', inclusive
columns_to_impute = train_data.loc[:, 'Age':'Monthly_Balance'].columns

# Neighbours are learned from the training split only, then reused for test
train_subset = train_data[columns_to_impute]
train_data[columns_to_impute] = imputer.fit_transform(train_subset)
test_subset = test_data[columns_to_impute]
test_data[columns_to_impute] = imputer.transform(test_subset)

# Define features and target
# Features are the columns positioned from 'Month' through 'Monthly_Balance', inclusive
feature_span = slice('Month', 'Monthly_Balance')
X = train_data.loc[:, feature_span]
y = train_data['Credit_Score']
X_test = test_data.loc[:, feature_span]


def _as_frame(values, template):
    """Wrap a scaled array in a DataFrame reusing ``template``'s columns and index."""
    return pd.DataFrame(values, columns=template.columns, index=template.index)


# Apply RobustScaler (statistics are learned from the training features only)
scaler = RobustScaler()
X_scaled = _as_frame(scaler.fit_transform(X), X)

# Scale test data with the already-fitted scaler
X_test_scaled = _as_frame(scaler.transform(X_test), X_test)

# Define models and parameters
# NOTE(review): SMOTE is not imported anywhere in this cell; presumably
# `from imblearn.over_sampling import SMOTE` ran in an earlier cell — confirm.
# Seeded like every other estimator/splitter in this file so that the
# resampled training set (and everything fitted on it) is reproducible.
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X_scaled, y)
# NOTE(review): resampling happens before cross-validation, so synthetic rows
# derived from a fold's validation rows can end up in that fold's training
# part; scores measured on X_sm are likely optimistic — consider resampling
# inside each fold.
skfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Decision Tree
# Every option lists a single value, so the search cross-validates one configuration
tree_params = {
    "criterion": ["entropy"],
    "splitter": ["best"],
    "max_depth": [15],
    "min_samples_split": [2],
    "min_samples_leaf": [5],
}
tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(
    estimator=tree_clf,
    param_grid=tree_params,
    scoring="accuracy",
    cv=skfold,
    n_jobs=-1,
    verbose=1,
)
# Fitted on the SMOTE-resampled training data
tree_cv.fit(X_sm, y_sm)

# Random Forest
# Every option lists a single value, so the search cross-validates one configuration
rf_params = {
    'n_estimators': [200],
    'max_features': ['sqrt'],
    'max_depth': [10],
    'min_samples_split': [2],
    'min_samples_leaf': [4],
    'bootstrap': [True],
}
rf_clf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(
    estimator=rf_clf,
    param_grid=rf_params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# Fitted on the SMOTE-resampled training data
rf_cv.fit(X_sm, y_sm)

# XGBoost
# Two depths x two learning rates -> four candidate configurations
xgb_params = {
    'n_estimators': [200],
    'max_depth': [15, 10],
    'learning_rate': [0.5, 0.25],
    'gamma': [0.03],
}
xgb_clf = XGBClassifier(random_state=42)
xgb_cv = GridSearchCV(
    estimator=xgb_clf,
    param_grid=xgb_params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# Fitted on the SMOTE-resampled training data
xgb_cv.fit(X_sm, y_sm)

# KNN
# 3 neighbour counts x 2 weightings x 2 metrics -> twelve candidate configurations
knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
knn_clf = KNeighborsClassifier()
knn_cv = GridSearchCV(
    estimator=knn_clf,
    param_grid=knn_params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# Fitted on the scaled training data as-is (X_scaled / y), not on the
# SMOTE-resampled X_sm / y_sm used by the tree-based searches.
knn_cv.fit(X_scaled, y)

# Naive Bayes
# No hyper-parameter search: the default model is scored with the shared
# stratified splitter on the scaled (not resampled) training data.
nb_clf = GaussianNB()
nb_scores = cross_val_score(
    estimator=nb_clf,
    X=X_scaled,
    y=y,
    scoring='accuracy',
    cv=skfold,
)
# Mean accuracy across the folds
nb_accuracy = nb_scores.mean()

# Compare model performances
# NOTE(review): these scores come from different setups in this file
# (resampled vs. original data, 2-fold splitter vs. cv=3), so they may not
# be directly comparable — confirm this is acceptable for model selection.
model_names = ['Decision Tree', 'Random Forest', 'XGBoost', 'KNN', 'Naive Bayes']
model_scores = [
    tree_cv.best_score_,
    rf_cv.best_score_,
    xgb_cv.best_score_,
    knn_cv.best_score_,
    nb_accuracy,
]
models = dict(zip(model_names, model_scores))

# Highest cross-validated accuracy wins; on a tie the earliest entry is kept
best_model_name, _ = max(models.items(), key=lambda entry: entry[1])
print(f"\nBest performing model: {best_model_name} with accuracy: {models[best_model_name]:.4f}")

# Get the best model
# Grid searches expose their winning estimator via `best_estimator_`.
searches_by_name = {
    'Decision Tree': tree_cv,
    'Random Forest': rf_cv,
    'XGBoost': xgb_cv,
    'KNN': knn_cv,
}
if best_model_name in searches_by_name:
    best_model = searches_by_name[best_model_name].best_estimator_
else:
    # Naive Bayes was only scored through cross_val_score, which leaves
    # `nb_clf` itself unfitted; fit it on the same data it was scored on so
    # the prediction step does not fail on an unfitted estimator.
    best_model = nb_clf.fit(X_scaled, y)

# Make predictions on test set
predictions = best_model.predict(X_test_scaled)

# Convert numeric predictions back to categories
# (inverse of the integer coding applied to Credit_Score on the training split)
prediction_map = dict(enumerate(('Poor', 'Standard', 'Good')))
predictions_categorical = list(map(prediction_map.__getitem__, predictions))

# Create submission DataFrame: one row per test record with its predicted label
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'Credit_Score': predictions_categorical
})

# Save predictions to CSV
# The path is held in one variable so the confirmation message always names
# the file that was actually written (it used to name a different file).
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nPredictions have been saved to '{submission_path}'")

Training data shape: (80000, 28)
Test data shape: (20000, 27)
Missing values in training data:
ID                              0
Customer_ID                     0
Month                           0
Name                         8029
Age                             0
Number                          0
Profession                      0
Income_Annual                   0
Base_Salary_PerMonth        12032
Total_Bank_Accounts             0
Total_Credit_Cards              0
Rate_Of_Interest                0
Total_Current_Loans             0
Loan_Type                    9157
Delay_from_due_date             0
Total_Delayed_Payments       5595
Credit_Limit                    0
Total_Credit_Enquiries       1549
Credit_Mix                      0
Current_Debt_Outstanding        0
Ratio_Credit_Utilization        0
Credit_History_Age           7240
Payment_of_Min_Amount           0
Per_Month_EMI                   0
Monthly_Investment           3605
Payment_Behaviour               0
Monthly_Balance      