In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
df = pd.read_csv('train_raw.csv')

  df = pd.read_csv('train.csv')


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [4]:
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]

print("Columns with missing values:")
print(missing_values)

Columns with missing values:
Name                        9985
Monthly_Inhand_Salary      15002
Type_of_Loan               11408
Num_of_Delayed_Payment      7002
Num_Credit_Inquiries        1965
Credit_History_Age          9030
Amount_invested_monthly     4479
Monthly_Balance             1200
dtype: int64


In [6]:
def clean_numeric_columns(df, num_cols):
    for col in num_cols:
        df[col] = df[col].astype(str).str.replace(r"[^\d.]", "", regex=True)  
        df[col] = pd.to_numeric(df[col], errors='coerce') 
    return df


num_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts", "Num_Credit_Card",
    "Interest_Rate", "Num_of_Loan", "Delay_from_due_date", "Num_of_Delayed_Payment",
    "Changed_Credit_Limit", "Num_Credit_Inquiries", "Outstanding_Debt",
    "Credit_Utilization_Ratio", "Total_EMI_per_month", "Amount_invested_monthly",
    "Monthly_Balance", "Credit_History_Age"
]

# Clean numerical columns
df = clean_numeric_columns(df, num_cols)

# Impute
knn_imputer = KNNImputer(n_neighbors=5)

df[num_cols] = knn_imputer.fit_transform(df[num_cols])



In [7]:
print(df.isnull().sum())


Month                       0
Age                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64


In [8]:
print(df.duplicated().sum())

0


In [None]:
df_encoded = df.copy()
for col in df_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))


X = df_encoded.drop(columns=['Credit_Score'])
y = df_encoded['Credit_Score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

k = min(10, X_train.shape[1])

# Apply SelectKBest
select_k_best = SelectKBest(score_func=chi2, k=k)
X_train_k_best = select_k_best.fit_transform(X_train, y_train)

# Get selected feature names
selected_features = X_train.columns[select_k_best.get_support()]

# Print selected features
print("Selected features:", selected_features.tolist())

# Create DataFrame for feature scores
feature_scores = pd.DataFrame({'Feature': X_train.columns, 'Score': select_k_best.scores_})
feature_scores = feature_scores.sort_values(by="Score", ascending=False)

# Print feature scores
print("\nFeature Scores:")
print(feature_scores)



Selected features: ['Annual_Income', 'Monthly_Inhand_Salary', 'Interest_Rate', 'Delay_from_due_date', 'Changed_Credit_Limit', 'Outstanding_Debt', 'Credit_History_Age', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance']

Feature Scores:
                     Feature         Score
21           Monthly_Balance  1.687044e+27
3              Annual_Income  1.071016e+08
14          Outstanding_Debt  1.000486e+07
4      Monthly_Inhand_Salary  7.411940e+06
16        Credit_History_Age  1.026920e+06
18       Total_EMI_per_month  1.933281e+05
9        Delay_from_due_date  1.360119e+05
19   Amount_invested_monthly  1.005454e+05
7              Interest_Rate  1.773547e+04
11      Changed_Credit_Limit  1.521642e+04
10    Num_of_Delayed_Payment  1.424371e+04
12      Num_Credit_Inquiries  1.229941e+04
1                        Age  1.154198e+04
5          Num_Bank_Accounts  3.231692e+03
8                Num_of_Loan  2.503742e+03
17     Payment_of_Min_Amount  2.145090e+03
13             

In [None]:
selected_data = X_train[selected_features]
selected_data.to_csv("cleaned_train.csv", index=False)