In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
# Load dataset
data = pd.read_csv('accepted_final.csv')  # Use the correct file path here

# Check for missing values
print(data.isnull().sum())

id                         0
issue_d                    0
loan_amnt                  0
term                       0
installment                0
emp_length             22114
home_ownership             0
annual_inc                 0
verification_status        0
loan_status                0
purpose                    0
addr_state                 0
dti                      501
delinq_2yrs                0
fico_range_low             0
fico_range_high            0
inq_last_6mths             0
delinquent                 0
dtype: int64


In [4]:
# Check for missing values
print(data.isnull().sum())

id                         0
issue_d                    0
loan_amnt                  0
term                       0
installment                0
emp_length             22114
home_ownership             0
annual_inc                 0
verification_status        0
loan_status                0
purpose                    0
addr_state                 0
dti                      501
delinq_2yrs                0
fico_range_low             0
fico_range_high            0
inq_last_6mths             0
delinquent                 0
dtype: int64


In [6]:
# Check the column names to find the target column
print(data.columns)

Index(['id', 'issue_d', 'loan_amnt', 'term', 'installment', 'emp_length',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'fico_range_low',
       'fico_range_high', 'inq_last_6mths', 'delinquent'],
      dtype='object')


In [7]:
# Separate features (X) and target (y)
X = data.drop(columns=['loan_status'])  # Drop the target column 'loan_status'
y = data['loan_status']  # The target column is 'loan_status'

# Check the first few rows of X and y to confirm
print(X.head())
print(y.head())

          id     issue_d  loan_amnt        term  installment emp_length  \
0  112575999  2017-07-01    35000.0   60 months       851.51    9 years   
1  122347826  2017-11-01    12100.0   36 months       467.83    4 years   
2  133439083  2018-06-01    17000.0   36 months       533.35    4 years   
3  119216681  2017-10-01    12000.0   60 months       327.69  10+ years   
4  142845437  2018-10-01    14000.0   36 months       426.61    4 years   

  home_ownership  annual_inc verification_status             purpose  \
0           RENT    175000.0            Verified               other   
1       MORTGAGE    106000.0        Not Verified  debt_consolidation   
2       MORTGAGE     65000.0     Source Verified         credit_card   
3           RENT     60000.0            Verified  debt_consolidation   
4       MORTGAGE     57000.0     Source Verified  debt_consolidation   

  addr_state    dti  delinq_2yrs  fico_range_low  fico_range_high  \
0         CA  13.51          0.0           710.

In [8]:
# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(exclude=['object']).columns

In [9]:
# Create preprocessing pipeline for numeric and categorical columns
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values by replacing with mean
    ('scaler', StandardScaler())  # Normalize numeric features
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values by replacing with the most frequent value
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features using one-hot encoding
])

In [10]:
# Combine both pipelines into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
# Apply preprocessing pipeline to training and testing data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [13]:
# Optionally, if you want to check the shapes of the transformed data
print(f"Shape of preprocessed training data: {X_train_preprocessed.shape}")
print(f"Shape of preprocessed testing data: {X_test_preprocessed.shape}")

Shape of preprocessed training data: (197152, 117)
Shape of preprocessed testing data: (84494, 117)
