In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from zipfile import ZipFile

In [10]:
zip_path = r'C:\Users\nadav\PycharmProjects\Loan-Approval-Prediction\playground-series-s4e10.zip'

In [11]:
# Open the zip file
with ZipFile(zip_path) as z:
    # List all files in the zip archive
    files_in_zip = z.namelist()
    
    # Print available files to check names
    print("Files in the zip:", files_in_zip)
    
    # Load the CSV files into dataframes
    train_df = pd.read_csv(z.open('train.csv'))
    test_df = pd.read_csv(z.open('test.csv'))
    sample_submission_df = pd.read_csv(z.open('sample_submission.csv'))


Files in the zip: ['sample_submission.csv', 'test.csv', 'train.csv']


In [12]:
train_df

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.90,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.10,N,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58640,58640,34,120000,MORTGAGE,5.0,EDUCATION,D,25000,15.95,0.21,Y,10,0
58641,58641,28,28800,RENT,0.0,MEDICAL,C,10000,12.73,0.35,N,8,1
58642,58642,23,44000,RENT,7.0,EDUCATION,D,6800,16.00,0.15,N,2,1
58643,58643,22,30000,RENT,2.0,EDUCATION,A,5000,8.90,0.17,N,3,0


In [13]:
test_df

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.10,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.90,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
39093,97738,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.10,N,4
39094,97739,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3
39095,97740,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25
39096,97741,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4


In [14]:
sample_submission_df

Unnamed: 0,id,loan_status
0,58645,0.5
1,58646,0.5
2,58647,0.5
3,58648,0.5
4,58649,0.5
...,...,...
39093,97738,0.5
39094,97739,0.5
39095,97740,0.5
39096,97741,0.5


In [15]:
# Check for missing data in train dataframe
print("Missing values in train dataframe:")
print(train_df.isnull().sum())

# Check for missing data in test dataframe
print("\nMissing values in test dataframe:")
print(test_df.isnull().sum())

# Alternatively, if you want a more compact summary:
print("\nSummary of missing values in train dataframe:")
print(train_df.isnull().sum().sort_values(ascending=False))

print("\nSummary of missing values in test dataframe:")
print(test_df.isnull().sum().sort_values(ascending=False))


Missing values in train dataframe:
id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

Missing values in test dataframe:
id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

Summary of missing values in train dataframe:
id                            0
person_age               

Baseline Model

In [16]:
# Define features and target
X = train_df.drop(columns=['loan_status'])  # Replace 'loan_status' with your actual target column name
y = train_df['loan_status']


In [17]:
# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing for numerical data (imputation of missing values)
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data (imputation + one-hot encoding)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)


In [19]:
# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Create a pipeline that first transforms the data and then fits the model
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])


In [20]:
# Train the model
clf.fit(X_train, y_train)
# Train the model
clf.fit(X_train, y_train)


In [21]:
# Make predictions on the validation set
y_pred = clf.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


Accuracy: 0.952510870491943

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     10087
           1       0.93      0.72      0.81      1642

    accuracy                           0.95     11729
   macro avg       0.94      0.85      0.89     11729
weighted avg       0.95      0.95      0.95     11729



In [22]:
# Make predictions on the test set
test_predictions = clf.predict(test_df)


In [24]:
# Extract the ID column from the test set
test_ids = test_df['id']

# Create a dataframe for submission
submission_df = pd.DataFrame({
    'id': test_ids,
    'loan_status': test_predictions
})


In [25]:
# Save the results to a CSV file
submission_df.to_csv('Baseline_Model.csv', index=False)
