In [15]:
# Import all necessary libraries and frameworks
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# Read the training set, the test set and the feature dictionary from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Assignment_Train.csv', 'Assignment_Test.csv')
)
feature_dict = pd.read_excel('Assignment_FeatureDictionary.xlsx')

In [17]:
# Sanity check: print the first rows of every loaded frame to confirm the files were parsed.
for loaded_frame in (train_df, test_df, feature_dict):
    print(loaded_frame.head())

   DEALER ID APPLICATION LOGIN DATE HDB BRANCH NAME HDB BRANCH STATE  \
0     106989             07/20/2022        DELHI-SF            DELHI   
1     108975             07/28/2022        PATNA-SF            BIHAR   
2     111004             07/15/2022   DARJEELING-SF      WEST BENGAL   
3     192020               07/04/22   SAHARANPUR-SF    UTTAR PRADESH   
4      55095             07/15/2022       MODASA-SF          GUJARAT   

  FIRST NAME MIDDLE NAME  LAST NAME      mobile AADHAR VERIFIED Cibil Score  \
0      SUNIL         NaN    CHANDER  9210574080              NO         726   
1      AMRIT         NaN      KUMAR  8877987018              NO         NaN   
2    ANIMESH         NaN      THAPA  8910862135              NO         737   
3     ADITYA         NaN      SINGH  9758428017              NO         713   
4     PARMAR  HARESHBHAI  AMRUTBHAI  9687028486              NO         669   

   ...  Phone Social Premium.shaadi Phone Social Premium.skype  \
0  ...                    

In [19]:
# Count the missing values per column, first for the training set and then for the test set.
for dataset in (train_df, test_df):
    print(dataset.isna().sum())

DEALER ID                                 0
APPLICATION LOGIN DATE                    0
HDB BRANCH NAME                           1
HDB BRANCH STATE                        854
FIRST NAME                                0
MIDDLE NAME                            7145
LAST NAME                               681
mobile                                    0
AADHAR VERIFIED                           0
Cibil Score                            4297
MOBILE VERIFICATION                       0
DEALER NAME                               4
TOTAL ASSET COST                       5108
ASSET CTG                              5108
ASSET MODEL NO                            0
APPLIED AMOUNT                            0
PRIMARY ASSET MAKE                        0
Primary Asset Model No                    0
Personal Email Address                    0
MARITAL STATUS                         4894
GENDER                                    0
DOB                                       0
AGE                             

In [29]:
# Keep only the columns that are non-null in at least half of the training rows;
# columns with more missing values than that are discarded.
threshold = 0.5 * len(train_df)
non_null_counts = train_df.count()
train_df_cleaned = train_df.loc[:, non_null_counts >= threshold]

# The test set keeps the same surviving columns, minus the target column
# ('Application Status'), which is removed from the column list before indexing test_df.
feature_columns = train_df_cleaned.columns.drop('Application Status')
test_df_cleaned = test_df[feature_columns]

In [30]:
# Split the cleaned training frame into the target (y) and the feature matrix (X),
# where X is every remaining column except 'Application Status'.
y = train_df_cleaned['Application Status']
X = train_df_cleaned.drop(columns=['Application Status'])

In [31]:
# Group the feature columns by dtype so that each group gets its own preprocessing.
# Numeric columns are every numeric dtype (not only int64/float64); every other column
# (object, bool, category, ...) is treated as categorical. Partitioning on
# "number" / "not number" guarantees that each column lands in exactly one group:
# a column listed in neither group would be silently discarded by ColumnTransformer,
# whose `remainder` option defaults to 'drop'.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

In [32]:
# Numeric features: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(numeric_steps)

# Categorical features: fill missing values with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps categories unseen during fit from raising an
# error at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(categorical_steps)

In [34]:
# Route each column group to its own pipeline inside a single transformer, so the
# numeric and categorical preprocessing are fitted and applied together.
column_routes = [
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [36]:
# Hold out 20% of the labelled rows for validation; random_state=42 fixes the shuffle
# so the same split is produced on every run.
train_val_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_val_parts

# Chain the preprocessing and the classifier into one estimator, so the preprocessing
# is fitted on the training split only.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# Fit the preprocessing steps and the random forest on the training split.
model_pipeline.fit(X_train, y_train)

In [37]:
# Score the fitted pipeline on the training split and on the held-out validation split.
y_train_pred = model_pipeline.predict(X_train)
y_val_pred = model_pipeline.predict(X_val)

train_accuracy, val_accuracy = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_val, y_val_pred),
)
# Per-class precision / recall / F1 on the validation split.
classification_rep = classification_report(y_val, y_val_pred)

print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)
print("Classification Report:\n", classification_rep)

Training Accuracy: 0.99975
Validation Accuracy: 0.7695
Classification Report:
               precision    recall  f1-score   support

    APPROVED       0.76      0.96      0.85      1327
    DECLINED       0.84      0.39      0.53       673

    accuracy                           0.77      2000
   macro avg       0.80      0.68      0.69      2000
weighted avg       0.78      0.77      0.74      2000



In [38]:
# Run the fitted pipeline on the cleaned test set.
test_predictions = model_pipeline.predict(test_df_cleaned)

# Pair each prediction with the test row's 'DEALER ID' value, exported under the header 'UID'.
# NOTE(review): 'UID' is filled from 'DEALER ID' — confirm that this is the identifier
# the submission expects.
predictions_df = (
    test_df_cleaned[['DEALER ID']]
    .rename(columns={'DEALER ID': 'UID'})
    .assign(Prediction=test_predictions)
)

# Write the predictions to disk without the index column.
predictions_df.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv


In [39]:
# Reload the saved file and print its first rows to confirm it was written as expected.
pds = pd.read_csv("predictions.csv")
print(pds.head(5))

      UID Prediction
0  105615   APPROVED
1   91593   APPROVED
2   74152   APPROVED
3  110164   DECLINED
4  113037   APPROVED


In [None]:
# We used a Random Forest model because it handles a wide range of data types and provides robust predictions,
# and, combined with the imputation steps in the preprocessing pipeline, it maintains good performance even when the data has missing values.