# Predicting Credit Card Approvals

In [1]:
import pandas as pd

# Load the actual data and target data from different sheets or files
actual_data = pd.read_csv("application_record.csv")
target_data = pd.read_csv("credit_record.csv")

In [2]:
actual_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,M,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0
438553,6840222,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-15939,-3007,1,0,0,0,Laborers,1.0
438554,6841878,F,N,N,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,-8169,-372,1,1,0,0,Sales staff,1.0
438555,6842765,F,N,Y,0,72000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-21673,365243,1,0,0,0,,2.0


In [3]:
target_data

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C
...,...,...,...
1048570,5150487,-25,C
1048571,5150487,-26,C
1048572,5150487,-27,C
1048573,5150487,-28,C


In [4]:
# Merge the two DataFrames using the common 'id' column
merged_data = pd.merge(actual_data, target_data, on='ID', how='inner')

In [5]:
merged_data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,0,C
1,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-1,C
2,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-2,C
3,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-3,C
4,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-4,C


In [6]:
merged_data.shape

(777715, 20)

# Preprocessing the data  

In [7]:
merged_data.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        240048
CNT_FAM_MEMBERS             0
MONTHS_BALANCE              0
STATUS                      0
dtype: int64

In [8]:
merged_data['STATUS'].value_counts()

C    329536
0    290654
X    145950
1      8747
5      1527
2       801
3       286
4       214
Name: STATUS, dtype: int64

In [9]:
import pandas as pd

# Assuming you have a DataFrame named 'merged_data' with a 'STATUS' column
# Replace 'merged_data' with your actual DataFrame name and 'STATUS' with your column name
merged_data = merged_data[~merged_data['STATUS'].isin(['X'])]
# Categorize values 3 or greater in the 'STATUS' column as 'bad customer'
merged_data.loc[merged_data['STATUS'].isin(['3', '4', '5']), 'credit_card_approval'] = 'bad customer'

# Categorize 'x' and values less than 3 in the 'STATUS' column as 'indeterminates'
merged_data.loc[merged_data['STATUS'].isin(['0', '1', '2']), 'credit_card_approval'] = 'indeterminates'

# Categorize 'c' in the 'STATUS' column as 'good customer'
merged_data.loc[merged_data['STATUS'] == 'C', 'credit_card_approval'] = 'good customer'


In [10]:
merged_data['credit_card_approval'].value_counts()

good customer     329536
indeterminates    300202
bad customer        2027
Name: credit_card_approval, dtype: int64

In [11]:
merged_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS,credit_card_approval
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,-4542,1,1,0,0,,2.0,0,C,good customer
1,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,-4542,1,1,0,0,,2.0,-1,C,good customer
2,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,-4542,1,1,0,0,,2.0,-2,C,good customer
3,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,-4542,1,1,0,0,,2.0,-3,C,good customer
4,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,-4542,1,1,0,0,,2.0,-4,C,good customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777710,5150337,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,...,-1193,1,0,0,0,Laborers,1.0,-9,0,indeterminates
777711,5150337,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,...,-1193,1,0,0,0,Laborers,1.0,-10,2,indeterminates
777712,5150337,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,...,-1193,1,0,0,0,Laborers,1.0,-11,1,indeterminates
777713,5150337,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,...,-1193,1,0,0,0,Laborers,1.0,-12,0,indeterminates


In [12]:
# Create a new column 'credit_card_approval' with default value 'yes'
merged_data['credit_card_approval'] = 'yes'

# Define the conditions for credit card approval
bad_status_conditions = (merged_data['STATUS'].isin(['3', '4', '5']))  # Bad status codes 3, 4, and 5
exclude_x_status_conditions = (merged_data['STATUS'] != 'X')  # Exclude 'X' status
recording_period_conditions = (merged_data.groupby('ID')['ID'].transform('count') >= 12)  # Recording period of at least 12 months

# Apply conditions to set 'credit_card_approval' to 'no' for bad customers
merged_data.loc[bad_status_conditions, 'credit_card_approval'] = 'no'

# Exclude customers who only have 'X' status or have recording periods less than 12 months
merged_data.loc[~exclude_x_status_conditions | ~recording_period_conditions, 'credit_card_approval'] = 'no'

# Handling the missing values

In [13]:
merged_data['OCCUPATION_TYPE'].fillna('None', inplace=True)

In [14]:
merged_data.isnull().sum()

ID                      0
CODE_GENDER             0
FLAG_OWN_CAR            0
FLAG_OWN_REALTY         0
CNT_CHILDREN            0
AMT_INCOME_TOTAL        0
NAME_INCOME_TYPE        0
NAME_EDUCATION_TYPE     0
NAME_FAMILY_STATUS      0
NAME_HOUSING_TYPE       0
DAYS_BIRTH              0
DAYS_EMPLOYED           0
FLAG_MOBIL              0
FLAG_WORK_PHONE         0
FLAG_PHONE              0
FLAG_EMAIL              0
OCCUPATION_TYPE         0
CNT_FAM_MEMBERS         0
MONTHS_BALANCE          0
STATUS                  0
credit_card_approval    0
dtype: int64

# Splitting the dataset into train and test sets 

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X = merged_data.drop(columns=['credit_card_approval','STATUS'])  
y = merged_data['credit_card_approval']  

# Convert categorical features to numerical using one-hot encoding
X = pd.get_dummies(X, columns=['CODE_GENDER','FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE',
                              'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS',
                               'OCCUPATION_TYPE'], drop_first=True)

# Generate a unique identifier for each sample
X['unique_id'] = range(len(X))

In [16]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X = merged_data.drop(columns=['credit_card_approval','STATUS'])  
y = merged_data['credit_card_approval']  

# Convert categorical features to numerical using one-hot encoding
X = pd.get_dummies(X, columns=['CODE_GENDER','FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE',
                              'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS',
                               'OCCUPATION_TYPE'], drop_first=True)

# Generate a unique identifier for each sample
X['unique_id'] = range(len(X))

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Logistic Regression model
model = LogisticRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the testing data
y_pred = model.predict(X_test)

# Fitting a model

In [17]:
from sklearn.impute import SimpleImputer

# Instantiate the imputer with the strategy you prefer (e.g., 'mean', 'median', 'most_frequent')
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on X_train and transform it to fill missing values
X_train_imputed = imputer.fit_transform(X_train)



# Making predictions and evaluating performance

In [18]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print the evaluation results
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_report_str)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8762623331398723
Confusion Matrix:
 [[     0  23452]
 [     0 166078]]
Classification Report:
               precision    recall  f1-score   support

          no       0.00      0.00      0.00     23452
         yes       0.88      1.00      0.93    166078

    accuracy                           0.88    189530
   macro avg       0.44      0.50      0.47    189530
weighted avg       0.77      0.88      0.82    189530



  _warn_prf(average, modifier, msg_start, len(result))


# Grid searching and making the model perform better 

In [19]:
# Define hyperparameter grid for grid search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2']  # Regularization type
}

# Create a Logistic Regression model
model = LogisticRegression()

# Create a GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

      nan 0.875383      nan 

Best Hyperparameters: {'C': 0.001, 'penalty': 'l2'}


# Finding the best performing model. 


In [22]:
# Create a Logistic Regression model with the best hyperparameters
best_model = LogisticRegression(C=0.001, penalty='l2')

# Fit the model to the training data
best_model.fit(X_train, y_train)

# Predict on the testing data using the best model
y_preds = best_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_preds)
confusion = confusion_matrix(y_test, y_preds)
classification_report_str = classification_report(y_test, y_preds)

# Print the evaluation results
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_report_str)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8762623331398723
Confusion Matrix:
 [[     0  23452]
 [     0 166078]]
Classification Report:
               precision    recall  f1-score   support

          no       0.00      0.00      0.00     23452
         yes       0.88      1.00      0.93    166078

    accuracy                           0.88    189530
   macro avg       0.44      0.50      0.47    189530
weighted avg       0.77      0.88      0.82    189530



  _warn_prf(average, modifier, msg_start, len(result))
