In [1]:
import pandas as pd

# Fetch the dataset from its Google Drive URL.
# No separator is passed here, so pandas parses the file as comma-separated.
data_url = "https://drive.google.com/uc?id=1-VAbiNvEFiLa4e9xJ8fq7UeqxMKk63r7"
df = pd.read_csv(filepath_or_buffer=data_url)

In [2]:
# Report the (rows, columns) shape of the frame that was just loaded.
loaded_shape = df.shape
print("\nShape of the DataFrame:", loaded_shape)


Shape of the DataFrame: (50000, 1)


In [3]:
# Reload the same URL, this time parsing it as tab-separated text.
# low_memory=False has pandas process the file in one pass rather than in chunks.
df = pd.read_csv(data_url, sep='\t', low_memory=False)

In [4]:
# Re-check the dimensions after the tab-separated reload.
print("\nShape of the DataFrame:", (len(df.index), len(df.columns)))


Shape of the DataFrame: (50000, 54)


In [5]:
# List the column labels of the loaded frame.
column_index = df.columns
print(column_index)

Index(['ID_CLIENT', 'CLERK_TYPE', 'PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE',
       'QUANT_ADDITIONAL_CARDS', 'POSTAL_ADDRESS_TYPE', 'SEX',
       'MARITAL_STATUS', 'QUANT_DEPENDANTS', 'EDUCATION_LEVEL',
       'STATE_OF_BIRTH', 'CITY_OF_BIRTH', 'NATIONALITY', 'RESIDENCIAL_STATE',
       'RESIDENCIAL_CITY', 'RESIDENCIAL_BOROUGH', 'FLAG_RESIDENCIAL_PHONE',
       'RESIDENCIAL_PHONE_AREA_CODE', 'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE',
       'FLAG_MOBILE_PHONE', 'FLAG_EMAIL', 'PERSONAL_MONTHLY_INCOME',
       'OTHER_INCOMES', 'FLAG_VISA', 'FLAG_MASTERCARD', 'FLAG_DINERS',
       'FLAG_AMERICAN_EXPRESS', 'FLAG_OTHER_CARDS', 'QUANT_BANKING_ACCOUNTS',
       'QUANT_SPECIAL_BANKING_ACCOUNTS', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS',
       'COMPANY', 'PROFESSIONAL_STATE', 'PROFESSIONAL_CITY',
       'PROFESSIONAL_BOROUGH', 'FLAG_PROFESSIONAL_PHONE',
       'PROFESSIONAL_PHONE_AREA_CODE', 'MONTHS_IN_THE_JOB', 'PROFESSION_CODE',
       'OCCUPATION_TYPE', 'MATE_PROFESSION_CODE', 'EDUCATION_LEV

In [6]:
# Preview the first five rows as plain text.
print(df.head(n=5))

   ID_CLIENT CLERK_TYPE  PAYMENT_DAY APPLICATION_SUBMISSION_TYPE  \
1          1          C       -99999                           0   
2          2          C       -99999                         Web   
3          3          C       -99999                           0   
4          4          C       -99999                         Web   
5          5          C       -99999                         Web   

   QUANT_ADDITIONAL_CARDS  POSTAL_ADDRESS_TYPE SEX  MARITAL_STATUS  \
1                       0                    1   F               6   
2                       0                    1   F               2   
3                       0                    1   F               2   
4                       0                    1   F               2   
5                       0                    1   M               2   

   QUANT_DEPENDANTS  EDUCATION_LEVEL  ... FLAG_HOME_ADDRESS_DOCUMENT FLAG_RG  \
1                 1              NaN  ...                          0       0   
2         

In [7]:
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 1 to 50000
Data columns (total 54 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID_CLIENT                       50000 non-null  int64  
 1   CLERK_TYPE                      50000 non-null  object 
 2   PAYMENT_DAY                     50000 non-null  int64  
 3   APPLICATION_SUBMISSION_TYPE     50000 non-null  object 
 4   QUANT_ADDITIONAL_CARDS          50000 non-null  int64  
 5   POSTAL_ADDRESS_TYPE             50000 non-null  int64  
 6   SEX                             50000 non-null  object 
 7   MARITAL_STATUS                  50000 non-null  int64  
 8   QUANT_DEPENDANTS                50000 non-null  int64  
 9   EDUCATION_LEVEL                 826 non-null    float64
 10  STATE_OF_BIRTH                  50000 non-null  object 
 11  CITY_OF_BIRTH                   50000 non-null  object 
 12  NATIONALITY                     

In [8]:
# Descriptive statistics (count, mean, std, quantiles, ...) as plain text.
summary_stats = df.describe()
print(summary_stats)

          ID_CLIENT   PAYMENT_DAY  QUANT_ADDITIONAL_CARDS  \
count  50000.000000  50000.000000                 50000.0   
mean   25000.500000   -167.151640                     0.0   
std    14433.901067   4239.371262                     0.0   
min        1.000000 -99999.000000                     0.0   
25%    12500.750000     10.000000                     0.0   
50%    25000.500000     10.000000                     0.0   
75%    37500.250000     15.000000                     0.0   
max    50000.000000     25.000000                     0.0   

       POSTAL_ADDRESS_TYPE  MARITAL_STATUS  QUANT_DEPENDANTS  EDUCATION_LEVEL  \
count         50000.000000     50000.00000      50000.000000       826.000000   
mean              1.006540         2.14840          0.650520         1.721550   
std               0.080606         1.32285          1.193655         0.448508   
min               1.000000         0.00000          0.000000         1.000000   
25%               1.000000         1.00000   

In [9]:
# 1. Handling Missing Values
# Flag each column that holds at least one null, then keep the names of the
# flagged columns in their original order.
has_missing = df.isna().any()
columns_with_missing_values = has_missing[has_missing].index.tolist()
print("Columns with missing values:", columns_with_missing_values)

Columns with missing values: ['EDUCATION_LEVEL', 'RESIDENCIAL_PHONE_AREA_CODE', 'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE', 'PROFESSIONAL_CITY', 'PROFESSIONAL_BOROUGH', 'PROFESSIONAL_PHONE_AREA_CODE', 'PROFESSION_CODE', 'OCCUPATION_TYPE', 'MATE_PROFESSION_CODE', 'EDUCATION_LEVEL.1']


In [10]:
# Fraction of null entries per affected column, scaled to a percentage.
null_share = df[columns_with_missing_values].isna().mean()
missing_value_percentages = null_share.mul(100)
print("\nPercentage of missing values in each column:")
print(missing_value_percentages)



Percentage of missing values in each column:
EDUCATION_LEVEL                 98.348
RESIDENCIAL_PHONE_AREA_CODE     16.424
RESIDENCE_TYPE                   2.698
MONTHS_IN_RESIDENCE              7.554
PROFESSIONAL_CITY               67.566
PROFESSIONAL_BOROUGH            67.566
PROFESSIONAL_PHONE_AREA_CODE    73.064
PROFESSION_CODE                 15.512
OCCUPATION_TYPE                 14.626
MATE_PROFESSION_CODE            57.768
EDUCATION_LEVEL.1               64.676
dtype: float64


In [11]:
# Fill the gaps in each listed numeric column with that column's own median.
# NOTE(review): the median is taken over the full frame, before any
# train/test split — confirm this is intended.
numerical_columns_to_impute = ['RESIDENCIAL_PHONE_AREA_CODE', 'MONTHS_IN_RESIDENCE']
for col in numerical_columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

In [12]:
# Fill the gaps in each listed categorical column with its most frequent value.
# mode() can return several values on ties; the first one is used.
categorical_columns_to_impute = ['RESIDENCE_TYPE', 'PROFESSION_CODE', 'OCCUPATION_TYPE']
for column in categorical_columns_to_impute:
    mode_value = df[column].mode().iloc[0]
    df[column] = df[column].fillna(value=mode_value)

In [13]:
# Drop columns with high percentage of missing values
columns_to_drop = ['EDUCATION_LEVEL', 'PROFESSIONAL_CITY', 'PROFESSIONAL_BOROUGH',
                   'PROFESSIONAL_PHONE_AREA_CODE', 'MATE_PROFESSION_CODE', 'EDUCATION_LEVEL.1']
# Rebind rather than mutate in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError. `columns=` already
# selects the axis, so the redundant `axis=1` is not passed.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [14]:
# Per-column count of nulls that remain after imputing and dropping.
print("\nMissing values after imputation and dropping columns:")
remaining_nulls = df.isna().sum()
print(remaining_nulls)


Missing values after imputation and dropping columns:
ID_CLIENT                         0
CLERK_TYPE                        0
PAYMENT_DAY                       0
APPLICATION_SUBMISSION_TYPE       0
QUANT_ADDITIONAL_CARDS            0
POSTAL_ADDRESS_TYPE               0
SEX                               0
MARITAL_STATUS                    0
QUANT_DEPENDANTS                  0
STATE_OF_BIRTH                    0
CITY_OF_BIRTH                     0
NATIONALITY                       0
RESIDENCIAL_STATE                 0
RESIDENCIAL_CITY                  0
RESIDENCIAL_BOROUGH               0
FLAG_RESIDENCIAL_PHONE            0
RESIDENCIAL_PHONE_AREA_CODE       0
RESIDENCE_TYPE                    0
MONTHS_IN_RESIDENCE               0
FLAG_MOBILE_PHONE                 0
FLAG_EMAIL                        0
PERSONAL_MONTHLY_INCOME           0
OTHER_INCOMES                     0
FLAG_VISA                         0
FLAG_MASTERCARD                   0
FLAG_DINERS                       0
FLAG_AMER

In [15]:
!pip install category_encoders



In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder, OneHotEncoder

# Separate the label column 'TARGET_LABEL_BAD.1' from the remaining columns.
y = df['TARGET_LABEL_BAD.1']
X = df.drop(columns=['TARGET_LABEL_BAD.1'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [17]:
# Rebuild features and label from `df` (every column except the label is kept),
# then redo the same seeded 70/30 split.
target_column = 'TARGET_LABEL_BAD.1'
y = df[target_column]
X = df.drop(columns=target_column)

split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("Initial columns in X_train:", X_train.columns)  # Check which columns are available


Initial columns in X_train: Index(['ID_CLIENT', 'CLERK_TYPE', 'PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE',
       'QUANT_ADDITIONAL_CARDS', 'POSTAL_ADDRESS_TYPE', 'SEX',
       'MARITAL_STATUS', 'QUANT_DEPENDANTS', 'STATE_OF_BIRTH', 'CITY_OF_BIRTH',
       'NATIONALITY', 'RESIDENCIAL_STATE', 'RESIDENCIAL_CITY',
       'RESIDENCIAL_BOROUGH', 'FLAG_RESIDENCIAL_PHONE',
       'RESIDENCIAL_PHONE_AREA_CODE', 'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE',
       'FLAG_MOBILE_PHONE', 'FLAG_EMAIL', 'PERSONAL_MONTHLY_INCOME',
       'OTHER_INCOMES', 'FLAG_VISA', 'FLAG_MASTERCARD', 'FLAG_DINERS',
       'FLAG_AMERICAN_EXPRESS', 'FLAG_OTHER_CARDS', 'QUANT_BANKING_ACCOUNTS',
       'QUANT_SPECIAL_BANKING_ACCOUNTS', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS',
       'COMPANY', 'PROFESSIONAL_STATE', 'FLAG_PROFESSIONAL_PHONE',
       'MONTHS_IN_THE_JOB', 'PROFESSION_CODE', 'OCCUPATION_TYPE',
       'FLAG_HOME_ADDRESS_DOCUMENT', 'FLAG_RG', 'FLAG_CPF',
       'FLAG_INCOME_PROOF', 'PRODUCT', 'FLAG_ACSP_RECORD', '

In [18]:
# Show the column labels of both splits.
train_columns, test_columns = X_train.columns, X_test.columns
print("Columns in X_train:", train_columns)
print("Columns in X_test:", test_columns)


Columns in X_train: Index(['ID_CLIENT', 'CLERK_TYPE', 'PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE',
       'QUANT_ADDITIONAL_CARDS', 'POSTAL_ADDRESS_TYPE', 'SEX',
       'MARITAL_STATUS', 'QUANT_DEPENDANTS', 'STATE_OF_BIRTH', 'CITY_OF_BIRTH',
       'NATIONALITY', 'RESIDENCIAL_STATE', 'RESIDENCIAL_CITY',
       'RESIDENCIAL_BOROUGH', 'FLAG_RESIDENCIAL_PHONE',
       'RESIDENCIAL_PHONE_AREA_CODE', 'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE',
       'FLAG_MOBILE_PHONE', 'FLAG_EMAIL', 'PERSONAL_MONTHLY_INCOME',
       'OTHER_INCOMES', 'FLAG_VISA', 'FLAG_MASTERCARD', 'FLAG_DINERS',
       'FLAG_AMERICAN_EXPRESS', 'FLAG_OTHER_CARDS', 'QUANT_BANKING_ACCOUNTS',
       'QUANT_SPECIAL_BANKING_ACCOUNTS', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS',
       'COMPANY', 'PROFESSIONAL_STATE', 'FLAG_PROFESSIONAL_PHONE',
       'MONTHS_IN_THE_JOB', 'PROFESSION_CODE', 'OCCUPATION_TYPE',
       'FLAG_HOME_ADDRESS_DOCUMENT', 'FLAG_RG', 'FLAG_CPF',
       'FLAG_INCOME_PROOF', 'PRODUCT', 'FLAG_ACSP_RECORD', 'AGE',
  

In [19]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'SEX' and 'MARITAL_STATUS'. The encoder is fitted on the
# training split only and then reused, unchanged, on the test split.
encoded_columns = ['SEX', 'MARITAL_STATUS']

# drop='first' to avoid multicollinearity.
# The dense-output keyword is deliberately not passed: it was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and `sparse` was removed in
# 1.4. Densifying the default sparse result with .toarray() yields the same
# array and runs on either side of that rename.
encoder = OneHotEncoder(drop='first')

# Fit and transform the training data
encoder.fit(X_train[encoded_columns])
encoded_data = encoder.transform(X_train[encoded_columns]).toarray()
encoded_features = encoder.get_feature_names_out(encoded_columns)
X_train_encoded = pd.DataFrame(encoded_data, columns=encoded_features, index=X_train.index)

# Transform the test data using the same encoder
encoded_data_test = encoder.transform(X_test[encoded_columns]).toarray()
X_test_encoded = pd.DataFrame(encoded_data_test, columns=encoded_features, index=X_test.index)

# Replace the raw columns with their encoded counterparts (appended at the end).
X_train = pd.concat([X_train.drop(encoded_columns, axis=1), X_train_encoded], axis=1)
X_test = pd.concat([X_test.drop(encoded_columns, axis=1), X_test_encoded], axis=1)





In [20]:
from category_encoders import TargetEncoder

# Target-encode 'RESIDENCIAL_CITY': the encoder is fitted with the training
# features and labels, and the test column is only transformed.
target_encoder = TargetEncoder()

city_train_encoded = target_encoder.fit_transform(X_train['RESIDENCIAL_CITY'], y_train)
X_train['RESIDENCIAL_CITY'] = city_train_encoded

city_test_encoded = target_encoder.transform(X_test['RESIDENCIAL_CITY'])
X_test['RESIDENCIAL_CITY'] = city_test_encoded


In [21]:
# Frequency-encode 'PROFESSIONAL_ZIP_3' using relative frequencies measured on
# the training split only.
zip_column = 'PROFESSIONAL_ZIP_3'
frequency = X_train[zip_column].value_counts(normalize=True)

# Replace each value by its training frequency in both splits. Values absent
# from `frequency` come out of .map() as NaN.
X_train[zip_column] = X_train[zip_column].map(frequency)
X_test[zip_column] = X_test[zip_column].map(frequency)


In [22]:
# Show the column labels of both splits after the encoding steps.
train_columns_encoded = X_train.columns
test_columns_encoded = X_test.columns
print("Columns in X_train post-encoding:", train_columns_encoded)
print("Columns in X_test post-encoding:", test_columns_encoded)


Columns in X_train post-encoding: Index(['ID_CLIENT', 'CLERK_TYPE', 'PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE',
       'QUANT_ADDITIONAL_CARDS', 'POSTAL_ADDRESS_TYPE', 'QUANT_DEPENDANTS',
       'STATE_OF_BIRTH', 'CITY_OF_BIRTH', 'NATIONALITY', 'RESIDENCIAL_STATE',
       'RESIDENCIAL_CITY', 'RESIDENCIAL_BOROUGH', 'FLAG_RESIDENCIAL_PHONE',
       'RESIDENCIAL_PHONE_AREA_CODE', 'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE',
       'FLAG_MOBILE_PHONE', 'FLAG_EMAIL', 'PERSONAL_MONTHLY_INCOME',
       'OTHER_INCOMES', 'FLAG_VISA', 'FLAG_MASTERCARD', 'FLAG_DINERS',
       'FLAG_AMERICAN_EXPRESS', 'FLAG_OTHER_CARDS', 'QUANT_BANKING_ACCOUNTS',
       'QUANT_SPECIAL_BANKING_ACCOUNTS', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS',
       'COMPANY', 'PROFESSIONAL_STATE', 'FLAG_PROFESSIONAL_PHONE',
       'MONTHS_IN_THE_JOB', 'PROFESSION_CODE', 'OCCUPATION_TYPE',
       'FLAG_HOME_ADDRESS_DOCUMENT', 'FLAG_RG', 'FLAG_CPF',
       'FLAG_INCOME_PROOF', 'PRODUCT', 'FLAG_ACSP_RECORD', 'AGE',
       'RESIDENCIAL_

In [23]:
# Dimensions of the two feature frames.
train_shape, test_shape = X_train.shape, X_test.shape
print("Shape of X_train:", train_shape)
print("Shape of X_test:", test_shape)


Shape of X_train: (35000, 55)
Shape of X_test: (15000, 55)


In [24]:
# Null counts per column in the training features; only non-zero counts are shown.
missing_values_train = X_train.isna().sum()
print("Missing values in X_train:")
print(missing_values_train.loc[missing_values_train.gt(0)])

# Same check for the test features.
missing_values_test = X_test.isna().sum()
print("Missing values in X_test:")
print(missing_values_test.loc[missing_values_test.gt(0)])


Missing values in X_train:
Series([], dtype: int64)
Missing values in X_test:
PROFESSIONAL_ZIP_3    27
dtype: int64


In [25]:
# Per-column dtypes of the training features.
print("Data types in X_train:")
train_dtypes = X_train.dtypes
print(train_dtypes)

Data types in X_train:
ID_CLIENT                           int64
CLERK_TYPE                         object
PAYMENT_DAY                         int64
APPLICATION_SUBMISSION_TYPE        object
QUANT_ADDITIONAL_CARDS              int64
POSTAL_ADDRESS_TYPE                 int64
QUANT_DEPENDANTS                    int64
STATE_OF_BIRTH                     object
CITY_OF_BIRTH                      object
NATIONALITY                         int64
RESIDENCIAL_STATE                  object
RESIDENCIAL_CITY                  float64
RESIDENCIAL_BOROUGH                object
FLAG_RESIDENCIAL_PHONE             object
RESIDENCIAL_PHONE_AREA_CODE       float64
RESIDENCE_TYPE                    float64
MONTHS_IN_RESIDENCE               float64
FLAG_MOBILE_PHONE                  object
FLAG_EMAIL                          int64
PERSONAL_MONTHLY_INCOME           float64
OTHER_INCOMES                     float64
FLAG_VISA                           int64
FLAG_MASTERCARD                     int64
FLAG_DINERS

In [26]:
# Count null labels in each split.
missing_y_train = y_train.isna().sum()
print("Missing values in y_train:", missing_y_train)

missing_y_test = y_test.isna().sum()
print("Missing values in y_test:", missing_y_test)


Missing values in y_train: 0
Missing values in y_test: 0


In [27]:
# Relative class frequencies (fractions summing to 1) in each split.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)

print("Distribution in y_train:")
print(train_class_share)

print("Distribution in y_test:")
print(test_class_share)


Distribution in y_train:
0    0.739514
1    0.260486
Name: TARGET_LABEL_BAD.1, dtype: float64
Distribution in y_test:
0    0.7384
1    0.2616
Name: TARGET_LABEL_BAD.1, dtype: float64


In [28]:
# dtype of the label Series in each split.
train_label_dtype, test_label_dtype = y_train.dtype, y_test.dtype
print("Data type of y_train:", train_label_dtype)
print("Data type of y_test:", test_label_dtype)


Data type of y_train: int64
Data type of y_test: int64


In [29]:
# Row counts of features and labels, so mismatched lengths are easy to spot.
print("Number of instances in X_train:", len(X_train))
print("Number of instances in y_train:", len(y_train))
print("Number of instances in X_test:", len(X_test))
print("Number of instances in y_test:", len(y_test))


Number of instances in X_train: 35000
Number of instances in y_train: 35000
Number of instances in X_test: 15000
Number of instances in y_test: 15000


In [30]:
# Impute missing values in 'PROFESSIONAL_ZIP_3' using median from the training data
median_value = X_train['PROFESSIONAL_ZIP_3'].median()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: the chained in-place form may act on a temporary object
# and leaves X_test unchanged when pandas Copy-on-Write is active.
X_test['PROFESSIONAL_ZIP_3'] = X_test['PROFESSIONAL_ZIP_3'].fillna(median_value)


In [31]:
# Re-run the null check on the training features; only non-zero counts are shown.
missing_values_train = X_train.isna().sum()
train_columns_with_nulls = missing_values_train[missing_values_train > 0]
print("Missing values in X_train:")
print(train_columns_with_nulls)

# Same check for the test features.
missing_values_test = X_test.isna().sum()
test_columns_with_nulls = missing_values_test[missing_values_test > 0]
print("Missing values in X_test:")
print(test_columns_with_nulls)

Missing values in X_train:
Series([], dtype: int64)
Missing values in X_test:
Series([], dtype: int64)


In [32]:
# Grand total of null cells in each feature frame (0 means fully populated).
total_nulls_train = X_train.isna().sum().sum()
total_nulls_test = X_test.isna().sum().sum()
print("Missing values in X_train:", total_nulls_train)
print("Missing values in X_test:", total_nulls_test)


Missing values in X_train: 0
Missing values in X_test: 0


In [33]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode; every other column is passed through untouched.
categorical_features = ['CLERK_TYPE', 'APPLICATION_SUBMISSION_TYPE', 'STATE_OF_BIRTH', 'CITY_OF_BIRTH', 'RESIDENCIAL_STATE',
                        'RESIDENCIAL_BOROUGH', 'FLAG_RESIDENCIAL_PHONE', 'FLAG_MOBILE_PHONE', 'COMPANY', 'PROFESSIONAL_STATE',
                        'FLAG_PROFESSIONAL_PHONE', 'PRODUCT', 'FLAG_ACSP_RECORD', 'RESIDENCIAL_ZIP_3']

# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted.
one_hot = OneHotEncoder(handle_unknown='ignore')
column_transformer = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_features)],
    remainder='passthrough',
)

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_transformed = column_transformer.fit_transform(X_train)
X_test_transformed = column_transformer.transform(X_test)

# Note: X_train and X_test themselves are left as they were; the transformed
# results are new objects and are not DataFrames.


In [34]:
# Peek at the first five rows of the transformed training data.
first_rows = X_train_transformed[:5]
print("Sample data from X_train_transformed:\n", first_rows)


Sample data from X_train_transformed:
   (0, 0)	1.0
  (0, 3)	1.0
  (0, 24)	1.0
  (0, 7774)	1.0
  (0, 8149)	1.0
  (0, 9429)	1.0
  (0, 19696)	1.0
  (0, 19697)	1.0
  (0, 19699)	1.0
  (0, 19720)	1.0
  (0, 19729)	1.0
  (0, 19730)	1.0
  (0, 19733)	1.0
  (0, 20138)	1.0
  (0, 20507)	38095.0
  (0, 20508)	25.0
  (0, 20510)	1.0
  (0, 20512)	1.0
  (0, 20513)	0.29446107077316536
  (0, 20514)	105.0
  (0, 20515)	1.0
  (0, 20516)	1.0
  (0, 20517)	1.0
  (0, 20518)	355.0
  (0, 20530)	9.0
  :	:
  (4, 19697)	1.0
  (4, 19699)	1.0
  (4, 19700)	1.0
  (4, 19728)	1.0
  (4, 19730)	1.0
  (4, 19733)	1.0
  (4, 20459)	1.0
  (4, 20507)	41709.0
  (4, 20508)	10.0
  (4, 20510)	1.0
  (4, 20511)	3.0
  (4, 20513)	0.2265943154066442
  (4, 20514)	61.0
  (4, 20515)	1.0
  (4, 20517)	1.0
  (4, 20518)	776.0
  (4, 20525)	1.0
  (4, 20526)	1.0
  (4, 20528)	1.0
  (4, 20530)	9.0
  (4, 20531)	2.0
  (4, 20536)	41.0
  (4, 20537)	0.00017142857142857143
  (4, 20538)	1.0
  (4, 20542)	1.0


In [35]:
# Column labels of X_train as a plain Python list.
print("Columns in X_train:", list(X_train.columns))

Columns in X_train: ['ID_CLIENT', 'CLERK_TYPE', 'PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE', 'QUANT_ADDITIONAL_CARDS', 'POSTAL_ADDRESS_TYPE', 'QUANT_DEPENDANTS', 'STATE_OF_BIRTH', 'CITY_OF_BIRTH', 'NATIONALITY', 'RESIDENCIAL_STATE', 'RESIDENCIAL_CITY', 'RESIDENCIAL_BOROUGH', 'FLAG_RESIDENCIAL_PHONE', 'RESIDENCIAL_PHONE_AREA_CODE', 'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE', 'FLAG_MOBILE_PHONE', 'FLAG_EMAIL', 'PERSONAL_MONTHLY_INCOME', 'OTHER_INCOMES', 'FLAG_VISA', 'FLAG_MASTERCARD', 'FLAG_DINERS', 'FLAG_AMERICAN_EXPRESS', 'FLAG_OTHER_CARDS', 'QUANT_BANKING_ACCOUNTS', 'QUANT_SPECIAL_BANKING_ACCOUNTS', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS', 'COMPANY', 'PROFESSIONAL_STATE', 'FLAG_PROFESSIONAL_PHONE', 'MONTHS_IN_THE_JOB', 'PROFESSION_CODE', 'OCCUPATION_TYPE', 'FLAG_HOME_ADDRESS_DOCUMENT', 'FLAG_RG', 'FLAG_CPF', 'FLAG_INCOME_PROOF', 'PRODUCT', 'FLAG_ACSP_RECORD', 'AGE', 'RESIDENCIAL_ZIP_3', 'PROFESSIONAL_ZIP_3', 'SEX_F', 'SEX_M', 'SEX_N', 'MARITAL_STATUS_1', 'MARITAL_STATUS_2', 'MARITAL_STATU

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Names must match the X_train column labels exactly.
categorical_features = ['CLERK_TYPE', 'APPLICATION_SUBMISSION_TYPE', 'STATE_OF_BIRTH', 'CITY_OF_BIRTH', 'RESIDENCIAL_STATE',
                        'RESIDENCIAL_BOROUGH', 'FLAG_RESIDENCIAL_PHONE', 'FLAG_MOBILE_PHONE', 'COMPANY', 'PROFESSIONAL_STATE',
                        'FLAG_PROFESSIONAL_PHONE', 'PRODUCT', 'FLAG_ACSP_RECORD', 'RESIDENCIAL_ZIP_3']

# One-hot encode the listed columns and pass all remaining columns through.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
column_transformer = ColumnTransformer(transformers=[categorical_step], remainder='passthrough')

# Fit on the training split, then reuse the fitted transformer for the test split.
column_transformer.fit(X_train)
X_train_transformed = column_transformer.transform(X_train)
X_test_transformed = column_transformer.transform(X_test)


In [37]:
# Names of the one-hot columns produced by the fitted 'cat' encoder.
cat_encoder = column_transformer.named_transformers_['cat']
transformed_cat_features = cat_encoder.get_feature_names_out()

# The remaining X_train columns (in their existing order) follow the one-hot names.
passthrough_names = [name for name in X_train.columns if name not in categorical_features]
new_feature_names = [*transformed_cat_features, *passthrough_names]


In [38]:
def _as_dense(matrix):
    """Return ``matrix.toarray()`` when that method exists, else ``matrix`` unchanged."""
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# NOTE(review): the recorded sample output shows column indices above 20,000,
# so these dense arrays may be very large — confirm memory is sufficient.
X_train_dense = _as_dense(X_train_transformed)
X_test_dense = _as_dense(X_test_transformed)

# Wrap the arrays back into DataFrames, reusing the original row indices.
# This rebinds X_train / X_test to the fully encoded frames.
X_train = pd.DataFrame(X_train_dense, index=X_train.index, columns=new_feature_names)
X_test = pd.DataFrame(X_test_dense, index=X_test.index, columns=new_feature_names)


In [39]:
### MODELLING ###

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline logistic regression; fit() returns the estimator itself.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_log = log_model.predict(X_test)

# Overall accuracy plus per-class precision/recall/F1.
baseline_accuracy = accuracy_score(y_test, y_pred_log)
baseline_report = classification_report(y_test, y_pred_log)
print("Accuracy:", baseline_accuracy)
print("Classification Report:\n", baseline_report)


Accuracy: 0.7384
Classification Report:
               precision    recall  f1-score   support

           0       0.74      1.00      0.85     11076
           1       0.50      0.00      0.00      3924

    accuracy                           0.74     15000
   macro avg       0.62      0.50      0.43     15000
weighted avg       0.68      0.74      0.63     15000



In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Logistic regression again, this time with class_weight='balanced' and a fixed seed.
log_model = LogisticRegression(
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
log_model.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_log = log_model.predict(X_test)

# Same metrics as the unweighted model, for side-by-side comparison.
adjusted_accuracy = accuracy_score(y_test, y_pred_log)
adjusted_report = classification_report(y_test, y_pred_log)
print("Adjusted Logistic Regression Accuracy:", adjusted_accuracy)
print("Adjusted Classification Report:\n", adjusted_report)


Adjusted Logistic Regression Accuracy: 0.5741333333333334
Adjusted Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.61      0.68     11076
           1       0.30      0.46      0.36      3924

    accuracy                           0.57     15000
   macro avg       0.53      0.54      0.52     15000
weighted avg       0.64      0.57      0.60     15000



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random forest with 100 trees, balanced class weights and a fixed seed.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_rf = rf_model.predict(X_test)

# Metrics: confusion matrix, per-class report and overall accuracy.
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:\n", report_rf)
print("Random Forest Confusion Matrix:\n", conf_matrix_rf)
