In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scipy.stats import norm

## DATA VISUALISATION

In [2]:
# Load the application data; read_csv's default delimiter is already a comma.
data_indiv = pd.read_csv('application_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'application_data.csv'

In [None]:
# Bare last expression: shows the notebook's rich preview of the loaded frame.
data_indiv

In [None]:
# Column-by-column summary; verbose=True lists every column and
# show_counts=True requests the non-null count for each of them.
data_indiv.info(verbose=True, show_counts=True)

In [None]:
# Number of distinct values per column (column-wise, the pandas default axis).
unique_value_counts = data_indiv.nunique(axis=0)
unique_value_counts

In [None]:
# Names of the columns that hold exactly one distinct value.
unique_columns = unique_value_counts.loc[unique_value_counts.eq(1)].index.to_list()
unique_columns

In [None]:
# Summary statistics of the raw frame (pandas describes numeric columns by default).
data_indiv.describe()

In [None]:
# Split the column labels by dtype: numeric versus everything else.
numerical_columns = data_indiv.select_dtypes(include='number').columns
categorical_columns = data_indiv.select_dtypes(exclude='number').columns

In [None]:
# Show the labels of the numeric columns found above.
numerical_columns

In [None]:
# One bar chart per non-numeric column showing how often each value occurs.

for column in categorical_columns:
    ax = data_indiv[column].value_counts().plot.bar()
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# Share of each TARGET value, drawn as a 5x5 inch pie chart.

values = data_indiv['TARGET'].value_counts()
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(values, labels=values.index, autopct='%1.1f%%')
plt.show()

## DATA CLEANING


In [None]:

# Column count of the raw frame (second entry of its shape).
num_columns = data_indiv.shape[1]
print("Number of columns:", num_columns)

In [None]:
# Collapse the per-document indicator columns (names containing 'FLAG_DOCUMENT_')
# into a single count of submitted documents per row.
flag_columns = [col for col in data_indiv.columns if 'FLAG_DOCUMENT_' in col]

# Re-run guard: once the flag columns have been dropped the list above is empty,
# and summing over zero columns would silently overwrite the existing total with
# zeros. Only (re)compute when there is something to sum, or when the total has
# never been created.
if flag_columns or 'TOTAL_DOCUMENTS_SUBMITTED' not in data_indiv.columns:
    # Row-wise sum of the indicator columns
    data_indiv['TOTAL_DOCUMENTS_SUBMITTED'] = data_indiv[flag_columns].sum(axis=1)

    # The individual flags are no longer needed once aggregated
    data_indiv.drop(columns=flag_columns, inplace=True)



In [None]:
# Pull out the aggregated document count for a quick sanity check
total_submitted_column = data_indiv.loc[:, 'TOTAL_DOCUMENTS_SUBMITTED']

# First rows, followed by the summary statistics of the column
for summary in (total_submitted_column.head(), total_submitted_column.describe()):
    print(summary)

In [None]:

# Correlation analysis on the numeric columns of the raw frame.
numeric_data = data_indiv.select_dtypes(include='number')

# Pairwise correlation between every pair of numeric columns
correlation_matrix = numeric_data.corr()

# Absolute correlation above which two features count as highly correlated
threshold = 0.7  # Adjust as needed

# Exclude each feature's correlation with itself by position rather than by
# testing `abs() < 1.0`: the value test also hides distinct features that are
# perfectly correlated, and it depends on the diagonal being exactly 1.0.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
highly_correlated = (correlation_matrix.abs() > threshold) & off_diagonal

# Keep the coefficient where the mask is True; everything else becomes NaN
highly_correlated_matrix = correlation_matrix[highly_correlated]

# Set up the matplotlib figure
plt.figure(figsize=(12, 10))

# Draw the heatmap (NaN cells are left blank)
sns.heatmap(highly_correlated_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)

# Add title
plt.title('Highly Correlated Features')

# Show plot
plt.show()

In [None]:
# Map each feature to the features it is highly correlated with;
# features without any such partner are left out of the mapping.
correlated_variables_dict = {}

for col, flags in highly_correlated.items():
    # Index labels where the boolean mask for this column is True
    partners = flags[flags].index.tolist()
    if partners:
        correlated_variables_dict[col] = partners

# Report one line per feature
for name, partners in correlated_variables_dict.items():
    print(f"{name}: {partners}")

In [None]:
# Remove the highly correlated columns; the result is a new frame named `data`.
data = data_indiv.drop(
    columns=[
        'REGION_RATING_CLIENT',
        'LIVE_REGION_NOT_WORK_REGION',
        'REG_REGION_NOT_WORK_REGION',
        'LIVE_CITY_NOT_WORK_CITY',
        'CNT_CHILDREN',
        'AMT_GOODS_PRICE',
        'AMT_ANNUITY',
        'FLAG_EMP_PHONE',
        'ELEVATORS_AVG',
        'LIVINGAPARTMENTS_AVG',
        'LIVINGAREA_AVG',
        'APARTMENTS_MODE',
        'ELEVATORS_MODE',
        'LIVINGAPARTMENTS_MODE',
        'LIVINGAREA_MODE',
        'APARTMENTS_MEDI',
        'ELEVATORS_MEDI',
        'LIVINGAPARTMENTS_MEDI',
        'LIVINGAREA_MEDI',
        'TOTALAREA_MODE',
        'BASEMENTAREA_MODE',
        'BASEMENTAREA_MEDI',
        'YEARS_BEGINEXPLUATATION_MODE',
        'YEARS_BEGINEXPLUATATION_MEDI',
        'YEARS_BUILD_MODE',
        'YEARS_BUILD_MEDI',
        'COMMONAREA_MODE',
        'COMMONAREA_MEDI',
        'ENTRANCES_MODE',
        'ENTRANCES_MEDI',
        'FLOORSMIN_AVG',
        'FLOORSMAX_MODE',
        'FLOORSMIN_MODE',
        'FLOORSMAX_MEDI',
        'FLOORSMIN_MEDI',
        'LANDAREA_MODE',
        'LANDAREA_MEDI',
        'NONLIVINGAPARTMENTS_MODE',
        'NONLIVINGAPARTMENTS_MEDI',
        'NONLIVINGAREA_MODE',
        'NONLIVINGAREA_MEDI',
        'OBS_60_CNT_SOCIAL_CIRCLE',
        'DEF_60_CNT_SOCIAL_CIRCLE',
    ]
)

In [None]:
# For every column of the filtered correlation matrix, list the row labels whose
# coefficient survived the filter (non-NaN), leaving out the column itself.
# Unlike a partner-only mapping, columns with no partner keep an empty list.
correlated_variables_dict = {
    col: [
        other
        for other in highly_correlated_matrix[col].dropna().index
        if other != col
    ]
    for col in highly_correlated_matrix.columns
}

# Print one line per column
for variable, correlated_vars in correlated_variables_dict.items():
    print(f"{variable}: {correlated_vars}")


In [None]:

# Repeat the correlation analysis on the reduced frame `data`.
numeric_data = data.select_dtypes(include='number')

# Pairwise correlation between every pair of numeric columns
correlation_matrix = numeric_data.corr()

# Absolute correlation above which two features count as highly correlated
threshold = 0.7  # Adjust as needed

# Exclude each feature's correlation with itself by position rather than by
# testing `abs() < 1.0`: the value test also hides distinct features that are
# perfectly correlated, and it depends on the diagonal being exactly 1.0.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
highly_correlated = (correlation_matrix.abs() > threshold) & off_diagonal

# Keep the coefficient where the mask is True; everything else becomes NaN
highly_correlated_matrix = correlation_matrix[highly_correlated]

# Set up the matplotlib figure
plt.figure(figsize=(12, 10))

# Draw the heatmap (NaN cells are left blank)
sns.heatmap(highly_correlated_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)

# Add title
plt.title('Highly Correlated Features')

# Show plot
plt.show()

In [None]:
# Column count of `data` (second entry of its shape).
num_columns = data.shape[1]
print("Number of columns:", num_columns)

In [None]:
# Plain Python list of the column labels still present in `data`.
column_list = list(data.columns)
print(column_list)


In [None]:
# Columns whose distinct-value count is exactly one
distinct_counts = data.nunique()
columns_with_one_value = data.columns[distinct_counts == 1]

# Heading and result on separate lines
print("Columns with only one unique value:", columns_with_one_value, sep="\n")

In [None]:
# Remove the columns judged irrelevant for the analysis.
data = data.drop(
    columns=[
        'NAME_TYPE_SUITE',
        'WEEKDAY_APPR_PROCESS_START',
        'DAYS_REGISTRATION',
        'DAYS_ID_PUBLISH',
        'HOUR_APPR_PROCESS_START',
        'SK_ID_CURR',
        'OWN_CAR_AGE',
        'FLAG_WORK_PHONE',
        'FLAG_PHONE',
        'OCCUPATION_TYPE',
        'REGION_RATING_CLIENT_W_CITY',
        'REG_REGION_NOT_LIVE_REGION',
        'REG_CITY_NOT_LIVE_CITY',
        'REG_CITY_NOT_WORK_CITY',
        'ORGANIZATION_TYPE',
        'APARTMENTS_AVG',
        'BASEMENTAREA_AVG',
        'YEARS_BEGINEXPLUATATION_AVG',
        'YEARS_BUILD_AVG',
        'COMMONAREA_AVG',
        'ENTRANCES_AVG',
        'FLOORSMAX_AVG',
        'LANDAREA_AVG',
        'NONLIVINGAPARTMENTS_AVG',
        'NONLIVINGAREA_AVG',
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE',
        'DAYS_LAST_PHONE_CHANGE',
    ]
)

In [None]:
# Column count of `data` (second entry of its shape).
num_columns = data.shape[1]
print("Number of columns:", num_columns)

In [None]:
# Add the six credit-bureau request counters into one total, left to right.
# Plain `+` is used, so a missing value in any counter makes that row's total NaN.
bureau_request_columns = [
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
]

running_total = data[bureau_request_columns[0]]
for counter_name in bureau_request_columns[1:]:
    running_total = running_total + data[counter_name]
data['TOTAL_REQ_CREDIT_BUREAU'] = running_total

# The individual counters are dropped once aggregated
data.drop(columns=bureau_request_columns, inplace=True)

In [None]:
# Row-wise average of the three external scores, stored as a new column;
# the source columns are removed afterwards.
ext_source_columns = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
data['EXT_SOURCE_MEAN'] = data[ext_source_columns].mean(axis=1)
data.drop(columns=ext_source_columns, inplace=True)

- `FLAG_MOBIL`: Did the client provide a mobile phone? (1 = yes, 0 = no)
- `FLAG_CONT_MOBILE`: Was the mobile phone reachable? (1 = yes, 0 = no)
- `FLAG_EMAIL`: Did the client provide an email address? (1 = yes, 0 = no)

In [None]:

# REACHABLE is the bitwise AND of the three contact flags cast to int,
# i.e. 1 only when all three flags are set; the flags are dropped afterwards.
contact_flags = ['FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL']
all_flags_set = data['FLAG_MOBIL'] & data['FLAG_CONT_MOBILE'] & data['FLAG_EMAIL']
data['REACHABLE'] = all_flags_set.astype(int)
data.drop(columns=contact_flags, inplace=True)

In [None]:
# Sum the two 30-day social-circle counters into a single column
social_circle_columns = ['OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE']
data['TOTAL_SOCIAL_OBSERVATIONS'] = data['OBS_30_CNT_SOCIAL_CIRCLE'].add(
    data['DEF_30_CNT_SOCIAL_CIRCLE']
)

# Remove the two source columns
data.drop(columns=social_circle_columns, inplace=True)


In [None]:
# Column count of `data` (second entry of its shape).
num_columns = data.shape[1]
print("Number of columns:", num_columns)

In [None]:
# Summary statistics of the reduced frame (pandas describes numeric columns by default).
data.describe()

In [None]:
# Bare last expression: shows the notebook's rich preview of the reduced frame.
data

## DATA PREPARATION


In [None]:
# Per-column count of missing entries in `data`
missing_values = data.isnull().sum()
print(missing_values)

In [None]:
# Impute missing values in numerical columns: median for the family-size count,
# mean for the averaged external score.
# The result is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`: that chained in-place form emits a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['CNT_FAM_MEMBERS'] = data['CNT_FAM_MEMBERS'].fillna(data['CNT_FAM_MEMBERS'].median())
data['EXT_SOURCE_MEAN'] = data['EXT_SOURCE_MEAN'].fillna(data['EXT_SOURCE_MEAN'].mean())

In [None]:
# Treat missing aggregated counts as zero.
# The result is assigned back to the column instead of calling
# `data[col].fillna(0, inplace=True)`: that chained in-place form emits a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['TOTAL_SOCIAL_OBSERVATIONS'] = data['TOTAL_SOCIAL_OBSERVATIONS'].fillna(0)
data['TOTAL_REQ_CREDIT_BUREAU'] = data['TOTAL_REQ_CREDIT_BUREAU'].fillna(0)


In [None]:
# Per-column count of missing entries in `data`
missing_values = data.isnull().sum()
print(missing_values)

In [None]:

# Flag rows that contain at least one IQR outlier in a numeric feature.

# Labels of the numeric columns
numerical_columns = data.select_dtypes(include=['number']).columns

# Quartiles and inter-quartile range of each numeric column
Q1 = data[numerical_columns].quantile(0.25)
Q3 = data[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

# Multiple of the IQR beyond the quartiles at which a value counts as an outlier
threshold = 1.5

# Columns that take part in the test:
#  - a zero IQR (Q1 == Q3) collapses both fences onto one value, so every row
#    holding any other value would be flagged; this is what happens for binary
#    or near-constant columns, and such columns are skipped;
#  - 'TARGET' is the label being analysed and must not decide which rows are
#    removed.
outlier_columns = [
    col for col in numerical_columns
    if col != 'TARGET' and IQR[col] > 0
]

lower_fence = Q1[outlier_columns] - threshold * IQR[outlier_columns]
upper_fence = Q3[outlier_columns] + threshold * IQR[outlier_columns]

# A row is an outlier when any tested column falls outside its fences
outliers = (
    (data[outlier_columns] < lower_fence) | (data[outlier_columns] > upper_fence)
).any(axis=1)

# Count the flagged rows
num_outliers = outliers.sum()
print("Number of outliers:", num_outliers)


In [None]:
# Numeric part of `data` (a DataFrame, not just the labels)
numerical_columns = data.select_dtypes(include='number')

# One box per numeric column on a single 12x8 inch figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=numerical_columns, ax=ax)
ax.set_title('Box Plot of Numerical Columns')
ax.set_xlabel('Numerical Columns')
ax.set_ylabel('Values')
plt.xticks(rotation=45)  # Tilt the x-axis labels so they stay readable
plt.show()

In [None]:
# Remove outliers: keep only the rows whose outlier flag is False
cleaned_data = data.loc[~outliers]

In [None]:
# Compare the frame sizes before and after removing the flagged rows
print("Shape of original data:", data.shape)
print("Shape of data without outliers:", cleaned_data.shape)

# Numeric part of the cleaned frame
numerical_columns1 = cleaned_data.select_dtypes(include='number')

# One box per numeric column on a single 12x8 inch figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=numerical_columns1, ax=ax)
ax.set_title('Box Plot of Numerical Columns')
ax.set_xlabel('Numerical Columns')
ax.set_ylabel('Values')
plt.xticks(rotation=45)  # Tilt the x-axis labels so they stay readable
plt.show()