In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively echo the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

In [None]:
# Load the raw training split.
# low_memory=False: parse the file in one pass instead of pandas' chunked mode,
# which can infer mixed dtypes within a column.
train_csv_path = "/kaggle/input/credit-score-classification/train.csv"
train = pd.read_csv(train_csv_path, low_memory=False)
print("train", train.shape)

In [None]:
# Work on an explicit copy of the raw frame. Wrapping an existing DataFrame in
# pd.DataFrame(...) does not copy its data by default, so `df` would not be
# guaranteed to be independent of `train`; .copy() makes that independence explicit.
df = train.copy()
df.head()

In [None]:
# Print a concise summary of df: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics; with default arguments describe() summarises the numeric columns.
df.describe()

# Realizing Why Data Is Missing
It is common to find missing values in data sets both large and small. This issue can lead to significant reduction in the number of usable observations for the analysis. The reduction in the sample size not only reduces statistical power, it can also introduce a bias when the data are not missing at random. 

In [None]:
# Share of missing values per column, in percent, largest first.
missing_counts = df.isna().sum()
missing_pct = missing_counts.sort_values(ascending=False) * 100 / len(df)
missing_pct

#### We can see that, in total, roughly 60% of the data is missing across the whole data set. Although some missing values are shared between columns, the number of missing values is still huge; moreover, the data set contains invalid values too.

This is the point at which we get into the part of data science that I like to call "data intuition", by which I mean "really looking at your data and trying to figure out why it is the way it is and how that will affect your analysis". It can be a frustrating part of data science, especially if you're newer to the field and don't have a lot of experience. For dealing with missing values, you'll need to use your intuition to figure out why the value is missing. One of the most important questions you can ask yourself to help figure this out is this:

* **Is the value missing because it wasn't recorded or doesn't exist?**

By looking at the metadata of our dataset, it seems the missing values are mainly caused by the data collection process, considering that we have many invalid records in addition to the missing values. Thus we can conclude that, in an ideal data set with these features, the values would exist: they are missing because they were not recorded, not because they do not exist for the record.

But let's take a closer look to find out whether the missing values are related or not.


**Detecting missing values visually using Missingno library:**

Missingno is a simple Python library that presents a series of visualizations to recognize the behavior and distribution of missing data inside a pandas data frame. It can be in the form of a barplot, matrix plot, heatmap, or a dendrogram.

In [None]:
import missingno as msno

In [None]:
# Bar chart giving a per-column overview of how complete each feature is.
msno.bar(df)

The bar chart above gives a quick graphical overview of every feature. We can see that Monthly_Inhand_Salary has the most missing values. This could be caused by the recording process: people may not want to disclose their salaries, or may not agree to their salary information being collected. We should dig deeper into the data collection process and ask whether the data was collected by a governmental organization or a credit company.
However, the missing values in **Name** and **Type_of_Loan** are not reasonable.



## Matrix 
The msno.matrix() is a nullity matrix that will help to visualize the location of the null observations.

In [None]:
# Nullity matrix on a 250-row random sample. The fixed seed keeps the sampled
# rows (and therefore the plot) identical on every Restart & Run All.
msno.matrix(df.sample(250, random_state=42))

In [None]:
# Sort by salary so that rows with a missing Monthly_Inhand_Salary are grouped
# together (sort_values places NaN last by default), which makes co-occurring
# gaps in other columns easier to spot in the nullity matrix.
# The result is deliberately NOT named `sorted`: that would shadow Python's
# builtin sorted() for the rest of the kernel session.
train_by_salary = train.sort_values('Monthly_Inhand_Salary')
msno.matrix(train_by_salary)

## HeatMap
The missingno correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another.

In [None]:
# Heatmap of the nullity correlation between pairs of columns.
msno.heatmap(df)

## Dendrogram
The dendrogram allows you to more fully correlate variable completion, revealing trends deeper than the pairwise ones visible in the correlation heatmap:

In [None]:
# Dendrogram that clusters the columns by where their data is missing.
msno.dendrogram(df)

We can create a dendrogram showing the clusterings of where data is missing. Leaves that are at the same level predict one another’s presence (empty or filled). The vertical arms are used to indicate how different clusters are.


Cluster leaves which split close to zero, but not at it, predict one another very well, but still imperfectly. If your own interpretation of the dataset is that these columns actually do, or ought to, match each other in nullity, then the height of the cluster leaf tells you, in absolute terms, how often the records are "mismatched" or incorrectly filed—that is, how many values you would have to fill in or drop, if you are so inclined.

Thus, based on the levels, we can see that the missing values in **Monthly_Balance** and **Outstanding_Debt** are related to each other. However, considering the vertical arm between them, this effect is not strong.

**Considering the charts, there is no direct relationship between the missing values of different columns, as we expected. Thus, we can move forward and replace the missing values.**

# Filling Missing Values
First we need to drop some of the features and keep the most important ones.

In [None]:
# Features that are left out of the rest of the analysis.
columns_to_drop = [
    'ID', 'Month', 'Name', 'SSN', 'Interest_Rate', 'Type_of_Loan',
    'Changed_Credit_Limit', 'Credit_Mix', 'Credit_Utilization_Ratio',
    'Amount_invested_monthly', 'Payment_of_Min_Amount',
    'Total_EMI_per_month', 'Payment_Behaviour',
]
df = df.drop(columns=columns_to_drop)
df.head()

In [None]:
# Percentage of missing values in each remaining column, sorted in descending order.
n_rows = df.shape[0]
null_counts = df.isnull().sum().sort_values(ascending=False)
null_counts * 100 / n_rows

The Monthly_Inhand_Salary, Credit_History_Age, Num_of_Delayed_payment, Num_Credit_Inquiry and Monthly_Balance variables have missing values; in particular, the **Monthly_Inhand_Salary** and **Credit_History_Age** variables together have **24% missing values in total**.

In [None]:
# Kernel density estimate of the in-hand salary, used to judge the shape of its distribution.
salary = df['Monthly_Inhand_Salary']
sns.displot(data=salary, kind='kde', color='teal')
plt.show()

**First we need to convert Credit_History_Age to years (from string to float); then we plot it to figure out whether we could handle its missing values with a simple statistic such as the mean.**

In [None]:
# Credit_History_Age is stored as text (presumably values such as
# "22 Years and 1 Months" — confirm against the data); keep only the leading
# whole number of years, so the months part is discarded.
# A regex is used instead of a fixed two-character slice so that one-digit
# values do not carry a trailing space and longer numbers are not truncated.
# errors='coerce' turns anything unparseable into NaN instead of raising.
# The dtype guard makes the cell safe to re-run: once the column is numeric
# the .str accessor is no longer available and the conversion is skipped.
credit_age = df['Credit_History_Age']
if not pd.api.types.is_numeric_dtype(credit_age):
    leading_years = credit_age.str.extract(r'^\s*(\d+)', expand=False)
    df['Credit_History_Age'] = pd.to_numeric(leading_years, errors='coerce')

sns.displot(data=df['Credit_History_Age'], color='teal', kind='kde')
plt.show()

The data distribution is almost symmetric, so we are going to fill the missing values with the mean rather than the median or the mode, and compare the result with the MICE imputation method, which can be better than filling the missing values with the median or the mean.

In [None]:
# Create a function that we can re-use
def show_distribution(var_data):
    """Print summary statistics for a numeric Series and plot its distribution.

    The minimum, mean, median, mode and maximum are printed; then a histogram
    (with those five statistics marked as dashed vertical lines) is drawn above
    a horizontal boxplot of the same values.

    Relies on ``plt`` (matplotlib.pyplot) imported in the notebook's import cell.

    Parameters
    ----------
    var_data : pandas.Series
        Numeric values to summarise. Missing values are dropped before any
        statistic is computed or anything is plotted.

    Raises
    ------
    ValueError
        If ``var_data`` contains no non-missing values.
    """
    # Drop NaNs up front: the pandas statistics below skip them anyway, but
    # matplotlib's boxplot does not, and would draw an empty box.
    values = var_data.dropna()
    if values.empty:
        # Without this guard mode()[0] fails with an obscure lookup error.
        raise ValueError('show_distribution needs at least one non-missing value')

    # Get statistics
    min_val = values.min()
    max_val = values.max()
    mean_val = values.mean()
    med_val = values.median()
    mod_val = values.mode()[0]  # mode() can return several values; the first one is used

    print('Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val,
                                                                                            mean_val,
                                                                                            med_val,
                                                                                            mod_val,
                                                                                            max_val))

    # Create a figure for 2 subplots (2 rows, 1 column)
    fig, ax = plt.subplots(2, 1, figsize = (10,4))

    # Plot the histogram
    ax[0].hist(values)
    ax[0].set_ylabel('Frequency')

    # Add lines for the minimum, mean, median, mode and maximum
    ax[0].axvline(x=min_val, color = 'gray', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=mean_val, color = 'cyan', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=med_val, color = 'red', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=mod_val, color = 'yellow', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=max_val, color = 'gray', linestyle='dashed', linewidth = 2)

    # Plot the boxplot
    ax[1].boxplot(values, vert=False)
    ax[1].set_xlabel('Value')

    # Add a title to the Figure
    fig.suptitle('Data Distribution')

    # Show the figure (plt.show() rather than fig.show(), which only warns
    # under the notebook's non-GUI inline backend)
    plt.show()

# Get the variable to examine: the credit history age (in years) converted above.
# Missing values are removed so that the boxplot can be drawn.
col = df['Credit_History_Age'].dropna()
# Call the function
show_distribution(col)

In [None]:
# Distribution of Monthly_Inhand_Salary, the other feature with many missing values.
# Missing values are removed so that the boxplot can be drawn.
label = df['Monthly_Inhand_Salary'].dropna()


# Create a figure for 2 subplots (2 rows, 1 column)
fig, ax = plt.subplots(2, 1, figsize = (9,12))

# Plot the histogram
ax[0].hist(label, bins=100)
ax[0].set_ylabel('Frequency')

# Add lines for the mean (magenta) and the median (cyan)
ax[0].axvline(label.mean(), color='magenta', linestyle='dashed', linewidth=2)
ax[0].axvline(label.median(), color='cyan', linestyle='dashed', linewidth=2)

# Plot the boxplot
ax[1].boxplot(label, vert=False)
ax[1].set_xlabel('Monthly_Inhand_Salary')

# Add a title to the Figure
fig.suptitle('Monthly_Inhand_Salary Distribution')

# Show the figure (plt.show() rather than fig.show(), which only warns
# under the notebook's non-GUI inline backend)
plt.show()
