### Dataset download link:
https://drive.google.com/file/d/1rIfaxVKhxCNmDaHpgr0484Gb7mIgA-nc/view?usp=sharing


In [None]:
# Mount Google Drive into the Colab filesystem so files under it can be read by path.
from google.colab import drive

drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point)

### Import Libraries



In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

### Read Dataset

In [None]:
# Load the loans CSV from the mounted drive; the client_id column becomes the row index.
loans_csv_path = '/content/drive/MyDrive/loans.csv'
df = pd.read_csv(loans_csv_path, index_col='client_id')
df.head()

Unnamed: 0_level_0,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
46109,home,13672,0,10243,2002-04-16,2003-12-20,2.15
46109,credit,9794,0,10984,2003-10-21,2005-07-17,1.25
46109,home,12734,1,10990,2006-02-01,2007-07-05,0.68
46109,cash,12518,1,10596,2010-12-08,2013-05-05,1.24
46109,credit,14049,1,11415,2010-07-07,2012-05-21,3.13


### Information Of The Dataset

In [None]:
# A notebook cell only renders its LAST bare expression, so every summary that
# should be visible is passed to display() explicitly (display is provided by
# the IPython kernel without an import).
display(df.shape)        # (rows, columns)
display(df.columns)      # column labels
display(df.dtypes)       # dtype per column
df.info()                # prints its own report and returns None
display(df.describe())   # summary statistics
display(df.nunique())    # distinct values per column
df.isnull().sum()        # missing values per column (rendered as the cell result)

#### Remove Extra features

In [None]:
# Discard the loan_id column; the result replaces df.
df = df.drop(columns=['loan_id'])
df.head()

Unnamed: 0_level_0,loan_type,loan_amount,repaid,loan_start,loan_end,rate
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46109,home,13672,0,2002-04-16,2003-12-20,2.15
46109,credit,9794,0,2003-10-21,2005-07-17,1.25
46109,home,12734,1,2006-02-01,2007-07-05,0.68
46109,cash,12518,1,2010-12-08,2013-05-05,1.24
46109,credit,14049,1,2010-07-07,2012-05-21,3.13


### DataTypes

In [None]:
# Cast the two label-like columns to pandas' categorical dtype.
for categorical_col in ('repaid', 'loan_type'):
    df[categorical_col] = df[categorical_col].astype('category')

# Parse the loan start/end columns into datetime values.
for date_col in ('loan_start', 'loan_end'):
    df[date_col] = pd.to_datetime(df[date_col])

# Show the resulting dtype of every column.
print(df.dtypes)

In [None]:
# Count missing values per column after the dtype conversions (isna is the same check as isnull).
df.isna().sum()

### Find Outliers

In [None]:
# Draw one box plot per numeric column of interest, each shown as its own figure.
for plotted_col in ('loan_amount', 'rate'):
    df[plotted_col].plot.box()
    plt.show()

In [None]:
from scipy import stats

# Attach a z-score column for each inspected column (target column name -> source column).
zscore_sources = {
    'zscore_rate': 'rate',
    'zscore_loan_amount': 'loan_amount',
}
for zscore_col, source_col in zscore_sources.items():
    df[zscore_col] = stats.zscore(df[source_col])
df.head()

In [None]:
# A row is an outlier when either z-score lies strictly beyond 3 in magnitude.
rate_is_extreme = df['zscore_rate'].abs() > 3
amount_is_extreme = df['zscore_loan_amount'].abs() > 3
outliers = df[rate_is_extreme | amount_is_extreme]
outliers

### Drop Outliers

In [None]:
# Keep exactly the rows that are NOT outliers, i.e. both z-scores within 3 in
# magnitude. Using <= makes this the complement of the "|z| > 3" outlier rule,
# so a row sitting exactly on the threshold is no longer dropped unreported.
within_limits = (df['zscore_rate'].abs() <= 3) & (df['zscore_loan_amount'].abs() <= 3)

# The z-score columns were only helpers for outlier detection. They must not
# stay in the frame: zscore_loan_amount is an affine copy of loan_amount, and
# the regression section builds its feature matrix by dropping only
# 'loan_amount', so leaving it in would leak the target into the features.
df = df[within_limits].drop(columns=['zscore_rate', 'zscore_loan_amount'])
df.shape

### Encoding

#### Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the row index with a fresh 0..n-1 index so the encoded Series below
# lines up row-for-row when concatenated.
df = df.reset_index(drop=True)

# Encode loan_type as integer codes.
label_encoder = LabelEncoder()
loan_type_codes = label_encoder.fit_transform(df['loan_type'])
encoded_series = pd.Series(loan_type_codes, name='loan_type_encoded')

# df_label holds the encoded variant; df itself still carries loan_type.
features_without_type = df.drop(columns='loan_type')
df_label = pd.concat([features_without_type, encoded_series], axis=1)
df_label

In [None]:
# df_loans is another name for the same frame as df, so the new column is
# visible through both names.
df_loans = df
df_loans['loan_tenure'] = df_loans['loan_end'].sub(df_loans['loan_start'])
df_loans

### Derived Features

In [None]:
import datetime as dt

# Convert the tenure from a timedelta into years (whole days divided by 365).
# Note: `.dt` below is the pandas datetime accessor, not the `dt` module alias.
tenure_in_days = df_loans['loan_tenure'].dt.days
df_loans['loan_tenure'] = tenure_in_days / 365

# The raw start/end dates are dropped once the tenure has been derived.
df_loans = df_loans.drop(columns=['loan_start', 'loan_end'])
df_loans

In [None]:
# Continue under the name df with the frame that now holds loan_tenure.
df = df_loans
df.head(5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode loan_type as integer codes and swap the encoded column in for the original.
label_encoder = LabelEncoder()
df = df.reset_index(drop=True)  # fresh 0..n-1 index so the concat below aligns by row
encoded_series = pd.Series(
    label_encoder.fit_transform(df['loan_type']),
    name='loan_type_encoded',
)
df_label = pd.concat(
    [df.drop(columns='loan_type'), encoded_series],
    axis=1,
)

# From here on df is the encoded frame.
df = df_label
df.head()

### Scaling

#### Standard

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every numeric-dtype column in place; columns of other dtypes are untouched.
scaler = StandardScaler()
numeric_columns = df.select_dtypes(include=['number']).columns
standardized_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = standardized_values
df.head()

### Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every numeric-dtype column in place with a min-max scaler
# (default settings), overwriting the previous values of those columns.
scaler = MinMaxScaler()
numeric_columns = df.select_dtypes(include=['number']).columns
rescaled_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = rescaled_values
df.head()

### Skew

In [None]:
# Candidate transforms of `rate` for reducing skew.
# `rate` has been min-max scaled by this point, so its minimum is exactly 0.
# A plain np.log would map that row to -inf and make the skewness of Log_Rate
# NaN; log1p (log(1 + x)) is finite at 0 and keeps the comparison meaningful.
df['Log_Rate'] = np.log1p(df['rate'])
df['Sqrt_Rate'] = np.sqrt(df['rate'])
df.head()

In [None]:
from scipy.stats import skew

# Report the skewness of the original rate column and of each transformed variant.
for skew_label, skew_column in (
    ('Rate', 'rate'),
    ('Log_Rate', 'Log_Rate'),
    ('Sqrt_Rate', 'Sqrt_Rate'),
):
    print(f"{skew_label} Skewness : {skew(df[skew_column])}")

In [None]:
# Keep only the square-root variant: remove the original and log columns,
# then give Sqrt_Rate the name 'rate'.
df = (
    df
    .drop(columns=['rate', 'Log_Rate'])
    .rename(columns={'Sqrt_Rate': 'rate'})
)
df.head()

### Regressions

In [None]:
from sklearn.model_selection import train_test_split

# Target is loan_amount; every other column of df is used as a feature.
y = df['loan_amount']
X = df.drop(columns='loan_amount')

# 80/20 split with a fixed random_state so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
tuple(split_part.shape for split_part in (X_train, X_test, y_train, y_test))

((352, 4), (88, 4), (352,), (88,))

#### Single Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Single-feature model: predict the target from the 'rate' column only.
single_feature = ['rate']
X_train_single = X_train[single_feature]
X_test_single = X_test[single_feature]

# fit() returns the estimator itself, so construction and fitting can be chained.
slr_model = LinearRegression().fit(X_train_single, y_train)
y_pred_single = slr_model.predict(X_test_single)

# Evaluate on the held-out split.
mse_single = mean_squared_error(y_test, y_pred_single)
r2_single = r2_score(y_test, y_pred_single)

print(f"MSE: {mse_single}", f"R2: {r2_single}", sep="\n")

#### Multiple Linear Regression

In [None]:
# Multi-feature model: fit on every column of X_train.
# fit() returns the estimator itself, so construction and fitting can be chained.
mlr_model = LinearRegression().fit(X_train, y_train)
y_pred_multi = mlr_model.predict(X_test)

# Evaluate on the held-out split.
mse_multi = mean_squared_error(y_test, y_pred_multi)
r2_multi = r2_score(y_test, y_pred_multi)

print(f"MSE: {mse_multi}", f"R2: {r2_multi}", sep="\n")

#### Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Single source of truth for the polynomial degree: it configures the feature
# expansion AND appears in the report line, so the two cannot drift apart.
poly_degree = 2

# Pipeline: polynomial feature expansion followed by ordinary linear regression.
poly_features = PolynomialFeatures(degree=poly_degree)
poly_model = make_pipeline(poly_features, LinearRegression())

poly_model.fit(X_train, y_train)

y_pred_poly = poly_model.predict(X_test)

# Evaluate on the held-out split.
mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print(f"Polynomial Regression (Degree {poly_degree}) - MSE: {mse_poly}, R2: {r2_poly}")