# Regularization = putting a speed limit on your model.

It stops the model from getting too clever and memorizing the data instead of actually learning patterns.

#### Types (Just enough to sound smart in interviews)
L1 Regularization (Lasso) - Kicks useless features out -> This feature adds no value. Fired.

L2 Regularization (Ridge) - Shrinks feature impact -> You can stay, but lower your voice.

Elastic Net - L1 + L2 combo -> Balanced governance

Regularization prevents overfitting by penalizing overly complex models, helping them generalize better to unseen data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso,LassoCV,Ridge,ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
df=pd.read_csv('banking_dataset.csv')
df.shape

(1000, 31)

In [3]:
df.head()

Unnamed: 0,customer_age,account_balance,annual_income,loan_amount,loan_duration_months,num_of_credit_cards,credit_card_utilization,monthly_salary,num_of_loans,num_of_defaults,...,insurance_amount,spending_score,digital_transactions_ratio,mobile_banking_usage_hours,branch_visits_per_month,internet_banking_logins,loan_interest_rate,customer_satisfaction_score,risk_factor,bank_profit
0,56,17871.07,1131740.96,409401.23,51,3,0.84,54136.15,2,2,...,128309.65,98.05,0.29,3.63,8,6,6.44,6.68,0.75,494405.22
1,69,54069.27,600090.95,188667.47,31,1,0.51,63632.0,3,2,...,62273.87,0.33,0.89,6.29,6,21,10.82,2.43,0.11,268619.57
2,46,34872.99,756171.18,206035.43,13,3,0.53,63740.55,0,0,...,91142.9,12.02,0.75,5.35,9,5,11.98,3.57,0.79,356441.45
3,32,21554.93,933147.71,318767.03,11,2,0.56,59817.85,1,2,...,115780.27,54.69,0.93,10.51,0,5,6.37,2.89,0.03,373216.34
4,60,37068.54,345408.57,219029.55,29,0,0.17,49422.27,2,1,...,85623.14,95.59,0.69,9.11,3,13,8.6,5.86,0.44,328103.09


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   customer_age                 1000 non-null   int64  
 1   account_balance              1000 non-null   float64
 2   annual_income                1000 non-null   float64
 3   loan_amount                  1000 non-null   float64
 4   loan_duration_months         1000 non-null   int64  
 5   num_of_credit_cards          1000 non-null   int64  
 6   credit_card_utilization      1000 non-null   float64
 7   monthly_salary               1000 non-null   float64
 8   num_of_loans                 1000 non-null   int64  
 9   num_of_defaults              1000 non-null   int64  
 10  num_of_transactions          1000 non-null   int64  
 11  avg_transaction_value        1000 non-null   float64
 12  loan_to_income_ratio         1000 non-null   float64
 13  credit_score       

In [5]:
df.isnull().sum()

customer_age                   0
account_balance                0
annual_income                  0
loan_amount                    0
loan_duration_months           0
num_of_credit_cards            0
credit_card_utilization        0
monthly_salary                 0
num_of_loans                   0
num_of_defaults                0
num_of_transactions            0
avg_transaction_value          0
loan_to_income_ratio           0
credit_score                   0
investment_value               0
savings_account_balance        0
fixed_deposit_amount           0
num_of_dependents              0
customer_tenure_years          0
emi_amount                     0
house_value                    0
insurance_amount               0
spending_score                 0
digital_transactions_ratio     0
mobile_banking_usage_hours     0
branch_visits_per_month        0
internet_banking_logins        0
loan_interest_rate             0
customer_satisfaction_score    0
risk_factor                    0
bank_profi

In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
import seaborn as sns

In [9]:
### outlier capping
def outlier_capping(df,columns):
    Q1= df[columns].quantile(0.25)
    Q3= df[columns].quantile(0.75)
    IQR= Q3-Q1
    lower_extreme= Q1-1.5*IQR
    upper_extreme= Q3+1.5*IQR
    df[columns]= df[columns].apply(lambda x: lower_extreme if x<lower_extreme else upper_extreme if x>upper_extreme else x)
for col in df.select_dtypes(include=['int','float']).columns:
    outlier_capping(df,col)

In [10]:
target = df[['bank_profit']]
features = df.drop(columns=['bank_profit'])
f_reg = f_regression(features,target)
pd.Series(f_reg[0],index=features.columns).sort_values(ascending=False)

  y = column_or_1d(y, warn=True)


annual_income                  1934.703648
loan_amount                      63.606061
num_of_defaults                   4.830485
fixed_deposit_amount              4.419400
investment_value                  4.403184
num_of_transactions               3.239586
account_balance                   3.142897
avg_transaction_value             2.505307
savings_account_balance           1.888850
insurance_amount                  1.167567
num_of_dependents                 1.107272
house_value                       1.073202
customer_tenure_years             1.001254
loan_duration_months              0.803979
branch_visits_per_month           0.692197
monthly_salary                    0.517024
credit_score                      0.454777
loan_interest_rate                0.364788
risk_factor                       0.351680
credit_card_utilization           0.348139
emi_amount                        0.262314
loan_to_income_ratio              0.159228
internet_banking_logins           0.158835
num_of_loan

In [11]:
x_train,x_test,y_train,y_test= train_test_split(features,target,train_size=0.8,random_state=100)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(800, 30)
(200, 30)
(800, 1)
(200, 1)


In [12]:
# Lasso
lasso= Lasso(alpha=1,max_iter=1000)
lasso.fit(x_train,y_train)
y_pred= lasso.predict(x_test)
r2_score(y_test,y_pred)

0.6960569801424268

In [13]:
lasso.coef_

array([-2.91748899e+01, -1.79898746e-01,  3.01060188e-01,  2.56326020e-01,
       -8.28809063e+01, -8.62921214e+02,  3.17805508e+03,  1.37168936e-02,
        2.19846736e+03, -6.79326488e+03, -2.44600773e+01, -3.88356096e+00,
       -5.23384139e+03,  1.68649818e+01,  1.56335468e-01,  1.58327419e-02,
       -3.79265801e-02, -5.15211170e+02, -1.94775759e+02,  9.65283724e-03,
       -1.03916885e-03, -1.50245968e-02,  1.12992657e+01, -2.97962228e+03,
        9.92837152e+01,  8.90569907e+02,  2.16151039e+02,  4.29973781e+02,
       -9.42517483e+02, -2.76211192e+03])

In [14]:
lasso.intercept_

array([23011.24429373])

In [16]:
features.columns

Index(['customer_age', 'account_balance', 'annual_income', 'loan_amount',
       'loan_duration_months', 'num_of_credit_cards',
       'credit_card_utilization', 'monthly_salary', 'num_of_loans',
       'num_of_defaults', 'num_of_transactions', 'avg_transaction_value',
       'loan_to_income_ratio', 'credit_score', 'investment_value',
       'savings_account_balance', 'fixed_deposit_amount', 'num_of_dependents',
       'customer_tenure_years', 'emi_amount', 'house_value',
       'insurance_amount', 'spending_score', 'digital_transactions_ratio',
       'mobile_banking_usage_hours', 'branch_visits_per_month',
       'internet_banking_logins', 'loan_interest_rate',
       'customer_satisfaction_score', 'risk_factor'],
      dtype='object')

In [17]:
# Ridge
ridge = Ridge(alpha=1,max_iter=1000)
ridge.fit(x_train,y_train)
y_pred = ridge.predict(x_test)
r2_score(y_test,y_pred)

0.6960334629574838

In [18]:
from sklearn.linear_model import RidgeCV,ElasticNetCV

In [19]:
ridgecv= RidgeCV(alphas=(0.1, 1.0, 10.0,2,4,5),cv=5)
ridgecv.fit(x_train,y_train)
y_pred= ridgecv.predict(x_test)
r2_score(y_test,y_pred)

0.6957768464019064

In [20]:
ridgecv.alpha_

np.float64(10.0)

In [21]:
ridge.coef_

array([-2.93151577e+01, -1.79836393e-01,  3.01056756e-01,  2.56319321e-01,
       -8.28302849e+01, -8.63933829e+02,  3.13803509e+03,  1.37134401e-02,
        2.19815844e+03, -6.78078283e+03, -2.44528176e+01, -3.87939380e+00,
       -5.16168704e+03,  1.68848392e+01,  1.56372809e-01,  1.57741574e-02,
       -3.79339458e-02, -5.14538307e+02, -1.94699378e+02,  1.02302248e-02,
       -1.03980603e-03, -1.50431058e-02,  1.12967090e+01, -2.94319121e+03,
        9.91740729e+01,  8.90623103e+02,  2.16173326e+02,  4.29415789e+02,
       -9.42137298e+02, -2.72875084e+03])

In [22]:
# Elastic net
elastic= ElasticNet(alpha=1,max_iter=1000,l1_ratio=0.5)
elastic.fit(x_train,y_train)
y_pred= elastic.predict(x_test)
r2_score(y_test,y_pred)

0.6944369067973155

In [23]:
elas_cv= ElasticNetCV(n_alphas=100,max_iter=1000,cv=5,l1_ratio=[0.1,0.2,0.3,0.4,0.5,0.6,0.7])
elas_cv.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


In [24]:
best_alpha = elas_cv.alpha_

In [25]:
elas_cv.l1_ratio

[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

In [26]:
elastic= ElasticNet(alpha=best_alpha,max_iter=1000,l1_ratio=elas_cv.l1_ratio_)
elastic.fit(x_train,y_train)
y_pred= elastic.predict(x_test)
r2_score(y_test,y_pred)

0.693839824909905