The goal of this model is to predict the billing amount in September.

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Lift pandas' column-display limit so wide frames are shown without "..." truncation.
pd.set_option('display.max_columns', None)

In [75]:
# Read the credit-card dataset from a path relative to the notebook's working directory.
# NOTE(review): the folder is spelled "DefualtData" here; it must match the name on disk.
df = pd.read_csv(filepath_or_buffer="DefualtData/UCI_Credit_card.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [76]:
# Rename PAY_0 -> PAY_1 so the repayment-status columns use the same 1..6 numbering
# as the BILL_AMT* and PAY_AMT* columns. (rename silently skips labels that are
# absent, so this line is safe to re-run.)
df = df.rename(columns={"PAY_0": "PAY_1"})

# Columns removed before modelling:
#   - default.payment.next.month: the default label, which is not what this model predicts.
#   - ID: a row identifier with no predictive meaning.
#   - PAY_1, PAY_AMT1: September values; they would not be available when
#     predicting the September bill, so keeping them would leak information.
# A list (rather than a set) keeps the drop order deterministic, and
# errors="ignore" makes the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of a KeyError.
df = df.drop(
    columns=["default.payment.next.month", "ID", "PAY_1", "PAY_AMT1"],
    errors="ignore",
)
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000.0,2,2,1,24,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2,2,2,26,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,1000.0,1000.0,1000.0,0.0,2000.0
2,90000.0,2,2,2,34,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2,2,1,37,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2019.0,1200.0,1100.0,1069.0,1000.0
4,50000.0,1,2,1,57,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,36681.0,10000.0,9000.0,689.0,679.0


In [None]:
# Target: BILL_AMT1 (the September bill). Features: every remaining column.
y = df["BILL_AMT1"]
X = df.drop(columns=["BILL_AMT1"])

In [78]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [79]:
# Baseline: ordinary least-squares regression fitted on the training split.
# fit() returns the estimator itself, so the call can be chained onto the constructor.
model = LinearRegression().fit(X_train, y_train)
model

In [80]:
# Score the baseline model on the held-out test split.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units
r2 = r2_score(y_test, y_pred)

# One metric per line, rounded to two decimals.
print(
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)


MAE: 8991.22
RMSE: 22409.48
R² Score: 0.91


In [81]:
# Pair each feature name with its fitted coefficient from the baseline model.
# NOTE(review): the features are not scaled before fitting, so coefficient sizes
# reflect each column's units and are not directly comparable as "importance".
coeffs = pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_})

# Display with the largest (most positive) coefficients first.
coeffs.sort_values("Coefficient", ascending=False)

Unnamed: 0,Feature,Coefficient
5,PAY_2,4050.668869
2,EDUCATION,986.670992
3,MARRIAGE,45.47797
10,BILL_AMT2,0.873851
11,BILL_AMT3,0.108152
17,PAY_AMT4,0.082222
16,PAY_AMT3,0.041176
18,PAY_AMT5,0.035041
19,PAY_AMT6,0.032744
12,BILL_AMT4,0.01311


In [82]:
# Fit Ridge (L2 penalty) and Lasso (L1 penalty) variants of the linear model to
# check whether regularisation improves generalisation over the plain baseline.
# Ridge and Lasso are imported in the notebook's top import cell.
# NOTE(review): the features are fed in unscaled, so a single alpha penalises
# columns with very different units unevenly; consider standardising first and
# tuning alpha before drawing conclusions from this comparison.

# Ridge regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_preds = ridge_model.predict(X_test)

# Lasso regression
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)
lasso_preds = lasso_model.predict(X_test)


In [83]:
# Report MAE / RMSE / R² for both regularised models on the test split.
# RMSE is computed as sqrt(MSE), matching the baseline evaluation cell; the
# `squared=False` argument of mean_squared_error is deprecated in recent
# scikit-learn releases and no longer accepted by the newest ones.
for name, preds in (("Ridge", ridge_preds), ("Lasso", lasso_preds)):
    print(f"{name}:")
    print("MAE:", mean_absolute_error(y_test, preds))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
    print("R² Score:", r2_score(y_test, preds))

Ridge:
MAE: 8991.137347036532
RMSE: 22409.482459509065
R² Score: 0.9096580126271642
Lasso:
MAE: 8990.569125985467
RMSE: 22409.54161101735
R² Score: 0.9096575356976432


