In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,accuracy_score,log_loss,r2_score
from sklearn.model_selection import train_test_split,KFold , GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer ,make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
from ISLP import load_data
Credit = load_data('Credit')

In [3]:
Credit

Unnamed: 0,ID,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331
...,...,...,...,...,...,...,...,...,...,...,...,...
395,396,12.096,4100,307,3,32,13,Male,No,Yes,Caucasian,560
396,397,13.364,3838,296,5,65,17,Male,No,No,African American,480
397,398,57.872,4171,321,5,67,12,Female,No,Yes,Caucasian,138
398,399,37.728,2525,192,1,44,13,Male,No,Yes,Caucasian,0


In [4]:
Credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   ID         400 non-null    int64   
 1   Income     400 non-null    float64 
 2   Limit      400 non-null    int64   
 3   Rating     400 non-null    int64   
 4   Cards      400 non-null    int64   
 5   Age        400 non-null    int64   
 6   Education  400 non-null    int64   
 7   Gender     400 non-null    category
 8   Student    400 non-null    category
 9   Married    400 non-null    category
 10  Ethnicity  400 non-null    category
 11  Balance    400 non-null    int64   
dtypes: category(4), float64(1), int64(7)
memory usage: 27.2 KB


In [5]:
X = Credit.drop(['Balance','ID'],axis=1)
y = Credit['Balance']

In [6]:
X_train , X_test , y_train , y_test = train_test_split(X,y,random_state=24,test_size=0.3)
ohe = OneHotEncoder(handle_unknown = 'ignore')
ct = make_column_transformer((ohe,make_column_selector(dtype_include='category')) ,
                                   ('passthrough',make_column_selector(dtype_exclude='category')),
                                   verbose_feature_names_out=False)

Pipeline for elasticNet

In [7]:
el = ElasticNet()
pipe_el = Pipeline([('CT',ct),('EL',el)])

Pipeline for knn

In [8]:
knn = KNeighborsRegressor()
pipe_knn = Pipeline([('CT',ct),('KNN',knn)])

Pipeline for Decision Tree Regressor

In [9]:
dtr = DecisionTreeRegressor(random_state=24)
pipe_dtr = Pipeline([('CT',ct),('DTR',dtr)])

Ensemble technique Voting Regressor that combines different regression models and give one final prediction.
Yeh technique helpful hai kyunki yeh alag-alag models ki strengths ko combine karti hai, taaki overall prediction aur bhi accurate ho.

In [10]:
voting = VotingRegressor([('DTR',pipe_dtr),('KNN',pipe_knn),('El',pipe_el)])
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(r2_score(y_test,y_pred))

0.9146784621610978


In [11]:
pipe_el.fit(X_train, y_train)
y_pred = pipe_el.predict(X_test)
print(r2_score(y_test, y_pred))

0.9057360053223552


In [12]:
pipe_knn.fit(X_train, y_train)
y_pred = pipe_knn.predict(X_test)
print(r2_score(y_test, y_pred))

0.7899310404076199


In [13]:
pipe_dtr.fit(X_train, y_train)
y_pred = pipe_dtr.predict(X_test)
print(r2_score(y_test, y_pred))

0.8928486052197747


In [14]:
voting = VotingRegressor([('DTR',pipe_dtr),('KNN',pipe_knn),('El',pipe_el)],weights=[5,1,5])
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(r2_score(y_test,y_pred))

0.930761170667517


In [15]:
ohe = OneHotEncoder(handle_unknown='ignore')
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude='category')),
                             (ohe, make_column_selector(dtype_include='category')),
                             verbose_feature_names_out=False)
dtr = DecisionTreeRegressor(random_state=24)
knn = KNeighborsRegressor()
el = ElasticNet()
voting = VotingRegressor([('DT', dtr), ('KNN', knn), ('EL', el)],
                        weights=[6, 3, 6])
pipe_vt = Pipeline([('CT', ct), ('VOT', voting)])
pipe_vt.fit(X_train, y_train)
y_pred = pipe_vt.predict(X_test)
print(f"{(r2_score(y_test, y_pred) * 100):.2f}%")

92.54%


In [16]:
pipe_vt.get_params()

{'memory': None,
 'steps': [('CT',
   ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000001A2D502C260>),
                                   ('onehotencoder',
                                    OneHotEncoder(handle_unknown='ignore'),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000001A2D502D5E0>)],
                     verbose_feature_names_out=False)),
  ('VOT',
   VotingRegressor(estimators=[('DT', DecisionTreeRegressor(random_state=24)),
                               ('KNN', KNeighborsRegressor()),
                               ('EL', ElasticNet())],
                   weights=[6, 3, 6]))],
 'verbose': False,
 'CT': ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001A2D502C260>),
 

In [17]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
params = {
    'VOT__KNN__n_neighbors': [2, 5, 7],
    'VOT__EL__alpha': np.linspace(0.001, 3,3 ),
    'VOT__DT__max_depth':[None,2,3,4,5],
    'VOT__DT__min_samples_leaf':[1,5,10],
    'VOT__DT__min_samples_split':[2,4,6,7,8],
    'VOT__weights':[[1,2,3],[3,4,2],[2,5,6]]
}
gcv = GridSearchCV(pipe_vt,param_grid=params,cv=kfold,scoring='r2',verbose=3)
gcv.fit(X,y)


Fitting 5 folds for each of 2025 candidates, totalling 10125 fits
[CV 1/5] END VOT__DT__max_depth=None, VOT__DT__min_samples_leaf=1, VOT__DT__min_samples_split=2, VOT__EL__alpha=0.001, VOT__KNN__n_neighbors=2, VOT__weights=[1, 2, 3];, score=0.910 total time=   0.0s
[CV 2/5] END VOT__DT__max_depth=None, VOT__DT__min_samples_leaf=1, VOT__DT__min_samples_split=2, VOT__EL__alpha=0.001, VOT__KNN__n_neighbors=2, VOT__weights=[1, 2, 3];, score=0.941 total time=   0.0s
[CV 3/5] END VOT__DT__max_depth=None, VOT__DT__min_samples_leaf=1, VOT__DT__min_samples_split=2, VOT__EL__alpha=0.001, VOT__KNN__n_neighbors=2, VOT__weights=[1, 2, 3];, score=0.943 total time=   0.0s
[CV 4/5] END VOT__DT__max_depth=None, VOT__DT__min_samples_leaf=1, VOT__DT__min_samples_split=2, VOT__EL__alpha=0.001, VOT__KNN__n_neighbors=2, VOT__weights=[1, 2, 3];, score=0.944 total time=   0.0s
[CV 5/5] END VOT__DT__max_depth=None, VOT__DT__min_samples_leaf=1, VOT__DT__min_samples_split=2, VOT__EL__alpha=0.001, VOT__KNN__n_nei

In [18]:
gcv.best_score_

0.9359564484171168

In [19]:
gcv.best_params_

{'VOT__DT__max_depth': None,
 'VOT__DT__min_samples_leaf': 1,
 'VOT__DT__min_samples_split': 8,
 'VOT__EL__alpha': 0.001,
 'VOT__KNN__n_neighbors': 5,
 'VOT__weights': [1, 2, 3]}