In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("bmi.csv")

In [3]:
df.head(10)

Unnamed: 0,Age,Height,Weight,Bmi,BmiClass
0,61,1.85,109.3,31.93572,Obese Class 1
1,60,1.71,79.02,27.0237,Overweight
2,60,1.55,74.7,31.092612,Obese Class 1
3,60,1.46,35.9,16.841809,Underweight
4,60,1.58,97.1,38.89601,Obese Class 2
5,59,1.71,79.32,27.126295,Overweight
6,59,1.7,73.32,25.370242,Overweight
7,59,1.72,85.32,28.839913,Overweight
8,59,1.46,36.0,16.888722,Underweight
9,59,1.83,104.7,31.263997,Obese Class 1


In [4]:
df.isnull().sum()

Age         0
Height      0
Weight      0
Bmi         0
BmiClass    0
dtype: int64

In [5]:
maps = {
    "Normal Weight" : 1,
    "Overweight" : 2,
    "Underweight" : 3,
    "Obese Class 3" : 4,
    "Obese Class 2" : 5,
    "Obese Class 1" : 6,
    
}

In [6]:
df["BmiClass"] = df["BmiClass"].map(maps)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741 entries, 0 to 740
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       741 non-null    int64  
 1   Height    741 non-null    float64
 2   Weight    741 non-null    float64
 3   Bmi       741 non-null    float64
 4   BmiClass  741 non-null    int64  
dtypes: float64(3), int64(2)
memory usage: 29.1 KB


In [8]:
df = df.drop("BmiClass", axis = 1)

In [41]:
class Linear(BaseEstimator, ClassifierMixin):
    def __init__(self, n_iteration, alpha):
        self.n_iteration = n_iteration
        self.alpha = alpha
        self.cost_list = []
        
    def fit(self, X, y):
        X = np.c_[X, np.ones(X.shape[0])]
        self.theta = np.zeros(X.shape[1])
        m = X.shape[0]
        
        for i in range(self.n_iteration):
            h_x = np.dot(X, self.theta)

            cost = (1/m)* np.sum((y - h_x)**2)          

            d_cost = -(2/m)*(np.dot(X.T, (y - h_x)))

            self.cost_list.append(cost)
            self.theta -= self.alpha * d_cost
            
    
                
        return self
    
    def predict(self, X):
        
        X = np.c_[X, np.ones(X.shape[0])]
        
        h_x = np.dot(X, self.theta)
        

        return h_x 
    
    def score_metrics(self, X, y):
        
        y_pred = self.predict(X)
        
        print(f"R-squared = {1 - ((np.sum((y - y_pred)**2))/ (np.sum((y - np.mean(y))**2)))}")
        print(f"MSE = {(1/len(y))* np.sum((y-y_pred)**2)}")
        
    
    def score(self, X, y):
        
        y_pred = self.predict(X)
        
        return (1 - ((np.sum((y - y_pred)**2))/ (np.sum((y - np.mean(y))**2))))
        
        
        
        
        
    

    def get_theta(self):
        return self.theta
    
    
    
        
        

In [42]:
from sklearn.compose import TransformedTargetRegressor

pipeline = Pipeline([
    ("Standardiser", StandardScaler()),
    ("model", TransformedTargetRegressor(
        regressor=Linear(1000, 0.01),
        transformer=StandardScaler()))

])


In [43]:
X = df.drop("Bmi", axis = 1)
y = df["Bmi"]


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

In [44]:
pipeline.fit(X_train, y_train)

In [45]:
pipeline.score(X_test, y_test)

0.9814169499771737

In [46]:
pipeline.predict(pd.DataFrame([[24, 1.75, 61]]))



array([19.62014358])

In [14]:
param_grid = {
    "n_iteration" : [500, 1000, 1500, 2000, 2500, 3000],
    "alpha": [0.1, 0.01, 0.001, 0.0001]
}

In [15]:
def grid(pipeline, param_grid, X_train, X_test, y_train, y_test):
    
    best_params = None
    best_score = -np.inf
    result = []
    
    for iteration in param_grid["n_iteration"]:
        for alpha in param_grid["alpha"]:
            
            
            
            pipeline.set_params(
                model__alpha = alpha,
                model__n_iteration = iteration
            )
            
            pipeline.fit(X_train, y_train)
            
            score= pipeline.score(X_test, y_test)
            result.append((iteration, alpha, score))
            
        
            
            if score >= best_score:
                best_score = score
                best_params ={
                    "n_iteration" : iteration,
                    "alpha": alpha
                }
                
    return result, best_params
    

In [16]:
grid(pipeline, param_grid, X_train, X_test, y_train, y_test)

([(500, 0.1, 0.9814173776670401),
  (500, 0.01, 0.9812291033902132),
  (500, 0.001, -0.5102427880313702),
  (500, 0.0001, -7.768251888426418),
  (1000, 0.1, 0.9814173776670401),
  (1000, 0.01, 0.9814169499126957),
  (1000, 0.001, 0.7256511351017272),
  (1000, 0.0001, -6.163177632384089),
  (1500, 0.1, 0.9814173776670401),
  (1500, 0.01, 0.9814173702411045),
  (1500, 0.001, 0.9196415012581416),
  (1500, 0.0001, -4.8613137219399825),
  (2000, 0.1, 0.9814173776670401),
  (2000, 0.01, 0.9814173775122719),
  (2000, 0.001, 0.9602249326031341),
  (2000, 0.0001, -3.80387530190253),
  (2500, 0.1, 0.9814173776670401),
  (2500, 0.01, 0.9814173776638007),
  (2500, 0.001, 0.9725524697069873),
  (2500, 0.0001, -2.9438072824837613),
  (3000, 0.1, 0.9814173776670401),
  (3000, 0.01, 0.9814173776669721),
  (3000, 0.001, 0.9774203774054191),
  (3000, 0.0001, -2.2433626446650043)],
 {'n_iteration': 3000, 'alpha': 0.1})

In [17]:
pipeline.named_steps["model"].predict(pd.DataFrame([[24, 1.75, 61]]))

array([260.32632253])

In [18]:
y

0      31.935720
1      27.023700
2      31.092612
3      16.841809
4      38.896010
         ...    
736    27.662157
737    29.302925
738    26.687598
739    27.868945
740    29.504148
Name: Bmi, Length: 741, dtype: float64

In [19]:
y_pred = pipeline.named_steps["model"].predict(X_test)

In [20]:
y_pred

array([353.15595283, 337.15641389, 546.34358944, 174.93690977,
       292.27134079, 575.7388221 , 342.77502423, 544.5319393 ,
       487.17434914, 314.55692611, 261.28940342, 320.65183278,
       446.78496125, 195.07767206, 308.47708802, 249.0518419 ,
       358.96418613, 344.26433302, 384.6585586 , 344.43005431,
       300.49786632, 873.85683134, 537.37184574, 318.61968208,
       302.18744688, 293.07984593, 334.69867529, 315.07533086,
       285.86636726, 242.37496682, 177.6102244 , 335.20510231,
       307.0987721 , 410.16250743, 463.80992262, 325.80303798,
       217.21741727, 328.59341531, 136.34959558, 178.5369442 ,
       255.00423584, 316.13383503, 368.49654548, 314.65136522,
       484.67388582, 310.79801087, 191.02827652, 346.75750005,
       245.81689306, 360.73590909, 336.5707726 , 585.85426738,
       182.62573144, 574.55247525, 269.47773213, 342.92275096,
       326.2645847 , 295.01442081, 317.47400716, 534.99648909,
       138.89461375, 457.49925415, 289.73257813, 600.58

In [37]:
pipeline.score(X_test, y_test)

0.9814169499771737

In [47]:
pipeline.score_metrics(X_test, y_test)

AttributeError: 'Pipeline' object has no attribute 'score_metrics'

In [39]:
df[df["Age"] == 24]

Unnamed: 0,Age,Height,Weight,Bmi
371,24,1.77,71.82,22.924447
372,24,1.76,69.32,22.378616
373,24,1.78,74.32,23.456634
374,24,1.759,69.32,22.404068
375,24,1.781,74.32,23.430301
376,24,1.62,125.0,47.629934
377,24,1.76,145.0,46.810434
378,24,1.92,210.0,56.966146
379,24,1.74,58.0,19.157088
380,24,1.75,70.0,22.857143
