In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm, inv
from numpy import transpose
import matplotlib.pyplot as plt

In [14]:
ins_ds = pd.read_csv('../../Datasets/Others/Medical-cost/insurance.csv')

In [15]:
ins_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [16]:
ins_ds.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Encoding categories

In [17]:
ins_ds["smoker"] = ins_ds["smoker"].apply(lambda x: 0 if x == "no" else 1)
ins_ds["sex"] = ins_ds["sex"].apply(lambda x: 0 if x == "female" else 1)

In [18]:
ins_ds["smoker"].value_counts()

0    1064
1     274
Name: smoker, dtype: int64

In [19]:
ins_ds["sex"].value_counts()

1    676
0    662
Name: sex, dtype: int64

In [20]:
dummies = pd.get_dummies(ins_ds["region"])
ins_df_enc = pd.concat([ins_ds, dummies], axis=1)

In [21]:
ins_df_enc.drop(labels=["region"], axis=1, inplace=True)

In [22]:
ins_df_enc.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northeast,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [24]:
ohc = OneHotEncoder(sparse=False)

In [25]:
ins_ds["region"].values[:, np.newaxis]

array([['southwest'],
       ['southeast'],
       ['southeast'],
       ...,
       ['southeast'],
       ['southwest'],
       ['northwest']], dtype=object)

In [26]:
a = ohc.fit_transform(ins_ds["region"].values[:, np.newaxis])
a

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

### Train test split

In [27]:
target = ["charges"]
predictors = list(set(ins_df_enc.columns) - set(target))

In [28]:
X_train, X_test, y_train, y_test = train_test_split(ins_df_enc[predictors], ins_df_enc[target], test_size=0.2, random_state=42)

In [41]:
X_train.values.shape

(1070, 9)

In [40]:
y_train.values.shape

(1070, 1)

### Scaling

In [29]:
sc = StandardScaler()

In [30]:
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

### Modelling

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
regressor = LinearRegression()

In [33]:
regressor.fit(X=X_train_scaled, y=y_train)

LinearRegression()

In [34]:
regressor.intercept_

array([13346.08973636])

In [35]:
regressor.coef_

array([[-1.50167101e+02,  5.16890247e+02,  3.87349490e+01,
        -9.29310107e+00,  2.03622812e+03,  3.61497541e+03,
         1.99698058e+02,  9.55848141e+03, -8.66211474e+01]])

In [36]:
y_pred = regressor.predict(X_test_scaled)

### Evaluate

In [207]:
from sklearn.metrics import r2_score, mean_squared_error

In [117]:
mean_squared_error(y_true=y_test, y_pred=y_pred)

33596915.85136148

In [118]:
r2_score(y_true=y_test, y_pred=y_pred)

0.7835929767120722

### Gradient descent

In [184]:
scl = StandardScaler()
y_train_scaled = scl.fit_transform(y_train)
y_test_scaled = scl.transform(y_test)

In [4]:
import numpy as np

In [5]:
X = np.random.rand(1600, 5)
y = np.random.rand(1600, 1)

batch_size = 32
n = X.shape[0]
d = X.shape[1]

random_offset = 0 if n % batch_size == 0 else np.random.randint(low=0, high=(n%batch_size))
last = (n // batch_size) * batch_size

X_trunc = X[random_offset: random_offset+last]
y_trunc = y[random_offset: random_offset+last]

X_minibatch = X_trunc.reshape(((n // batch_size), batch_size, d))
y_minibatch = y_trunc.reshape(((n // batch_size), batch_size, 1))

In [1]:
import random
class MiniBatchGenerator:
    def __init__(self, X, y, batch_size=32):
        self.X = X
        self.y = y
        self.n = X.shape[0]
        self.d = X.shape[1]
        self.batch_size = batch_size
        self.batches = []
        
    def generate_batches(self):
        print("Generating minibatch.")
        X = self.X
        y = self.y
        n = self.n
        d = self.d
        batch_size = self.batch_size

        random_offset = np.random.randint(low=0, high=(n % batch_size))
        last = (n // batch_size) * batch_size
        
        X_trunc = X[random_offset: random_offset+last]
        y_trunc = y[random_offset: random_offset+last]
        
        X_minibatch = X_trunc.reshape(((n // batch_size), batch_size, d))
        y_minibatch = y_trunc.reshape(((n // batch_size), batch_size))
        
        minibatches = list(zip(X_minibatch, y_minibatch))
        random.shuffle(minibatches)
        print(X_minibatch.shape)
        self.batches = minibatches

    def __next__(self):
        if not self.batches:
            self.generate_batches()
            
        return self.batches.pop(0)

In [2]:
def calculate_weights_and_biases(X, y, eps=0.01, lr=0.0001):
    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)
    
    assert X.ndim == 2
    assert y.ndim == 1

    n, d = X.shape
    
    w_curr = np.random.rand(X.shape[-1]) * 1
    w0_curr = np.random.randint(1)
    
    del_w = np.ones(shape=w_curr.shape)
    del_w0 = 1
    
    i = 0

    while any(del_w) > 0 and i < 100:
        f = y - (X @ w_curr + w0_curr * np.ones(shape=(X.shape[0],)))
        w_next = w_curr - lr * (-2 * X.T @ f) * del_w
        w0_next = w0_curr - lr * (-2 * np.ones(shape=(X.shape[0])) @ f) * del_w0
        
        del_w = np.where(np.abs(w_next - w_curr) < eps, 0, 1)
        del_w0 = np.where(np.abs(w0_next - w0_curr) < eps, 0, 1)
        
        w_curr = w_next
        w0_curr = w0_next
    
        i += 1
    
    return w_curr, w0_curr

In [219]:
def calculate_weights_and_biases_mb(X, y, eps=0.01, lr=0.01):
    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)
    
    assert X.ndim == 2
    assert y.ndim == 1

    n, d = X.shape

    w_curr = np.random.rand(X.shape[-1]) * 1
    w0_curr = np.random.randint(1)
    
    del_w = np.ones(shape=w_curr.shape)
    del_w0 = 1
    
    i = 0
    mbg = MiniBatchGenerator(X, y, batch_size=32)
    while any(del_w) > 0 and i < 100:
        X_mb, y_mb = next(mbg)

        f = y_mb - (X_mb @ w_curr + w0_curr * np.ones(shape=(X_mb.shape[0],)))
        w_next = w_curr - lr * (-2 * X_mb.T @ f) * del_w
        w0_next = w0_curr - lr * (-2 * np.ones(shape=(X_mb.shape[0])) @ f) * del_w0
        
        del_w = np.where(np.abs(w_next - w_curr) < eps, 0, 1)
        del_w0 = np.where(np.abs(w0_next - w0_curr) < eps, 0, 1)
        
        w_curr = w_next
        w0_curr = w0_next
    
        i += 1
    
    return w_curr, w0_curr

In [220]:
w_curr, w0_curr = calculate_weights_and_biases_mb(X_train_scaled, y_train.values.reshape(-1))

(33, 32, 9)
(33, 32, 9)
(33, 32, 9)
(33, 32, 9)


In [221]:
w_curr, w0_curr

(array([-346.24368355, 1792.76388561,  344.20747595,  753.35725012,
        1338.14340171, 3996.60518487, -188.4718358 , 9890.75975759,
         192.04348308]),
 13534.038975172101)

In [222]:
y_pred_gd = (X_test_scaled @ w_curr) + w0_curr

In [223]:
mean_squared_error(y_true=y_test, y_pred=y_pred_gd)

36030570.834810175

In [224]:
r2_score(y_true=y_test, y_pred=y_pred_gd)

0.7679171321491962

<hr style="height:2px;width:600px">

In [3]:
car_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [6]:
car_ds["owner"].value_counts()

First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: owner, dtype: int64

In [35]:
car_ds["seller_type"].value_counts()

Individual          3244
Dealer               994
Trustmark Dealer     102
Name: seller_type, dtype: int64

In [8]:
car_ds["transmission"].value_counts()

Manual       3892
Automatic     448
Name: transmission, dtype: int64

In [32]:
car_ds["fuel"].value_counts()

Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: fuel, dtype: int64

In [9]:
car_ds["name"].value_counts()

Maruti Swift Dzire VDI                     69
Maruti Alto 800 LXI                        59
Maruti Alto LXi                            47
Maruti Alto LX                             35
Hyundai EON Era Plus                       35
                                           ..
Hyundai Verna Transform CRDi VGT SX ABS     1
Maruti S-Presso VXI Plus                    1
Toyota Etios Liva 1.2 VX                    1
Toyota Yaris G                              1
Hyundai i20 Magna 1.4 CRDi                  1
Name: name, Length: 1491, dtype: int64

In [14]:
car_ds["company"] = car_ds["name"].apply(lambda x: x.split(" ")[0])

In [37]:
car_ds["company"].value_counts()

Maruti           1280
Hyundai           821
Mahindra          365
Tata              361
Honda             252
Ford              238
Toyota            206
Chevrolet         188
Renault           146
Volkswagen        107
Skoda              68
Nissan             64
Audi               60
BMW                39
Fiat               37
Datsun             37
Mercedes-Benz      35
Jaguar              6
Mitsubishi          6
Land                5
Volvo               4
Ambassador          4
Jeep                3
MG                  2
OpelCorsa           2
Daewoo              1
Force               1
Isuzu               1
Kia                 1
Name: company, dtype: int64

In [15]:
car_ds.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [20]:
q1 = car_ds.describe().loc["25%"]
q3 = car_ds.describe().loc["75%"]
iqr = q3 - q1

In [21]:
iqr

year                  5.00
selling_price    391250.25
km_driven         55000.00
dtype: float64

In [28]:
def get_outlers(df: pd.DataFrame, column_name: str):
    lower_bound = (q1 - 10 * iqr)[column_name]
    upper_bound = (q3 + 10 * iqr)[column_name]
    return df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)]

Consider: Category-wise outlier removal

In [31]:
from category_encoders import TargetEncoder