# BIAS AND VARIANCE

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv(r"Medical_Insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       2772 non-null   int64  
 1   sex       2772 non-null   object 
 2   bmi       2772 non-null   float64
 3   children  2772 non-null   int64  
 4   smoker    2772 non-null   object 
 5   region    2772 non-null   object 
 6   charges   2772 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 151.7+ KB


In [4]:
data.shape

(2772, 7)

In [5]:
from sklearn.preprocessing import LabelEncoder
Le = LabelEncoder()
data['sex'] = Le.fit_transform(data['sex'])
data['smoker'] = Le.fit_transform(data['smoker'])
data['region'] = Le.fit_transform(data['region'])
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [6]:
#Splitting dataset into training and testing dataset

x = data.drop(['charges'],axis = 1)
y = data['charges']

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)
X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.values
Y_test = Y_test.values

In [7]:
#Build the model 

from sklearn.linear_model import LinearRegression
model = LinearRegression()

#Estimation of bias and variance using bias_variance_decomp

from mlxtend.evaluate import bias_variance_decomp
mse, bias, var = bias_variance_decomp(model, X_train, Y_train, X_test, Y_test, loss='mse', num_rounds=200,random_seed=123)
Y_pred = model.predict(X_test)


In [8]:
# summarize results

from sklearn.metrics import mean_squared_error
print("MSE from bias_variance lib [avg expected loss]:",round(mse,2))
print("Avg Bias:",round(bias,2))
print("Avg Variance:",round(var,2))
print("Mean Square error by Sckit-learn lib:",round(mean_squared_error(Y_test,Y_pred),2))

MSE from bias_variance lib [avg expected loss]: 40025687.62
Avg Bias: 39905879.99
Avg Variance: 119807.63
Mean Square error by Sckit-learn lib: 39822607.59


# REMOVING DUPLICATES

In [9]:
data.shape

(2772, 7)

In [10]:
#Removing Duplicates
print("Number of Duplicated rows:",data.duplicated().sum())
data = data.drop_duplicates()
print("Number of Duplicated rows:",data.duplicated().sum())

Number of Duplicated rows: 1435
Number of Duplicated rows: 0


In [11]:
data.shape

(1337, 7)

# CROSS VALIDATION

In [12]:
from sklearn.model_selection import cross_val_score,KFold

In [13]:
#Define the number of folds for cross-validation

num_folds = 5
kf = KFold(n_splits = num_folds, shuffle=True, random_state=42)

In [14]:
#Perform k-fold cross-validation

cross_val_results = cross_val_score(model,x,y,cv = kf)

In [15]:
#Evaluation metrics

print("Cross Validation Results(Accuracy):")
print(cross_val_results)
print("Mean Accuracy:",cross_val_results.mean())

Cross Validation Results(Accuracy):
[0.73988643 0.7632166  0.77997779 0.72921529 0.73670236]
Mean Accuracy: 0.7497996911947876
