<a href="https://colab.research.google.com/github/KDcodePy/First-model/blob/main/first_model_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# First Model (Practice)

## Imports

In [42]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer

## Load Data

In [2]:
# Path to the insurance CSV (presumably uploaded to the Colab runtime).
filename = "/content/insurance (1).csv"
# Parse the CSV into the DataFrame that the later cells operate on.
df = pd.read_csv(filepath_or_buffer=filename)

## Data Exploration

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summarise the frame: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [17]:
# Sub-frame holding only the categorical (string-valued) columns.
categorical_names = ["sex", "smoker", "region"]
cat_col = df[categorical_names]

In [22]:
# Check for inconsistent category labels: list each categorical column's
# distinct values together with how often they occur.
for col in cat_col.columns:
    counts = cat_col[col].value_counts()
    # The two extra newlines separate one column's report from the next.
    print(f"checking: {col} \n {counts}", end="\n\n\n")

checking sex 
 male      676
female    662
Name: sex, dtype: int64


checking smoker 
 no     1064
yes     274
Name: smoker, dtype: int64


checking region 
 southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64




- Here we can see that our data labeling is pretty consistent

In [None]:
# Check for duplicate rows

In [24]:
# Count the rows that repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

1

- Here we can see that we have 1 duplicated row, which we can drop.

In [25]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# `df` is rebound to the de-duplicated result rather than mutated with
# `inplace=True`, so the frame object loaded earlier is left untouched and
# re-running this cell is harmless.
df = df.drop_duplicates()

In [34]:
# Sanity check: with the duplicate removed, this count should now be zero.
df.duplicated(keep="first").sum()

0

- Our data now looks good and is ready for preprocessing.

## Preprocessing

### Model validation

In [31]:
# Separate the target column (`charges`) from the feature columns.
y = df["charges"]
target = y
X = df.drop(columns=["charges"])

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [33]:
# Preview the feature matrix to confirm the target column is gone.
X.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


- nominal columns are: `smoker`, `sex`, `region`
- numerical columns are: `age`, `bmi`, `children`
- ordinal columns: `None`

### Instantiate and fit model

In [44]:
# Create selectors that pick columns by dtype.
num_selector = make_column_selector(dtype_include="number")
cat_selector = make_column_selector(dtype_include="object")

In [41]:
# Transformers: one-hot encoding for the categorical columns and
# standardisation for the numeric ones.
ohe = OneHotEncoder(
    sparse_output=False,      # return a dense array
    handle_unknown="ignore",  # categories unseen during fit do not raise
)
scaler = StandardScaler()

In [46]:
# Pair each transformer with the selector for the columns it should handle.
cat_tuple = (ohe, cat_selector)
num_tuple = (scaler, num_selector)

# Combine both pairs into a single column-wise preprocessor.
preprocessor = make_column_transformer(
    cat_tuple,
    num_tuple,
)
# Bare expression so the notebook renders the transformer.
preprocessor

In [47]:
# Chain the preprocessor and a linear regression into a single estimator.
reg = LinearRegression()
lin_reg_pipe = make_pipeline(
    preprocessor,
    reg,
)

In [48]:
# Fit the preprocessing steps and the regression on the training split only.
lin_reg_pipe.fit(X_train, y=y_train)

### Measuring model performance

In [49]:
# Predict on both splits so train and test metrics can be compared.
train_pred, test_pred = (
    lin_reg_pipe.predict(features) for features in (X_train, X_test)
)

In [57]:
# Mean absolute error for train vs. test.
train_MAE = mean_absolute_error(y_true=y_train, y_pred=train_pred)
test_MAE = mean_absolute_error(y_true=y_test, y_pred=test_pred)
# Train-to-test gap expressed as a percentage of the train error.
mae_gap_pct = (train_MAE - test_MAE) / train_MAE * 100
print(f"train_MAE score: {round(train_MAE, 2)}")
print(f"test_MAE score: {round(test_MAE, 2)}")
print(f"Model performance delta {round(mae_gap_pct, 2)}%")


train_MAE score: 4215.73
test_MAE score: 4076.1
Model performance delta 3.31%


In [68]:
# R-squared for train vs. test; each score is computed once and reused.
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)
# Train-to-test gap expressed as a percentage of the train score.
r2_gap_pct = (train_r2 - test_r2) / train_r2 * 100
print(f"train r-squared score: {round(train_r2, 3)}")
print(f"test r-squared score: {round(test_r2, 3)}")
print(f"Model performance delta {round(r2_gap_pct, 2)}%")

train r-squared score: 0.73
test r-squared score: 0.795
Model performance delta -9.01%


In [74]:
# RMSE for train vs. test.
# The square root is taken explicitly instead of passing `squared=False` to
# mean_squared_error: that keyword is deprecated in recent scikit-learn
# releases and rejected by the newest ones, while this form works on all of them.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
# Train-to-test gap expressed as a percentage of the train error.
rmse_gap_pct = (train_rmse - test_rmse) / train_rmse * 100
print(f"train RMSE score: {round(train_rmse,2)}")
print(f"test RMSE score: {round(test_rmse,2)}")
print(f"Model perfomance delta: {round(rmse_gap_pct,2)}%")

train RMSE score: 6098.63
test RMSE score: 5947.78
Model perfomance delta: 2.47%
