In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/insurance/insurance.csv


In [3]:
#Read the data file
data=pd.read_csv('/kaggle/input/insurance/insurance.csv')

# Exploratory Data Analysis

In [4]:
#Top 5 rows of the dataframe
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
#Are there any null values?
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
#Knowing data types is important because some models do not accept certain data types

data.dtypes
##We have 3 object data types (string or text)

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [7]:
#What unique values do the object type columns have?
data.sex.unique(), data.sex.nunique()

(array(['female', 'male'], dtype=object), 2)

In [8]:
data.smoker.unique(), data.smoker.nunique()

(array(['yes', 'no'], dtype=object), 2)

In [9]:
data.region.unique(), data.region.nunique()

(array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object), 4)

### We have String Data types which are not accepted by the models we choose. 

So we need to do some feature engineering.

We need will also build two models, a simple regression model, and a more complex one.

In [10]:
from sklearn.preprocessing import LabelEncoder
#Label Encoder encodes every unique text value in a column into a unique integer
#Exmaple: If animal column contains 'dog', 'cat', 'rat', Label Encoder transforms them
######## 1,2 and 3. 1: 'dog', 2: 'cat', 3:'rat'

In [11]:
sex_enc=LabelEncoder()
smoke_enc=LabelEncoder()
reg_enc=LabelEncoder()

In [12]:
data['sex']=sex_enc.fit_transform(data['sex'])
data['smoker']=smoke_enc.fit_transform(data['smoker'])
data['region']=reg_enc.fit_transform(data['region'])

In [13]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


## Split the data into a train set and test set. Train your model using the train set, and test its performance on the test set.

It is important that a model not see some of the data when it is being trained. The model will perform well on the data it already has seen. Its actual performance is evaluated on the unseen data (test data).

In [14]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(data[data.columns[:-2]], data[data.columns[-1]], test_size=0.33, random_state=42)

                                                                                 
#Test split tells what fraction of the total data should be kept for test. 0.33 means 33% of the data is held for testing, and 67% is left of training.
                                                                                 
#Fix a random state, since the train and test split are made on a random basis. 
#To replicate the results, we need to have the exact same data split.

## BASELINE: This is the simplest model we build to solve the problem. This does not always guarantee the best accuracy, but it runs very quick, hence it is low complexity

In [18]:
# Import models
from sklearn.linear_model import LinearRegression


In [20]:
#Creating an instance of Regression model to use in our code
reg_model=LinearRegression()

In [21]:
reg_model.fit(X_train, y_train)

In [22]:
reg_model.score(X_train, y_train)
#(data[data.columns[:-2]], data[data.columns[-1]])

0.7447818782365105

In [23]:
reg_model.coef_,reg_model.intercept_


(array([  262.35104943,   146.5321422 ,   331.64087047,   377.66377381,
        23693.21588352]),
 -12504.25802569739)

## Let us see how the model works on unseen data

In [24]:
predictions=reg_model.predict(X_test)

In [25]:
from sklearn.metrics import r2_score

In [26]:
r2_score(y_test,predictions)

0.7587422388407031

## We see that accuracy on unseen data is better than accurancy on seen data. What does this mean?

### Variance-Bias tradeoff

## Can we do better than linear regression?

### Ensemble Methods: These are model models built by combining more than one simpler models

STRENGTH IN UNITY

In [27]:
from sklearn.ensemble import RandomForestRegressor

## We can also control how each individual tree in the forest grows, and how the forest itself grows

These controls are called HYPERPARAMETERS. And the process of fine tuning them is called Hyperparameters Tuning

sklearn.ensemble.RandomForestRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

In [28]:
rf_model =RandomForestRegressor()

In [29]:
rf_model.fit(X_train,y_train)

In [30]:
rf_model.score(X_train,y_train)

0.9751238164843135

### We observe higher accuracy

In [31]:
rf_predictions=rf_model.predict(X_test)

In [32]:
r2_score(y_test,rf_predictions)

0.8377103421392382

## We see that accuracy on unseen data is better than accurancy on seen data. What does this mean?

# Expert Tip: Always read the documentation of each model to know the list of hyperparameters, and the impact of changing each. Some models could have more than 100 hyperparameters!!