In [2]:
# In this workbook, We're going to demonstrate how to make predictions 
# on the medical cost dataset from an insurance company.

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine
import seaborn as sns
# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

In [4]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'medicalcosts'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

md_df = pd.read_sql_query('select * from medicalcosts',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

In [5]:
md_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.9
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.705,0,no,northwest,21984.5
4,32,male,28.88,0,no,northwest,3866.86


In [6]:
md_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207,30.663,1.095,13270.422
std,14.05,6.098,1.205,12110.012
min,18.0,15.96,0.0,1121.87
25%,27.0,26.296,0.0,4740.288
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.694,2.0,16639.9
max,64.0,53.13,5.0,63770.4


In [7]:
md_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 57.6+ KB


In [8]:
total_missing = md_df.isnull().sum().sort_values(ascending=False)
percent_missing = (md_df.isnull().sum()/md_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_missing], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)


Unnamed: 0,Total,Percent
charges,0,0.0
region,0,0.0
smoker,0,0.0
children,0,0.0
bmi,0,0.0
sex,0,0.0
age,0,0.0


In [9]:
# Above, no missing data

In [10]:
# in the past, we used is_male & is_smoker as features

In [11]:
md_df["is_male"]= pd.get_dummies(md_df.sex, drop_first = True)
md_df["is_smoker"]= pd.get_dummies(md_df.smoker, drop_first = True)

In [12]:
print(md_df["is_male"])

0       0
1       1
2       1
3       1
4       1
       ..
1333    1
1334    0
1335    0
1336    0
1337    0
Name: is_male, Length: 1338, dtype: uint8


In [13]:
print(md_df["is_smoker"])

0       1
1       0
2       0
3       0
4       0
       ..
1333    0
1334    0
1335    0
1336    0
1337    1
Name: is_smoker, Length: 1338, dtype: uint8


In [14]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

#rfc = ensemble.RandomForestClassifier()
rfc = ensemble.RandomForestRegressor()
X = md_df.drop('charges',axis= 1)
Y = md_df['charges']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

array([0.8607093 , 0.81401131, 0.80104219, 0.73010407, 0.84123034,
       0.84414086, 0.82900496, 0.7764398 , 0.83723011, 0.86484908])

In [15]:
# Next mode is Decision tree

In [16]:
# Implementation Process
# Separate the target variable
#X = md_df.values[:,8:9]
#Y = md_df.values[:,0]
X = md_df[["is_male", "is_smoker"]]
Y = md_df[["charges"]]

In [17]:
# Split the dataset into Test & Train
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3, random_state = 100)

In [19]:
# Function to perform training 
#clf_entropy = DecisionTreeRegressor(criterion='entropy', random_state = 100, max_depth=3,min_samples_leaf=5)
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor( random_state = 100, max_depth=3, min_samples_leaf=5)
clf.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=100, splitter='best')

In [20]:
# Lets apply our prediction to above model
y_pred = clf.predict(X_test)
print(y_pred)

[ 8199.28903846  8805.72638462  8805.72638462  8805.72638462
 34152.04803922 30301.40375     8199.28903846  8199.28903846
 34152.04803922  8199.28903846  8199.28903846  8805.72638462
  8199.28903846  8199.28903846  8199.28903846 34152.04803922
  8805.72638462  8805.72638462  8199.28903846  8199.28903846
  8199.28903846  8199.28903846  8199.28903846 34152.04803922
  8805.72638462 30301.40375     8199.28903846 30301.40375
 30301.40375     8805.72638462  8805.72638462  8805.72638462
 30301.40375    34152.04803922  8199.28903846  8199.28903846
  8199.28903846 34152.04803922  8199.28903846 30301.40375
  8805.72638462  8805.72638462  8199.28903846  8805.72638462
  8805.72638462  8805.72638462  8805.72638462  8199.28903846
  8199.28903846  8805.72638462  8199.28903846  8805.72638462
  8199.28903846  8199.28903846  8805.72638462  8805.72638462
  8199.28903846 34152.04803922  8805.72638462  8199.28903846
  8805.72638462  8805.72638462  8199.28903846  8805.72638462
  8199.28903846  8805.72638462

In [21]:
y_test.head()

Unnamed: 0,charges
12,1826.84
306,20177.7
318,7421.19
815,1877.93
157,15518.2


In [22]:
y_pred.size


402

In [23]:
from sklearn.metrics import r2_score
# Checking accuracy
#r2_score(y_test, y_pred)
print("Accuracy is " , r2_score(y_test, y_pred)* 100)

Accuracy is  63.31393776985366
