In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
#data preparation
df.columns = df.columns.str.replace(' ','_').str.lower()
df.charges = np.log1p(df.charges)

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,9.734236
1,18,male,33.77,1,no,southeast,7.453882
2,28,male,33.0,3,no,southeast,8.400763
3,33,male,22.705,0,no,northwest,9.998137
4,32,male,28.88,0,no,northwest,8.260455


In [5]:
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [6]:
#split data in categorical/numerical
categorical = list(df.dtypes[df.dtypes == object].index)
numerical = list(df.dtypes[df.dtypes != object].index)

In [7]:
#check foolish data
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
#split data in 3 sets
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [10]:
#drop set indexes
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [11]:
y_train = df_train.charges.values
y_val = df_val.charges.values
y_test = df_test.charges.values

In [12]:
del df_train['charges']
del df_val['charges']
del df_test['charges']

In [13]:
numerical.remove('charges')

In [14]:
df_full_train[numerical].corrwith(df_full_train.charges)

age         0.523401
bmi         0.124984
children    0.126453
dtype: float64

In [15]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

In [16]:
def test_by_linear_regr(df_train, y_train, df_val, y_val):
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_train.to_dict(orient='records'))
    X_val = dv.transform(df_val.to_dict(orient='records'))
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(mse(y_pred, y_val, squared=False))
    print(mse(np.expm1(y_pred), np.expm1(y_val), squared=False))

In [17]:
test_by_linear_regr(df_train[numerical+categorical],y_train,df_val[numerical+categorical],y_val)

0.44624455649573946
7565.767254470169


In [18]:
def test_by_logical_regr(df_train, y_train, df_val, y_val):
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_train.to_dict(orient='records'))
    X_val = dv.transform(df_val.to_dict(orient='records'))
    model = Ridge()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(mse(y_pred, y_val, squared=False))
    print(mse(np.expm1(y_pred), np.expm1(y_val), squared=False))

In [19]:
test_by_logical_regr(df_train[numerical+categorical],y_train,df_val[numerical+categorical],y_val)

0.44606358214378133
7532.375827396043


In [20]:
def test_by_decision_tree(df_train, y_train, df_val, y_val):
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_train.to_dict(orient='records'))
    X_val = dv.transform(df_val.to_dict(orient='records'))
    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(mse(y_pred, y_val, squared=False))
    print(mse(np.expm1(y_pred), np.expm1(y_val), squared=False))

In [21]:
test_by_decision_tree(df_train[numerical+categorical],y_train,df_val[numerical+categorical],y_val)

0.5167341227727849
6382.493490268411


In [22]:
#standardization

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [24]:
sc_x = StandardScaler()
sc_y = StandardScaler()
# sc_x = MinMaxScaler()
# sc_y = MinMaxScaler()

In [25]:
X = df[numerical].values
y = df.charges.values

In [26]:
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y.reshape(-1, 1)).flatten()

In [27]:
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_std, y_std, test_size=0.3, random_state=0)

In [28]:
#linear
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

In [29]:
print(mse(y_test_pred, y_test_scaled, squared=False))
print(mse(np.expm1(y_test_pred), np.expm1(y_test_scaled), squared=False))

0.8403505095593226
1.5997490983243314


In [30]:
#ridge
model = Ridge()
model.fit(X_train_scaled, y_train_scaled)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

In [31]:
print(mse(y_test_pred, y_test_scaled, squared=False))
print(mse(np.expm1(y_test_pred), np.expm1(y_test_scaled), squared=False))

0.8403834970709496
1.599847408357255


In [32]:
#decision_tree
model = DecisionTreeRegressor()
model.fit(X_train_scaled, y_train_scaled)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

In [33]:
print(mse(y_test_pred, y_test_scaled, squared=False))
print(mse(np.expm1(y_test_pred), np.expm1(y_test_scaled), squared=False))

1.2345548601068368
2.1672521267987386
