In [150]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
import pickle 
import joblib

In [151]:
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

In [152]:
import pandas as pd
insurance_df = pd.read_csv("/Users/maryam/Documents/Cineplex/Insurance prediction/nsurance.csv")
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [153]:
insurance_df.shape

(1338, 7)

In [154]:
insurance = insurance_df.drop(['charges'], axis=1) # drop labels for training set

insurance_labels = insurance_df['charges'].copy()

In [155]:
# splitting train and test data
X_train, X_test, y_train, y_test = train_test_split(insurance, insurance_labels, test_size=0.4)

In [156]:
X_train_num= X_train.drop(['sex', 'region','smoker'],axis=1)
X_train_cat = X_train[['sex', 'region']]

In [157]:

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X_train_num_tr = num_pipeline.fit_transform(X_train_num)

In [158]:
X_train_num_tr

array([[ 0.07027237,  0.24062563,  0.78366662],
       [ 0.28843407,  0.78193075, -0.0749408 ],
       [ 0.94291917,  0.39458966, -0.93354822],
       ...,
       [-0.36605103, -0.89870819, -0.0749408 ],
       [ 1.01563974,  0.82406827,  0.78366662],
       [ 0.50659577, -0.44816082, -0.0749408 ]])

In [159]:
num_attribs = list(X_train_num)
cat_attribs = ['sex', 'region','smoker']

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

X_train_prepared = full_pipeline.fit_transform(X_train)

In [160]:
X_train_prepared

array([[ 0.07027237,  0.24062563,  0.78366662, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.28843407,  0.78193075, -0.0749408 , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.94291917,  0.39458966, -0.93354822, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.36605103, -0.89870819, -0.0749408 , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.01563974,  0.82406827,  0.78366662, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.50659577, -0.44816082, -0.0749408 , ...,  0.        ,
         1.        ,  0.        ]])

In [161]:

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
# rnd_search.fit(X_train, y_train)

In [162]:
full_pipeline_randomforest_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("rnd_search", rnd_search),
    ])

full_pipeline_randomforest_with_predictor.fit(X_train, y_train)


Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi', 'children']),
                                                 ('cat', OneHotEncoder(),
                                                  ['sex', 'region',
                                                   'smoker'])])),
                ('rnd_search',
                 RandomizedSearchCV(cv=5,
                                    estimator=RandomForestRegressor(random_state=42),
                                    param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_froze

In [163]:

# let's try the full preprocessing pipeline on a few training instances
some_data = insurance.iloc[:5]
some_labels = insurance_labels.iloc[:5]

In [164]:
full_pipeline_randomforest_with_predictor.predict(some_data)

array([17681.8267665,  3370.368476 ,  4707.2949513, 16998.5691497,
        6159.7849055])

In [165]:
insurance_predictions = full_pipeline_randomforest_with_predictor.predict(X_train)
lin_mse = mean_squared_error(y_train, insurance_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

1882.969390750379

In [166]:
lin_mae = mean_absolute_error(y_train, insurance_predictions)
lin_mae

1001.8912566699468

In [179]:
my_model = full_pipeline_with_predictor

# joblib.dump(my_model, "my_model.pkl") # DIFF
Pkl_Filename = "rfr.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(my_model, file)
#...


In [168]:
# Load the Model back from file
with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

Pickled_LR_Model

Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi', 'children']),
                                                 ('cat', OneHotEncoder(),
                                                  ['sex', 'region',
                                                   'smoker'])])),
                ('rnd_search',
                 RandomizedSearchCV(cv=5,
                                    estimator=RandomForestRegressor(random_state=42),
                                    param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_froze

In [169]:
# predict on the test set
full_pipeline_randomforest_with_predictor.fit(X_test, y_test)
test_predictions=full_pipeline_randomforest_with_predictor.predict(X_test)
lin_mae = mean_absolute_error(y_test, test_predictions)
lin_mae

914.7721034899466

In [170]:
# let's try the full preprocessing pipeline on a few training instances
some_data = insurance.iloc[:10]
some_labels = insurance_labels.iloc[:10]

In [171]:
my_model_loaded.predict(some_data)

array([17339.627962 ,  2755.4684147,  5620.7612018, 16615.2560596,
        4578.1183632,  4464.7575887,  8355.1089617,  7075.537398 ,
        7337.1197902, 22154.2717158])

In [172]:
some_labels

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
5     3756.62160
6     8240.58960
7     7281.50560
8     6406.41070
9    28923.13692
Name: charges, dtype: float64

In [174]:
# Linear Regressior Pipline

full_pipeline_linearRegressior_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("lr", LinearRegression()),
    ])

full_pipeline_linearRegressior_with_predictor.fit(X_train, y_train)

Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi', 'children']),
                                                 ('cat', OneHotEncoder(),
                                                  ['sex', 'region',
                                                   'smoker'])])),
                ('lr', LinearRegression())])

In [175]:
# let's try the full preprocessing pipeline on a few training instances
some_data = insurance.iloc[:5]
some_labels = insurance_labels.iloc[:5]
full_pipeline_linearRegressior_with_predictor.predict(some_data)

array([24719.89689521,  3838.69613738,  7418.11075855,  4207.90187791,
        5775.13532081])

In [176]:
insurance_predictions = full_pipeline_linearRegressior_with_predictor.predict(X_train)
lin_mse = mean_squared_error(y_train, insurance_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

6203.931032183709

In [178]:
lr_model = full_pipeline_linearRegressior_with_predictor

# joblib.dump(my_model, "my_model.pkl") # DIFF
Pkl_Filename = "lr.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(lr_model, file)
#...

