In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
import math
import pickle 
import joblib

In [425]:
!pip3 install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.23.2-cp37-cp37m-macosx_10_9_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 5.5 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.20.3
    Uninstalling scikit-learn-0.20.3:
      Successfully uninstalled scikit-learn-0.20.3
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

azureml-train-automl-runtime 1.11.0.post1 requires numpy<1.17.0,>=1.16.0, but you'll have numpy 1.18.1 which is incompatible.
azureml-train-automl-runtime 1.11.0.post1 requires scikit-learn<=0.20.3,>=0.19.0, but you'll have scikit-learn 0.23.2 which is incompatible.
azureml-automl-runtime 1.11.0 requires

In [2]:
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

import pandas as pd
insurance_df = pd.read_csv("/Users/maryam/Documents/Cineplex/Insurance prediction/nsurance.csv")
insurance_df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
insurance = insurance_df.drop(['charges'], axis=1) # drop labels for training set

insurance_labels = insurance_df['charges'].copy()

In [4]:
# splitting train and test data
X_train, X_test, y_train, y_test = train_test_split(insurance, insurance_labels, test_size=0.4)

In [5]:
X_train_num= X_train.drop([ 'sex','smoker','region'],axis=1)
X_train_cat = X_train[['sex', 'smoker','region']]
num_attribs = list(X_train_num)
cat_attribs = ['sex', 'region','smoker']


In [6]:
cat_attribs

['sex', 'region', 'smoker']

In [7]:
num_attribs

['age', 'bmi', 'children']

In [8]:

class DataFrameSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self.feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
#         return X[ self.feature_names ].values
          df = X.copy()
        # convert columns to categorical
          for name in df.columns.to_list():
                col = pd.Categorical(df[name])
                df[name] = col.codes
    
    #returns numpy array
          return df
    


In [9]:

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [10]:

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [11]:

full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs)])


In [12]:
#RANDOM FOREST WITH GRID SEARCH

In [13]:
param_distribs = {
        'n_estimators': randint(low=4, high=30),
        'max_features': randint(low=1, high=2),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=5, cv=5, scoring='neg_mean_squared_error', random_state=42)

In [14]:

X_train_prepared = full_pipeline.fit_transform(X_train)
rnd_search.fit(X_train_prepared, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   n_iter=5,
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fcb144e9950>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fcb144e9910>},
                   random_state=42, scoring='neg_mean_squared_error')

In [15]:
y_train_pred= rnd_search.predict(X_train_prepared)

In [16]:
rf_mse = mean_squared_error(y_train, y_train_pred)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

2078.2760275596966

In [17]:

# let's try the full preprocessing pipeline on a few training instances
some_data = insurance.iloc[:5]
some_labels = insurance_labels.iloc[:5]
some_data_prepared= full_pipeline.transform(some_data)
y_rf_pred= rnd_search.predict(some_data_prepared)


In [18]:
y_rf_pred

array([23079.20681739,  2098.56916087,  6633.14744783, 17570.04395522,
        8239.90013913])

In [19]:
# validate on test set

x_test_prepared = full_pipeline.transform(X_test)
y_test_pred= rnd_search.predict(x_test_prepared)


In [24]:
rf_mse = mean_squared_error(y_test, y_test_pred)
rf_rmse = np.sqrt(rf_mse)
math.trunc(rf_rmse)

5391

In [24]:
# LINEAR REGRESSIOR
lr= LinearRegression()
X_train_prepared_lr = full_pipeline.fit_transform(X_train)
lr.fit(X_train_prepared_lr, y_train)
y_train_pred_lr= lr.predict(X_train_prepared_lr)
lin_mse = mean_squared_error(y_train, y_train_pred_lr)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

6025.626299576042

In [25]:
# validate on test set

x_test_prepared_lr = full_pipeline.transform(X_test)
y_test_pred_lr= lr.predict(x_test_prepared_lr)
lr_mse = mean_squared_error(y_test, y_test_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_rmse

6089.478791099683

In [26]:
# TRAIN WITH GRADIENT BOOSTING
gb = GradientBoostingRegressor(random_state=0)
X_train_prepared_gb = full_pipeline.fit_transform(X_train)
gb.fit(X_train_prepared_gb, y_train)
y_train_pred_gb= gb.predict(X_train_prepared_gb)
gb_mse = mean_squared_error(y_train, y_train_pred_gb)
gb_rmse = np.sqrt(gb_mse)
gb_rmse

3567.7913053780503

In [27]:
# validate on test set

x_test_prepared_gb = full_pipeline.transform(X_test)
y_test_pred_gb= gb.predict(x_test_prepared_gb)
gb_mse = mean_squared_error(y_test, y_test_pred_gb)
gb_rmse = np.sqrt(gb_mse)
gb_rmse

4733.387466083859

In [27]:
# TRAIN WITH SVR
svr = SVR(epsilon=0.2)
X_train_prepared_svr = full_pipeline.fit_transform(X_train)
svr.fit(X_train_prepared_svr, y_train)
y_train_pred_svr= svr.predict(X_train_prepared_svr)
svr_mse = mean_squared_error(y_train, y_train_pred_svr)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

13163.20160625537

In [28]:
# validate on test set

x_test_prepared_svr = full_pipeline.transform(X_test)
y_test_pred_svr= svr.predict(x_test_prepared_svr)
svr_mse = mean_squared_error(y_test, y_test_pred_svr)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

11701.474462654018

In [32]:
# SAVE DATA PREPRATION PIPLINE AND FINAL MODEL

# model gb
gb_model = gb
Pkl_Filename = "gb.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(gb_model, file)
#...


# model lr
lr_model = lr
Pkl_Filename = "lrm.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(lr_model, file)
#...



# model rf
rf_model = rnd_search
Pkl_Filename = "rf.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(rf_model, file)
#...


# model svr
svr_model = svr
Pkl_Filename = "svr.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(svr_model, file)
#...


# pipline
data_prepration = full_pipeline
Pkl_Filename = "data_prepration.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(data_prepration, file)



In [31]:
# Load the Model back from file
Pkl_Filename = "data_prepration.pkl"  
with open(Pkl_Filename, 'rb') as file:  
    Pickled_data_prepration = pickle.load(file)

Pickled_data_prepration

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['age', 'bmi', 'children']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['sex', 'region', 'smoker'])])

In [32]:
# Load the Model back from file
Pkl_Filename = "gb.pkl"  
with open(Pkl_Filename, 'rb') as file:  
    Pickled_gb_model = pickle.load(file)

Pickled_gb_model

GradientBoostingRegressor(random_state=0)

In [33]:
# let's try the full preprocessing pipeline on a few training instances
some_data = insurance.iloc[:5]
some_labels = insurance_labels.iloc[:5]
some_data_prepared= Pickled_data_prepration.transform(some_data)
y_rf_pred= Pickled_gb_model.predict(some_data_prepared)
y_rf_pred
                                    

array([20509.7723566 ,  2881.17205125,  5962.66493576, 11457.89020139,
        3552.77660032])

In [34]:
# Load the Model back from file
Pkl_Filename = "lrm.pkl"  
with open(Pkl_Filename, 'rb') as file:  
    Pickled_lr_model = pickle.load(file)

Pickled_lr_model

LinearRegression()

In [35]:
# let's try the full preprocessing pipeline on a few training instances
some_data = insurance.iloc[:5]
some_labels = insurance_labels.iloc[:5]
some_data_prepared= Pickled_data_prepration.transform(some_data)
y_rf_pred= Pickled_lr_model.predict(some_data_prepared)
y_rf_pred
             

array([25168.12332877,  2960.4392921 ,  6001.69173246,  3956.36922799,
        5765.07061452])

In [36]:
import nltk
import sklearn

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The nltk version is 3.4.5.
The scikit-learn version is 0.23.2.
