In [51]:
# Import the modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

## Split the Data into Training and Testing Sets

### Step 1: Read the `insurance_dataset.csv` data from the `Resources` folder into a Pandas DataFrame.

In [52]:
# Import and read insurance_dataset.csv.
# "None" is a genuine category in medical_history / family_medical_history.
# pandas >= 2.0 parses the string "None" as a missing value by default, which
# would turn that category into NaN and silently drop the *_None dummy columns
# created later. Treat only empty fields as missing so the category survives
# on every pandas version.
insurance_raw_df = pd.read_csv(
    "./Resources/insurance_dataset.csv",
    keep_default_na=False,
    na_values=[""],
)
insurance_raw_df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,male,21.45,5,yes,southeast,Diabetes,,Never,Blue collar,Premium,20460.30767
1,25,female,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.89922
2,38,male,44.88,2,yes,southwest,,High blood pressure,Occasionally,Blue collar,Premium,20204.4763
3,25,male,19.89,0,no,northwest,,Diabetes,Rarely,White collar,Standard,11789.02984
4,49,male,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.30984


In [53]:
# Downsize dataframe
# insurance_df = insurance_raw_df.loc[insurance_raw_df["region"] == 'northwest']
# insurance_df.drop(columns='region')

In [54]:
# Count distinct values per column to tell low-cardinality categorical
# features apart from continuous ones
insurance_raw_df.nunique()

age                           48
gender                         2
bmi                         3201
children                       6
smoker                         2
region                         4
medical_history                4
family_medical_history         4
exercise_frequency             4
occupation                     4
coverage_level                 3
charges                   999695
dtype: int64

# Binning

In [55]:
# Look at medical_history value counts to see whether any rare categories
# would need to be binned together
insurance_raw_df["medical_history"].value_counts()

None                   250762
Heart disease          250121
High blood pressure    249782
Diabetes               249335
Name: medical_history, dtype: int64

In [56]:
# Look at family_medical_history value counts to see whether any rare
# categories would need to be binned together
insurance_raw_df['family_medical_history'].value_counts()

None                   250404
Heart disease          250035
High blood pressure    249824
Diabetes               249737
Name: family_medical_history, dtype: int64

In [57]:
# Look at exercise_frequency value counts to see whether any rare categories
# would need to be binned together
insurance_raw_df['exercise_frequency'].value_counts()

Rarely          250538
Occasionally    250362
Frequently      249746
Never           249354
Name: exercise_frequency, dtype: int64

In [58]:
# Look at occupation value counts to see whether any rare categories would
# need to be binned together
insurance_raw_df['occupation'].value_counts()

Unemployed      250571
Student         250279
Blue collar     249825
White collar    249325
Name: occupation, dtype: int64

In [59]:
# Look at coverage_level value counts to see whether any rare categories
# would need to be binned together
insurance_raw_df['coverage_level'].value_counts()

Basic       333515
Standard    333508
Premium     332977
Name: coverage_level, dtype: int64

In [60]:
# charges is the continuous column used as the regression target; nunique()
# above reports 999,695 distinct values, so this is only a check for
# duplicated amounts rather than an input to binning
insurance_raw_df['charges'].value_counts()

23203.51170    2
14242.88970    2
17804.72484    2
19219.99252    2
19084.89174    2
              ..
19540.35545    1
16294.74663    1
18149.64475    1
14613.49927    1
19600.10121    1
Name: charges, Length: 999695, dtype: int64

In [61]:
# List the raw column names before encoding
insurance_raw_df.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region',
       'medical_history', 'family_medical_history', 'exercise_frequency',
       'occupation', 'coverage_level', 'charges'],
      dtype='object')

In [62]:
# Convert categorical data to numeric with `pd.get_dummies`
# dummies_gender = pd.get_dummies(insurance_df['gender'])
# dummies_smoker = pd.get_dummies(insurance_df['smoker'])
# dummies_medical_history = pd.get_dummies(insurance_df['medical_history'])
# dummies_family_medical_history = pd.get_dummies(insurance_df['family_medical_history'])
# dummies_exercise_frequency = pd.get_dummies(insurance_df['exercise_frequency'])
# dummies_occupation = pd.get_dummies(insurance_df['occupation'])
# dummies_coverage_level = pd.get_dummies(insurance_df['coverage_level'])

# concatenated = pd.concat([insurance_df,dummies_gender, dummies_smoker, dummies_medical_history, dummies_family_medical_history, dummies_exercise_frequency,dummies_occupation,dummies_coverage_level], axis="columns")
# concatenated.head(5)

In [63]:
# One-hot encode every string-valued column in a single call; the numeric
# columns (age, bmi, children, charges) pass through unchanged.
# NOTE(review): every category keeps all of its levels (e.g. both smoker_no
# and smoker_yes), so the dummy columns are linearly dependent. Predictions
# are unaffected, but consider drop_first=True if the LinearRegression
# coefficients are ever interpreted — TODO confirm against the modelling goal.
concatenated = pd.get_dummies(insurance_raw_df)
concatenated.columns

Index(['age', 'bmi', 'children', 'charges', 'gender_female', 'gender_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'medical_history_Diabetes',
       'medical_history_Heart disease', 'medical_history_High blood pressure',
       'medical_history_None', 'family_medical_history_Diabetes',
       'family_medical_history_Heart disease',
       'family_medical_history_High blood pressure',
       'family_medical_history_None', 'exercise_frequency_Frequently',
       'exercise_frequency_Never', 'exercise_frequency_Occasionally',
       'exercise_frequency_Rarely', 'occupation_Blue collar',
       'occupation_Student', 'occupation_Unemployed',
       'occupation_White collar', 'coverage_level_Basic',
       'coverage_level_Premium', 'coverage_level_Standard'],
      dtype='object')

In [64]:
# Preview the first rows of the encoded frame
concatenated.head(5)

Unnamed: 0,age,bmi,children,charges,gender_female,gender_male,smoker_no,smoker_yes,region_northeast,region_northwest,...,exercise_frequency_Never,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Blue collar,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Basic,coverage_level_Premium,coverage_level_Standard
0,46,21.45,5,20460.30767,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
1,25,25.38,2,20390.89922,1,0,0,1,0,1,...,0,1,0,0,0,0,1,0,1,0
2,38,44.88,2,20204.4763,0,1,0,1,0,0,...,0,1,0,1,0,0,0,0,1,0
3,25,19.89,0,11789.02984,0,1,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,49,38.21,3,19268.30984,0,1,0,1,0,1,...,0,0,1,0,0,0,1,0,0,1


In [65]:
# Split the encoded frame into predictors and regression target
# Features (X): every encoded column except the target
X = concatenated.drop(columns=["charges"])

# Target (y): the insurance charges the models will predict
y = concatenated["charges"]

In [66]:
# Confirm that the target column is no longer among the features
X.columns

Index(['age', 'bmi', 'children', 'gender_female', 'gender_male', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'medical_history_Diabetes',
       'medical_history_Heart disease', 'medical_history_High blood pressure',
       'medical_history_None', 'family_medical_history_Diabetes',
       'family_medical_history_Heart disease',
       'family_medical_history_High blood pressure',
       'family_medical_history_None', 'exercise_frequency_Frequently',
       'exercise_frequency_Never', 'exercise_frequency_Occasionally',
       'exercise_frequency_Rarely', 'occupation_Blue collar',
       'occupation_Student', 'occupation_Unemployed',
       'occupation_White collar', 'coverage_level_Basic',
       'coverage_level_Premium', 'coverage_level_Standard'],
      dtype='object')

In [67]:
# Split the data using train_test_split (imported with the other modules in
# the first cell), holding out 20% of the rows for testing.
# Assign a random_state of 1 so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
# Display the shapes to confirm the 80/20 split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800000, 30), (200000, 30), (800000,), (200000,))

In [68]:
# Fit each candidate regressor on the training split and compare its fit on
# the training data with its fit on the held-out test data.
# random_state is pinned on the tree because scikit-learn permutes the
# features at every split, so equally good splits can otherwise be resolved
# differently from one run to the next.
# The order of this list matters: later cells refer to models[0] and models[1].
models = [LinearRegression(), DecisionTreeRegressor(random_state=1)]

for model in models:
    model.fit(X_train, y_train)
    print(f'{model} : ')
    # For regressors, .score() returns R^2 (the coefficient of determination);
    # a training score far above the testing score points to overfitting.
    print(f"Training Data Score: {model.score(X_train, y_train)}")
    print(f"Testing Data Score: {model.score(X_test, y_test)}")
    # Make a prediction using the testing data
    testing_predictions = model.predict(X_test)
    # Drop the shuffled row index so predictions and actuals print side by side
    results = pd.DataFrame({"Prediction": testing_predictions, "Actual": y_test}).reset_index(drop=True)
    print(results.head(5))
    print()

LinearRegression() : 
Training Data Score: 0.9957267068571238
Testing Data Score: 0.9957190785263942
     Prediction       Actual
0  12378.602826  12481.06896
1  18783.920686  18299.07199
2  18862.831011  18846.79561
3  21283.839914  21597.66307
4  25182.982849  25596.72139

DecisionTreeRegressor() : 
Training Data Score: 0.9999998052576735
Testing Data Score: 0.9868358095789653
    Prediction       Actual
0  12325.74429  12481.06896
1  19359.47097  18299.07199
2  19172.32232  18846.79561
3  21409.86323  21597.66307
4  25362.39435  25596.72139



# Save Model

In [69]:
# Persist both fitted estimators next to the notebook so they can be reloaded
# later without retraining. models[0] is the LinearRegression and models[1]
# the DecisionTreeRegressor, matching the order in which the list was built.
# joblib files are pickle-based: only load them from a trusted source.
dump(models[0], 'model_LinearRegression.joblib')
dump(models[1], 'model_DecisionTreeRegressor.joblib')

['model_DecisionTreeRegressor.joblib']