In [20]:
# Import the modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

## Split the Data into Training and Testing Sets

### Step 1: Read the `insurance_dataset.csv` data from the `Resources` folder into a Pandas DataFrame.

In [21]:
# Read insurance_dataset.csv from the Resources folder into a DataFrame and preview the first rows.
insurance_raw_df = pd.read_csv("./Resources/insurance_dataset.csv")
insurance_raw_df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,male,21.45,5,yes,southeast,Diabetes,,Never,Blue collar,Premium,20460.30767
1,25,female,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.89922
2,38,male,44.88,2,yes,southwest,,High blood pressure,Occasionally,Blue collar,Premium,20204.4763
3,25,male,19.89,0,no,northwest,,Diabetes,Rarely,White collar,Standard,11789.02984
4,49,male,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.30984


In [22]:
# NOTE(review): dropna() returns a new DataFrame and the result is not assigned,
# so insurance_raw_df itself is left unchanged. This cell only displays the frame
# with any rows containing missing values removed.
insurance_raw_df.dropna()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,male,21.45,5,yes,southeast,Diabetes,,Never,Blue collar,Premium,20460.307670
1,25,female,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.899220
2,38,male,44.88,2,yes,southwest,,High blood pressure,Occasionally,Blue collar,Premium,20204.476300
3,25,male,19.89,0,no,northwest,,Diabetes,Rarely,White collar,Standard,11789.029840
4,49,male,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.309840
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,59,male,46.67,2,no,northeast,High blood pressure,,Frequently,Student,Basic,11584.134900
999996,33,male,36.83,2,no,northeast,,High blood pressure,Frequently,Unemployed,Basic,9834.871456
999997,39,male,39.84,0,yes,northeast,Heart disease,High blood pressure,Rarely,Blue collar,Standard,22076.632860
999998,37,female,45.06,4,yes,northeast,High blood pressure,Diabetes,Occasionally,Unemployed,Premium,20297.618730


In [23]:
# Count the distinct values in each column to tell categorical columns from continuous ones.
insurance_raw_df.nunique()

age                           48
gender                         2
bmi                         3201
children                       6
smoker                         2
region                         4
medical_history                4
family_medical_history         4
exercise_frequency             4
occupation                     4
coverage_level                 3
charges                   999695
dtype: int64

# Binning

In [24]:
# Look at medical_history value counts for binning
# (value_counts() lists each distinct value with its row count, most frequent first).
insurance_raw_df["medical_history"].value_counts()

None                   250762
Heart disease          250121
High blood pressure    249782
Diabetes               249335
Name: medical_history, dtype: int64

In [25]:
# Look at family_medical_history value counts for binning
# (rows per distinct category, most frequent first).
insurance_raw_df['family_medical_history'].value_counts()

None                   250404
Heart disease          250035
High blood pressure    249824
Diabetes               249737
Name: family_medical_history, dtype: int64

In [26]:
# Look at exercise_frequency value counts for binning
# (rows per distinct category, most frequent first).
insurance_raw_df['exercise_frequency'].value_counts()

Rarely          250538
Occasionally    250362
Frequently      249746
Never           249354
Name: exercise_frequency, dtype: int64

In [27]:
# Look at occupation value counts for binning
# (rows per distinct category, most frequent first).
insurance_raw_df['occupation'].value_counts()

Unemployed      250571
Student         250279
Blue collar     249825
White collar    249325
Name: occupation, dtype: int64

In [28]:
# Look at coverage_level value counts for binning
# (rows per distinct category, most frequent first).
insurance_raw_df['coverage_level'].value_counts()

Basic       333515
Standard    333508
Premium     332977
Name: coverage_level, dtype: int64

In [29]:
# Look at charges value counts; charges is the column later separated out as the prediction target.
insurance_raw_df['charges'].value_counts()

23203.51170    2
14242.88970    2
17804.72484    2
19219.99252    2
19084.89174    2
              ..
19540.35545    1
16294.74663    1
18149.64475    1
14613.49927    1
19600.10121    1
Name: charges, Length: 999695, dtype: int64

In [30]:
# List the raw column names before any encoding is applied.
insurance_raw_df.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region',
       'medical_history', 'family_medical_history', 'exercise_frequency',
       'occupation', 'coverage_level', 'charges'],
      dtype='object')

In [31]:
# One-hot encode the frame: get_dummies expands each text column into one
# indicator column per category and leaves the numeric columns as they are.
concatenated = pd.get_dummies(data=insurance_raw_df)

# Show the resulting column names to confirm the expansion.
concatenated.columns

Index(['age', 'bmi', 'children', 'charges', 'gender_female', 'gender_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'medical_history_Diabetes',
       'medical_history_Heart disease', 'medical_history_High blood pressure',
       'medical_history_None', 'family_medical_history_Diabetes',
       'family_medical_history_Heart disease',
       'family_medical_history_High blood pressure',
       'family_medical_history_None', 'exercise_frequency_Frequently',
       'exercise_frequency_Never', 'exercise_frequency_Occasionally',
       'exercise_frequency_Rarely', 'occupation_Blue collar',
       'occupation_Student', 'occupation_Unemployed',
       'occupation_White collar', 'coverage_level_Basic',
       'coverage_level_Premium', 'coverage_level_Standard'],
      dtype='object')

In [32]:
# Preview the first five rows of the encoded frame (head() defaults to 5 rows).
concatenated.head()

Unnamed: 0,age,bmi,children,charges,gender_female,gender_male,smoker_no,smoker_yes,region_northeast,region_northwest,...,exercise_frequency_Never,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Blue collar,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Basic,coverage_level_Premium,coverage_level_Standard
0,46,21.45,5,20460.30767,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
1,25,25.38,2,20390.89922,1,0,0,1,0,1,...,0,1,0,0,0,0,1,0,1,0
2,38,44.88,2,20204.4763,0,1,0,1,0,0,...,0,1,0,1,0,0,0,0,1,0
3,25,19.89,0,11789.02984,0,1,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,49,38.21,3,19268.30984,0,1,0,1,0,1,...,0,0,1,0,0,0,1,0,0,1


In [33]:
# Split the encoded frame into the target vector and the feature matrix.

# Target (y): the `charges` column.
y = concatenated.loc[:, 'charges']

# Features (X): every column except `charges`.
X = concatenated.drop("charges", axis=1)

In [34]:
# Confirm that `charges` is no longer among the feature columns.
X.columns

Index(['age', 'bmi', 'children', 'gender_female', 'gender_male', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'medical_history_Diabetes',
       'medical_history_Heart disease', 'medical_history_High blood pressure',
       'medical_history_None', 'family_medical_history_Diabetes',
       'family_medical_history_Heart disease',
       'family_medical_history_High blood pressure',
       'family_medical_history_None', 'exercise_frequency_Frequently',
       'exercise_frequency_Never', 'exercise_frequency_Occasionally',
       'exercise_frequency_Rarely', 'occupation_Blue collar',
       'occupation_Student', 'occupation_Unemployed',
       'occupation_White collar', 'coverage_level_Basic',
       'coverage_level_Premium', 'coverage_level_Standard'],
      dtype='object')

In [35]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,age,bmi,children,gender_female,gender_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,...,exercise_frequency_Never,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Blue collar,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Basic,coverage_level_Premium,coverage_level_Standard
0,46,21.45,5,0,1,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
1,25,25.38,2,1,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,1,0
2,38,44.88,2,0,1,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
3,25,19.89,0,0,1,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
4,49,38.21,3,0,1,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,1


In [36]:
# Split the features and target into training and testing sets.
# (train_test_split is imported with the other modules at the top of the notebook.)
# test_size=0.2 holds out 20% of the rows for testing; random_state=1 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Display the shapes of the four pieces as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800000, 30), (200000, 30), (800000,), (200000,))

In [50]:
# Train and evaluate each candidate regressor on the same train/test split.
# The metric lists below are index-aligned with `models`: position 0 is the
# linear regression and position 1 is the decision tree.
# random_state is pinned on the tree so repeated runs grow the same tree
# (it uses the same seed value as the train/test split).
models = [LinearRegression(), DecisionTreeRegressor(random_state=1)]
reg_mse = []
reg_rmse = []
reg_r2_score = []
y_pred = []

for model in models:
    # Fit on the training split, then predict the held-out test rows.
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    y_pred.append(test_predictions)

    print(f'{model} : ')
    # For regressors, .score() returns R^2; a large gap between the training
    # and testing scores points to overfitting.
    print(f"Training Data Score: {model.score(X_train, y_train)}")
    print(f"Testing Data Score: {model.score(X_test, y_test)}")

    mse = mean_squared_error(y_test, test_predictions)
    reg_mse.append(mse)
    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
    # newer scikit-learn releases deprecate and then remove.
    reg_rmse.append(np.sqrt(mse))
    reg_r2_score.append(r2_score(y_test, test_predictions))

    # Predicted vs. actual charges for the current model, side by side.
    # `results` is overwritten on every iteration, so after the loop it holds
    # the values for the last model in `models`.
    results = pd.DataFrame({"Prediction": test_predictions, "Actual": y_test}).reset_index(drop=True)
    print(f'Mean Squared Error: {reg_mse[-1]}')
    print(f'Root Mean Squared Error: {reg_rmse[-1]}')
    print(f'R2 score: {reg_r2_score[-1]}')
    print()
	

LinearRegression() : 
Training Data Score: 0.9957267068571238
Testing Data Score: 0.9957190785263942
Mean Squared Error: 83547.32738761159
Root Mean Squared Error: 289.0455455245965
R2 score: 0.9957190785263942

DecisionTreeRegressor() : 
Training Data Score: 0.9999998052576735
Testing Data Score: 0.98682458255003
Mean Squared Error: 257134.10581061314
Root Mean Squared Error: 507.08392383373103
R2 score: 0.98682458255003



# Save Model

In [None]:
# Write both trained estimators to joblib files in the current working directory.
# models[0] is the LinearRegression and models[1] is the DecisionTreeRegressor,
# following the order of the `models` list built in the training cell.
dump(models[0], 'model_LinearRegression.joblib')
dump(models[1], 'model_DecisionTreeRegressor.joblib')

['model_DecisionTreeRegressor.joblib']