In [96]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

## Split the Data into Training and Testing Sets

### Step 1: Read the `insurance_dataset.csv` data from the `Resource` folder into a Pandas DataFrame.

In [97]:
#  Import and read the charity_data.csv.
insurance_raw_df = pd.read_csv("./Resource/insurance_dataset.csv")
insurance_raw_df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,male,21.45,5,yes,southeast,Diabetes,,Never,Blue collar,Premium,20460.30767
1,25,female,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.89922
2,38,male,44.88,2,yes,southwest,,High blood pressure,Occasionally,Blue collar,Premium,20204.4763
3,25,male,19.89,0,no,northwest,,Diabetes,Rarely,White collar,Standard,11789.02984
4,49,male,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.30984


In [98]:
# Downsize dataframe
insurance_df = insurance_raw_df.loc[insurance_raw_df["region"] == 'northwest']
insurance_df.drop(columns='region')

Unnamed: 0,age,gender,bmi,children,smoker,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
1,25,female,25.38,2,yes,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.899220
3,25,male,19.89,0,no,,Diabetes,Rarely,White collar,Standard,11789.029840
4,49,male,38.21,3,yes,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.309840
10,21,male,42.08,1,yes,,Diabetes,Rarely,Student,Premium,18996.131560
11,45,female,39.68,1,no,High blood pressure,High blood pressure,Occasionally,Blue collar,Premium,14892.145930
...,...,...,...,...,...,...,...,...,...,...,...
999979,52,female,18.56,0,yes,Heart disease,High blood pressure,Occasionally,White collar,Standard,20023.506850
999983,53,female,23.27,5,yes,Heart disease,Diabetes,Occasionally,Unemployed,Standard,20126.797160
999985,60,male,37.00,4,no,High blood pressure,Heart disease,Occasionally,White collar,Standard,18098.555840
999987,45,female,28.36,1,no,Diabetes,High blood pressure,Never,Student,Basic,8089.419329


In [99]:
insurance_df.nunique()

age                           48
gender                         2
bmi                         3201
children                       6
smoker                         2
region                         1
medical_history                4
family_medical_history         4
exercise_frequency             4
occupation                     4
coverage_level                 3
charges                   249913
dtype: int64

# Binning

In [100]:
# Look at medical_history value counts for binning
insurance_df["medical_history"].value_counts()

None                   62918
High blood pressure    62500
Heart disease          62432
Diabetes               62081
Name: medical_history, dtype: int64

In [101]:
# Look at family_medical_history value counts for binning
insurance_df['family_medical_history'].value_counts()

None                   62622
High blood pressure    62603
Heart disease          62409
Diabetes               62297
Name: family_medical_history, dtype: int64

In [102]:
# Look at exercise_frequency value counts for binning
insurance_df['exercise_frequency'].value_counts()

Rarely          62599
Occasionally    62565
Frequently      62511
Never           62256
Name: exercise_frequency, dtype: int64

In [103]:
# Look at occupation value counts for binning
insurance_df['occupation'].value_counts()

Unemployed      62505
Blue collar     62500
White collar    62477
Student         62449
Name: occupation, dtype: int64

In [104]:
# Look at occupation coverage_level value counts for binning
insurance_df['coverage_level'].value_counts()

Basic       83606
Standard    83185
Premium     83140
Name: coverage_level, dtype: int64

In [105]:
insurance_df['charges'].value_counts()

12700.212320    2
15316.455200    2
13395.768570    2
16632.495860    2
10119.971020    2
               ..
7445.091173     1
18649.648350    1
11883.755240    1
11510.187250    1
18394.902720    1
Name: charges, Length: 249913, dtype: int64

In [106]:
insurance_df.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region',
       'medical_history', 'family_medical_history', 'exercise_frequency',
       'occupation', 'coverage_level', 'charges'],
      dtype='object')

In [107]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies_gender = pd.get_dummies(insurance_df['gender'])
dummies_smoker = pd.get_dummies(insurance_df['smoker'])
dummies_medical_history = pd.get_dummies(insurance_df['medical_history'])
dummies_family_medical_history = pd.get_dummies(insurance_df['family_medical_history'])
dummies_exercise_frequency = pd.get_dummies(insurance_df['exercise_frequency'])
dummies_occupation = pd.get_dummies(insurance_df['occupation'])
dummies_coverage_level = pd.get_dummies(insurance_df['coverage_level'])

concatenated = pd.concat([insurance_df,dummies_gender, dummies_smoker, dummies_medical_history, dummies_family_medical_history, dummies_exercise_frequency,dummies_occupation,dummies_coverage_level], axis="columns")
concatenated

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,...,Never,Occasionally,Rarely,Blue collar,Student,Unemployed,White collar,Basic,Premium,Standard
1,25,female,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,...,0,1,0,0,0,0,1,0,1,0
3,25,male,19.89,0,no,northwest,,Diabetes,Rarely,White collar,...,0,0,1,0,0,0,1,0,0,1
4,49,male,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,...,0,0,1,0,0,0,1,0,0,1
10,21,male,42.08,1,yes,northwest,,Diabetes,Rarely,Student,...,0,0,1,0,1,0,0,0,1,0
11,45,female,39.68,1,no,northwest,High blood pressure,High blood pressure,Occasionally,Blue collar,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999979,52,female,18.56,0,yes,northwest,Heart disease,High blood pressure,Occasionally,White collar,...,0,1,0,0,0,0,1,0,0,1
999983,53,female,23.27,5,yes,northwest,Heart disease,Diabetes,Occasionally,Unemployed,...,0,1,0,0,0,1,0,0,0,1
999985,60,male,37.00,4,no,northwest,High blood pressure,Heart disease,Occasionally,White collar,...,0,1,0,0,0,0,1,0,0,1
999987,45,female,28.36,1,no,northwest,Diabetes,High blood pressure,Never,Student,...,1,0,0,0,1,0,0,1,0,0


In [108]:
# drop categorical data
concatenated_df = concatenated.drop(columns=['gender', 'smoker','region', 'medical_history', 'family_medical_history', 'exercise_frequency', 'occupation', 'coverage_level'])
concatenated_df

Unnamed: 0,age,bmi,children,charges,female,male,no,yes,Diabetes,Heart disease,...,Never,Occasionally,Rarely,Blue collar,Student,Unemployed,White collar,Basic,Premium,Standard
1,25,25.38,2,20390.899220,1,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,25,19.89,0,11789.029840,0,1,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
4,49,38.21,3,19268.309840,0,1,0,1,1,0,...,0,0,1,0,0,0,1,0,0,1
10,21,42.08,1,18996.131560,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
11,45,39.68,1,14892.145930,1,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999979,52,18.56,0,20023.506850,1,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,1
999983,53,23.27,5,20126.797160,1,0,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
999985,60,37.00,4,18098.555840,0,1,1,0,0,0,...,0,1,0,0,0,0,1,0,0,1
999987,45,28.36,1,8089.419329,1,0,1,0,1,0,...,1,0,0,0,1,0,0,1,0,0


In [109]:
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 249931 entries, 1 to 999992
Data columns (total 27 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  249931 non-null  int64  
 1   bmi                  249931 non-null  float64
 2   children             249931 non-null  int64  
 3   charges              249931 non-null  float64
 4   female               249931 non-null  uint8  
 5   male                 249931 non-null  uint8  
 6   no                   249931 non-null  uint8  
 7   yes                  249931 non-null  uint8  
 8   Diabetes             249931 non-null  uint8  
 9   Heart disease        249931 non-null  uint8  
 10  High blood pressure  249931 non-null  uint8  
 11  None                 249931 non-null  uint8  
 12  Diabetes             249931 non-null  uint8  
 13  Heart disease        249931 non-null  uint8  
 14  High blood pressure  249931 non-null  uint8  
 15  None             

In [110]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = concatenated_df['charges']

# Separate the X variable, the features
X = concatenated_df.drop(columns="charges")

In [111]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size= 0.2,
                                                    random_state=1, 
                                                    )
X_train.shape, X_test.shape

((199944, 26), (49987, 26))

## Create a Logistic Regression Model with the Original Data
###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [112]:
# Import the LogisticRegression module from SKLearn
from sklearn.tree import DecisionTreeRegressor

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
model = DecisionTreeRegressor(random_state=1)

# Fit the model using training data
model.fit(X_train,y_train)

DecisionTreeRegressor(random_state=1)

In [113]:
# Score the model using the test data
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.9999999431574907
Testing Data Score: 0.9858839525007413


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [114]:
# Generate testing predictions
testing_predictions = model.predict(X_test)

In [115]:
# Make a prediction using the testing data
results = pd.DataFrame({"Prediction": testing_predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,19589.32245,19987.46983
1,22673.56408,22253.45933
2,13352.3685,12601.39776
3,18708.48754,18102.75
4,23886.08602,22802.38973
5,23846.31186,23973.04374
6,14400.78495,14227.83726
7,9957.722589,9398.161658
8,18027.24674,18432.60158
9,12386.32082,12466.652


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [120]:
model.score(X_test,y_test)

0.9858839525007413