### Import Dependencies

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

### Import & Inspect Data

In [2]:
data = "healthcare-dataset-stroke-data - Copy.csv"
dataframe = pd.read_csv(data)

In [3]:
dataframe.sort_values(by=['stroke'], ascending=False).head(100)

Unnamed: 0,id,gender,age,age_group,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,61-70,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
171,60739,Female,79.0,71-80,1,1,No,Self-employed,Rural,60.94,,never smoked,1
158,66258,Female,71.0,71-80,0,0,Yes,Self-employed,Urban,195.71,34.1,formerly smoked,1
159,34567,Female,81.0,81-90,1,0,Yes,Self-employed,Rural,74.02,25.0,never smoked,1
160,50931,Female,76.0,71-80,0,0,Yes,Private,Urban,57.92,,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,31421,Male,73.0,71-80,0,1,Yes,Govt_job,Rural,219.73,28.6,never smoked,1
218,25904,Female,76.0,71-80,1,1,Yes,Self-employed,Urban,199.86,,smokes,1
190,24905,Female,65.0,61-70,0,0,Yes,Private,Urban,205.77,46.0,formerly smoked,1
191,66071,Male,51.0,51-60,1,0,Yes,Private,Urban,112.16,42.5,formerly smoked,1


#### Gender Data

In [4]:
dataframe.groupby('gender')['gender'].count()

gender
Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [5]:
# Purge "Other" from dataset
dataframe2 = dataframe
dataframe2.drop(dataframe2[dataframe2['gender'] == 'Other'].index, inplace = True)


dataframe2.groupby('gender')['gender'].count()

gender
Female    2994
Male      2115
Name: gender, dtype: int64

#### Confirmed Gender is now either Male or Female, binary value required for modelling.

#### Smoking Data

In [6]:
dataframe2.groupby('smoking_status')['smoking_status'].count()

smoking_status
Unknown            1544
formerly smoked     884
never smoked       1892
smokes              789
Name: smoking_status, dtype: int64

In [7]:
# Create new column that groups smoking into binary classifications

dataframe3 = dataframe2

dataframe3.loc[dataframe3['smoking_status'] == 'smokes', 'Smoke Qualifier'] = 'Y'
dataframe3.loc[dataframe3['smoking_status'] == 'formerly smoked', 'Smoke Qualifier'] = 'Y'
dataframe3.loc[dataframe3['smoking_status'] == 'never smoked', 'Smoke Qualifier'] = 'N'
dataframe3.loc[dataframe3['smoking_status'] == 'Unknown', 'Smoke Qualifier'] = 'N'
dataframe3['Smoke Qualifier'].describe()

count     5109
unique       2
top          N
freq      3436
Name: Smoke Qualifier, dtype: object

In [8]:
#Inspecting new column addition
dataframe3.head(10)

Unnamed: 0,id,gender,age,age_group,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Smoke Qualifier
0,9046,Male,67.0,61-70,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Y
1,51676,Female,61.0,61-70,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,N
2,31112,Male,80.0,71-80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,N
3,60182,Female,49.0,41-50,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Y
4,1665,Female,79.0,71-80,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,N
5,56669,Male,81.0,81-90,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,Y
6,53882,Male,74.0,71-80,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1,N
7,10434,Female,69.0,61-70,0,0,No,Private,Urban,94.39,22.8,never smoked,1,N
8,27419,Female,59.0,51-60,0,0,Yes,Private,Rural,76.15,,Unknown,1,N
9,60491,Female,78.0,71-80,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1,N


In [9]:
dataframe3.groupby('Smoke Qualifier')['Smoke Qualifier'].count()

Smoke Qualifier
N    3436
Y    1673
Name: Smoke Qualifier, dtype: int64

In [10]:
# Confirmed Smoking is now binary value via new data series "Smoke Qualifier"

# Logistic Modeling Method

In [35]:
# See exercise 2-2

In [36]:
dataframe4 = dataframe3[['gender', 'hypertension', 'heart_disease', 'Smoke Qualifier', 'stroke']]
data_binary_encoded = pd.get_dummies(dataframe4)
data_binary_encoded.head()

Unnamed: 0,hypertension,heart_disease,stroke,gender_Female,gender_Male,Smoke Qualifier_N,Smoke Qualifier_Y
0,0,1,1,0,1,0,1
1,0,0,1,1,0,1,0
2,0,1,1,0,1,1,0
3,0,0,1,1,0,0,1
4,1,0,1,1,0,1,0


In [37]:
# Assign X (data) and y (target)
X = data_binary_encoded.drop("stroke", axis=1)
y = data_binary_encoded["stroke"]
print(X.shape, y.shape)

(5109, 6) (5109,)


In [38]:
data_binary_encoded.groupby('stroke')['stroke'].count()

stroke
0    4860
1     249
Name: stroke, dtype: int64

In [39]:
#Splitting into Text & Training Data

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [41]:
#Create Logistic Model

In [42]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [43]:
#Fit using the Training data

In [44]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [45]:
#Validate the model

In [46]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9545810493343775
Testing Data Score: 0.9413145539906104


In [47]:
#Making Predictions

In [50]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:100]}")
print(f"First 10 Actual labels: {y_test[:100].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [49]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
1273,0,0
1274,0,0
1275,0,0
1276,0,0
