In [1]:
# Initial imports
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, MetaData, Table
import psycopg2
from config import user, db_password

  """)


# Postgres Engine Database Configuration

In [2]:
# Build the PostgreSQL connection URL from the credentials imported from config.
# NOTE(review): user and db_password are interpolated as-is; if either contains
# URL-reserved characters (e.g. "@", "/", ":") it would need percent-encoding - confirm.
db_string = f"postgresql://{user}:{db_password}@myprogressdb.cpb2tnnn1lyz.us-east-2.rds.amazonaws.com:5432/Medical_Stroke_DB"
# Engine that the following cells use to reflect and query the database.
engine = create_engine(db_string)

In [3]:
# Reflect the existing database schema into automapped ORM classes.
Base = automap_base()
# Reflect the tables through the engine created above.
# NOTE(review): the `reflect=True` form is reported as deprecated in newer
# SQLAlchemy releases - confirm against the installed version before upgrading.
Base.prepare(engine, reflect=True)
# Show the names of all classes that automap generated.
Base.classes.keys()

['stroke_data']

In [4]:
# Reflect the stroke_data table by itself so its column names can be inspected.
metadata = MetaData()
table = Table(
    "stroke_data",
    metadata,
    autoload=True,          # load the column definitions from the database
    autoload_with=engine,   # ...using the engine created earlier
)
# Display the reflected column names.
table.columns.keys()

['index',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [5]:
# Load the whole stroke_data table into a DataFrame, using the table's "index"
# column as the DataFrame index.
# pd.read_sql replaces the previous engine.execute(...) call (a legacy API that
# newer SQLAlchemy releases no longer provide) and the in-place set_index, so
# re-running this cell always rebuilds df from the database in a single step.
df = pd.read_sql("SELECT * FROM stroke_data;", con=engine, index_col="index")

In [6]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [7]:
# Class distribution of the target column before any balancing.
df["stroke"].value_counts()

0    4622
1     208
Name: stroke, dtype: int64

# Balancing the data

In [8]:
# Balance the classes by undersampling: keep every stroke record and add an
# equally sized random sample of non-stroke records.
import random

# Fixed seed so the sampled subset (and every file exported from it) is the
# same on each Restart & Run All.
RANDOM_SEED = 42

stroke_index = list(df[df["stroke"] == 1].index)
y_non_stroke = list(df[df["stroke"] == 0].index)

# Derive the sample size from the data instead of hard-coding it, and cap it at
# the size of the non-stroke pool so random.sample cannot raise ValueError.
n_non_stroke = min(len(stroke_index), len(y_non_stroke))
index_list = random.Random(RANDOM_SEED).sample(y_non_stroke, n_non_stroke) + stroke_index

# Keep only the selected rows (sampled non-stroke rows first, then stroke rows).
df = df.filter(items=index_list, axis=0)

In [9]:
# Class distribution of the target column after the resampling step above.
df["stroke"].value_counts()

0    300
1    208
Name: stroke, dtype: int64

# Splitting Dataset: Training and Testing

In [10]:
# Use sklearn to split the dataset
from sklearn.model_selection import train_test_split

# Separate the independent variables (X) from the dependent variable (y).
X = df.drop(columns="stroke")
y = df["stroke"]

# Stratified 80/20 split. random_state is fixed so the same rows land in the
# train and test sets on every run; without it the exported files change each time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.80, random_state=42
)

# Exporting Test and Train dataset without scaling

In [11]:
# Settings that are interpolated into every exported file name below.
import os

directory = "Resources"     # output folder for the CSV files
sample_name = "actual"      # tag identifying the sampling variant
scale_data = "unscaled"     # tag identifying whether the features were scaled

# Create the output folder up front; DataFrame.to_csv does not create missing
# directories, so the export cells would fail if it did not exist.
os.makedirs(directory, exist_ok=True)

In [12]:
# Export the unscaled training features (X_train) to CSV.
x_train_actual_balanced_unscale = pd.DataFrame(X_train, columns=X.columns)

# Destination path, assembled from the export settings defined earlier.
output_file_path = f"{directory}/X_train_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
x_train_actual_balanced_unscale.to_csv(path_or_buf=output_file_path, index=False)

In [13]:
# Export the training labels (y_train) to CSV, tagged with the current scale_data value.
y_train_actual_balanced_unscale = y_train.to_frame()

# Destination path, assembled from the export settings defined earlier.
output_file_path = f"{directory}/y_train_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
y_train_actual_balanced_unscale.to_csv(path_or_buf=output_file_path, index=False)

In [14]:
# Export the unscaled testing features (X_test) to CSV.
x_test_actual_balanced_unscale = pd.DataFrame(X_test, columns=X.columns)

# NOTE(review): the file name ends in "_train_dataset" even though this is the
# test split; kept unchanged so existing readers of the file keep working.
output_file_path = f"{directory}/X_test_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
x_test_actual_balanced_unscale.to_csv(path_or_buf=output_file_path, index=False)

In [15]:
# Export the testing labels (y_test) to CSV, tagged with the current scale_data value.
y_test_actual_balanced_unscale = y_test.to_frame()

# NOTE(review): the file name ends in "_train_dataset" even though this is the
# test split; kept unchanged so existing readers of the file keep working.
output_file_path = f"{directory}/y_test_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
y_test_actual_balanced_unscale.to_csv(path_or_buf=output_file_path, index=False)

# Standardizing Training and Testing Datasets

In [16]:
# Standardize the features with StandardScaler.
# The scaler is fitted on the training features only and then applied to both
# splits, so the test split is transformed with the training statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)



In [17]:
# Switch the file-name tag so the exports that follow are labelled as scaled data.
scale_data = "scaled"

In [18]:
# Export the scaled training features to CSV.
# Wrap the scaled values in a DataFrame labelled with the X_train column names.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Destination path, assembled from the export settings defined earlier.
output_file_path = f"{directory}/X_train_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
X_train_scaled.to_csv(path_or_buf=output_file_path, index=False)

In [19]:
# Export the training labels (y_train) alongside the scaled features.
y_train_actual_balanced_scale = y_train.to_frame()

# Destination path, assembled from the export settings defined earlier.
output_file_path = f"{directory}/y_train_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
y_train_actual_balanced_scale.to_csv(path_or_buf=output_file_path, index=False)

# Class counts in the training labels.
y_train_actual_balanced_scale.value_counts()

stroke
0         240
1         166
dtype: int64

In [20]:
# Export the scaled testing features to CSV.
# Wrap the scaled values in a DataFrame labelled with the X_test column names.
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# NOTE(review): the file name ends in "_train_dataset" even though this is the
# test split; kept unchanged so existing readers of the file keep working.
output_file_path = f"{directory}/X_test_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
X_test_scaled.to_csv(path_or_buf=output_file_path, index=False)

In [21]:
# Export the testing labels (y_test) alongside the scaled features.
y_test_actual_balanced_scale = y_test.to_frame()

# NOTE(review): the file name ends in "_train_dataset" even though this is the
# test split; kept unchanged so existing readers of the file keep working.
output_file_path = f"{directory}/y_test_{sample_name}_balanced_{scale_data}_train_dataset.csv"

# index=False: the row index is not written to the file.
y_test_actual_balanced_scale.to_csv(path_or_buf=output_file_path, index=False)

# Class counts in the testing labels.
y_test_actual_balanced_scale.value_counts()

stroke
0         60
1         42
dtype: int64