# PROJECT_DE GEZONDHEIDZORG

In [1]:
import os
from matplotlib import pyplot as plt
import pickle
from PIL import Image
import pandas as pd
import sqlite3
import json
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import logging
import math

# Hardening
from pathlib import Path



# DATA INGESTION
# 1.Data Extraction
# 2.Data Transformation
# 3.Data Load

# 1.Data Extraction

In [2]:
# Global configuration
logging.basicConfig(level=logging.DEBUG)
dbName = "rest_server/medisch_centrum_randstad/db.sqlite3"
tableName = "rest_api_netlify"


# Collecting the data
logging.info("Load transformed data from database into dataframe")

# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist; the failure would then only surface as a confusing
# "no such table" error. Fail early with a clear message instead.
if not Path(dbName).is_file():
    raise FileNotFoundError(f"SQLite database not found: {dbName}")

logging.info(f"Connect to {Path(dbName).name}")
# The connection is deliberately left open here; it is closed in the BMI cell.
dbConnection = sqlite3.connect(dbName)
# tableName is a notebook constant (not user input), so interpolating it into
# the query is acceptable; identifiers cannot be bound as SQL parameters.
patient_DF = pd.read_sql_query(f"SELECT * FROM {tableName}", dbConnection)
logging.debug(patient_DF.head())

INFO:root:Load transformed data from database into dataframe
INFO:root:Connect to db.sqlite3
DEBUG:root:   id  genetic  length   mass  exercise  smoking  alcohol  lifespan  sugar
0   1     73.9   185.0   99.7       0.9      0.0      2.4      73.1    6.9
1   2     86.0   172.0  105.4       1.8      8.1      0.4      85.0    4.2
2   3     83.3   176.0  111.4       1.1      0.8      4.6      81.6    7.5
3   4     82.8   164.0  111.4       4.7     11.8      1.0      81.0    2.9
4   5     78.7   178.0   71.6       1.5      8.3      4.9      75.0    5.5


# Summary of Dataframe

In [3]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
patient_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4096 entries, 0 to 4095
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        4096 non-null   int64  
 1   genetic   4095 non-null   float64
 2   length    4093 non-null   float64
 3   mass      4094 non-null   float64
 4   exercise  4095 non-null   float64
 5   smoking   4096 non-null   float64
 6   alcohol   4094 non-null   float64
 7   lifespan  4094 non-null   float64
 8   sugar     4095 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 288.1 KB


# Dropping the rows that contain null cells

In [4]:
# Discard every row that has at least one missing value, then re-inspect the frame.
patient_DF = patient_DF.dropna(axis='index', how='any')
patient_DF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4084 entries, 0 to 4095
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        4084 non-null   int64  
 1   genetic   4084 non-null   float64
 2   length    4084 non-null   float64
 3   mass      4084 non-null   float64
 4   exercise  4084 non-null   float64
 5   smoking   4084 non-null   float64
 6   alcohol   4084 non-null   float64
 7   lifespan  4084 non-null   float64
 8   sugar     4084 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 319.1 KB


# Find duplicated records and drop if any

In [5]:
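# To inspect duplicate rows. Note: the expression below KEEPS only the
# duplicated rows; it does not drop them. Use patient_DF.drop_duplicates()
# to actually remove duplicates.

#patient_DF = patient_DF[patient_DF.duplicated( keep =False)]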

# Clean the records by coercing the errors

In [6]:
# Cleaning
logging.info("Preprocessing : coerce to numeric, remove rows with missing or negative values")
# Any value that cannot be parsed as a number becomes NaN.
patient_DF1 = patient_DF.apply(pd.to_numeric, errors='coerce')
# Keep only rows whose numeric values are all >= 0. NaN compares as False, so
# rows holding coerced/missing values are dropped by the same mask.
valid_rows = patient_DF1.select_dtypes(include=[np.number]).ge(0).all(axis=1)
# .copy() yields an independent frame, so adding columns to patient_DF2 later
# does not raise pandas' SettingWithCopyWarning.
patient_DF2 = patient_DF1[valid_rows].copy()
logging.debug(patient_DF2.head())

INFO:root:Preprocessing : remove rows with missing values
DEBUG:root:   id  genetic  length   mass  exercise  smoking  alcohol  lifespan  sugar
0   1     73.9   185.0   99.7       0.9      0.0      2.4      73.1    6.9
1   2     86.0   172.0  105.4       1.8      8.1      0.4      85.0    4.2
2   3     83.3   176.0  111.4       1.1      0.8      4.6      81.6    7.5
3   4     82.8   164.0  111.4       4.7     11.8      1.0      81.0    2.9
4   5     78.7   178.0   71.6       1.5      8.3      4.9      75.0    5.5


# 3.Load Cleaned Data - sql and csv

In [7]:
# save to sql

# patient_DF2.to_sql('theCleanedData', dbConnection, index= False)

In [12]:
# save as csv

# to_csv() does not create missing folders, so make sure the target directory
# exists before writing (a fresh checkout has no 'data/' folder).
cleaned_csv_path = Path('data/cleaned_data.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
patient_DF2.to_csv(cleaned_csv_path, header=True, index=False)
#patient_DF2 = pd.read_csv('../Project/csv/cleaned_data.csv',',')
display(patient_DF2)

Unnamed: 0,id,genetic,length,mass,exercise,smoking,alcohol,lifespan,sugar
0,1,73.9,185.0,99.7,0.9,0.0,2.4,73.1,6.9
1,2,86.0,172.0,105.4,1.8,8.1,0.4,85.0,4.2
2,3,83.3,176.0,111.4,1.1,0.8,4.6,81.6,7.5
3,4,82.8,164.0,111.4,4.7,11.8,1.0,81.0,2.9
4,5,78.7,178.0,71.6,1.5,8.3,4.9,75.0,5.5
...,...,...,...,...,...,...,...,...,...
4079,4092,80.3,176.0,115.9,3.7,0.1,5.6,80.7,5.8
4080,4093,75.3,162.0,129.6,3.6,6.2,5.6,71.1,7.6
4081,4094,93.1,189.0,115.3,2.7,9.4,4.9,90.0,6.6
4082,4095,77.9,170.0,90.8,2.3,13.2,0.8,75.9,6.2


# EXPLORATORY DATA ANALYSIS / Data Visualisation

# Adding the Column BMI based on length and mass

In [13]:



# BMI calculation: mass divided by the square of (length / 100).
# NOTE(review): presumably length is in cm and mass in kg - confirm with the data owner.
height = patient_DF2['length'] / 100
# assign() returns a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
patient_DF2 = patient_DF2.assign(BMI=patient_DF2['mass'] / height ** 2)
logging.debug(f"BMI : {patient_DF2['BMI']}")

# Make sure the output folder exists; to_csv() does not create it.
bmi_csv_path = Path('data/medisch_centrum_randstad_BMI.csv')
bmi_csv_path.parent.mkdir(parents=True, exist_ok=True)
patient_DF2.to_csv(bmi_csv_path, header=True, index=False)

# Save dataframe as new table

#patient_DF2.to_sql('BMI_table', dbConnection, index =False)

# All database work is done; release the SQLite connection.
dbConnection.close()

DEBUG:root:BMI : 0       29.130752
1       35.627366
2       35.963326
3       41.418798
4       22.598157
          ...    
4079    37.416064
4080    49.382716
4081    32.277932
4082    31.418685
4083    36.555002
Name: BMI, Length: 4084, dtype: float64


# Descriptive Statistics

In [None]:
# Summary statistics for the cleaned frame (including the derived BMI column).
patient_DF2.describe()

# Distribution of Data-Histogram

In [None]:
#patient_DF2 = patient_DF2.drop(['length','mass'], axis =1)
#patient_DF2_hist = patient_DF2.hist( figsize=(20,10), grid = False, bins = 50, color = "green", ec = "white")

# Heat Map - Correlation between Variables

In [None]:
fig = plt.figure(figsize=(8,7))
# Correlate numeric columns only: once the categorical 'BMI Groups' column has
# been added (e.g. when this cell is re-run later), a plain .corr() would fail
# on recent pandas versions.
numeric_columns = patient_DF2.select_dtypes(include=[np.number])
sns.heatmap(numeric_columns.corr(), annot = True)

# Box Plot

In [None]:
# Box plot of the lifespan column only (pass the whole frame to sns.boxplot
# instead to compare every column side by side).
lifespan_values = patient_DF2['lifespan']
plt.figure(figsize=(10, 10))
ax = sns.boxplot(data=lifespan_values)

# Outliers

In [None]:
# to find the records that are outliers (IQR rule)

# Q1=patient_DF2.quantile(0.25)
# Q3=patient_DF2.quantile(0.75)
# IQR = Q3 - Q1
# print(Q1)
# print(Q3)
# print(IQR)

# patient_DF2 =patient_DF2[~((patient_DF2<(Q1-1.5*IQR)) | (patient_DF2>(Q3+1.5*IQR))).any(axis=1)]
# sns.boxplot(x =patient_DF2['lifespan'])

# Scatter Plot

In [None]:
# BMI against lifespan; marker size is taken from the 'lifespan' column.
patient_DF2.plot(
    kind='scatter',
    x='BMI',
    y='lifespan',
    s='lifespan',
    c='darkblue',
    alpha=0.25,
    figsize=(15, 10),
)

# Pair Plot - bivariate distributions

In [None]:
#patient_DF3 = patient_DF2.drop(['length','mass'], axis =1)

#sns.pairplot(patient_DF3)

# Clustering using Groupby and Dendrogram

In [None]:
# Bin edges for the BMI brackets. The final open-ended bin (40, inf] is needed
# because the data contains BMI values above 40; with a closed upper edge of 40
# those rows silently received NaN as their group.
intervals = [10, 20, 25, 30, 40, np.inf]
col = patient_DF2['BMI']
# assign() builds a new frame rather than writing into a filtered slice
# (avoids SettingWithCopyWarning); the dict form is needed because the column
# name contains a space.
patient_DF2 = patient_DF2.assign(**{'BMI Groups': pd.cut(x=col, bins=intervals)})
patient_DF2['BMI Groups']


In [None]:
# Count the non-null 'BMI Groups' entries within each BMI bracket.
patient_DF2.groupby('BMI Groups')['BMI Groups'].count()

In [None]:
# One-hot encode the frame with pandas' get_dummies, then show the result.
patient_DF2_oh = pd.get_dummies(data=patient_DF2)
patient_DF2_oh

In [None]:
import scipy.cluster.hierarchy as shc


plt.figure(figsize=(10, 7))
plt.title("Dendrogram")

# Clustering features, selected by name. These are the columns the former
# positional slice iloc[:, 7:10] resolved to; naming them keeps the selection
# correct if the column order of the frame ever changes.
selected_data = patient_DF2_oh[['lifespan', 'sugar', 'BMI']]
# Ward linkage on Euclidean distances builds the hierarchical merge tree.
clusters = shc.linkage(selected_data,
            method='ward',
            metric="euclidean")
shc.dendrogram(Z=clusters)
plt.show()

In [None]:
# Height at which the tree is cut, drawn as a horizontal red line.
CUT_HEIGHT = 125

plt.figure(figsize=(10, 7))
plt.title(" Dendrogram with line")
# Reuse the Ward linkage computed in the previous cell: it was built from the
# same selected_data with the same method/metric, so recomputing it only
# repeats an expensive pairwise computation.
shc.dendrogram(clusters)
plt.axhline(y = CUT_HEIGHT, color = 'r', linestyle = '-')
plt.show()


In [None]:
from sklearn.cluster import AgglomerativeClustering

# Four-cluster Ward/Euclidean agglomerative model fitted on the selected
# features; fit() returns the estimator itself, so the call can be chained.
clustering_model = AgglomerativeClustering(
    n_clusters=4,
    metric='euclidean',
    linkage='ward',
).fit(selected_data)
clustering_model.labels_

In [None]:
# Colour each point by the cluster label assigned by the fitted model.
data_labels = clustering_model.labels_
cluster_axes = sns.scatterplot(
    data=selected_data,
    x='BMI',
    y='lifespan',
    hue=data_labels,
    palette="rainbow",
)
cluster_axes.set_title('Data')

# Pair plot

In [None]:
# Drop the length and mass columns before plotting pairwise relationships.
patient_DF3 = patient_DF2.drop(columns=['length', 'mass'])

sns.pairplot(data=patient_DF3)
display(patient_DF3)

# Summary - OLS Regression

In [None]:
# Design matrix: the six predictors as floats, plus an intercept column.
predictor_columns = ['exercise', 'genetic', 'smoking', 'alcohol', 'sugar', 'BMI']
X = sm.add_constant(patient_DF2[predictor_columns].astype(float))

# Response variable (a Series).
y = patient_DF2["lifespan"].astype(float)

# Baseline ordinary-least-squares fit; the summary table is the cell output.
model = sm.OLS(y, X)
results = model.fit()
results.summary()

In [None]:
from scipy import stats

# Pearson correlation between BMI and lifespan and its two-sided p-value.
r, p = stats.pearsonr(patient_DF2.BMI, patient_DF2.lifespan)
# The previous print(p, 20) / print(r, 40) calls printed the stray literals 20
# and 40 next to the values; print labelled, formatted numbers instead.
print(f"p-value: {p:.3g}")
print(f"Pearson r (BMI vs lifespan): {r:.4f}")

# Regression- Multiple linear (Model and Training)

In [None]:
from sklearn import preprocessing, svm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# loading library
import pickle

# Fixed seed so the train/test split, and therefore every score printed below,
# is reproducible across Restart & Run All.
RANDOM_STATE = 42

# Separating the data into independent (X) and dependent (y) variables.
feature_columns = ['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI']

X = patient_DF2[feature_columns]

y = patient_DF2['lifespan']

# Splitting the data into training (75%) and testing (25%) data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = RANDOM_STATE
)

regr = LinearRegression()

regr.fit(X_train, y_train)

# R^2 of the fitted model on the held-out test set
print(regr.score(X_test, y_test))

# Predictions for the held-out test set
y_pred = regr.predict(X_test)
print(y_pred)

# Prediction for one hand-written patient. The model was fitted on a DataFrame,
# so the sample is passed as a DataFrame with the same column names; a bare
# list makes scikit-learn warn that feature names are missing.
sample_patient = pd.DataFrame([[80, 4, 2, 2, 2, 25]], columns=feature_columns)
y_predict = regr.predict(sample_patient)
print(y_predict)

print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))

# Persist the fitted model to the file 'model_pkl' (binary write mode).
# NOTE: only unpickle this file from trusted sources - pickle.load can execute
# arbitrary code.
with open('model_pkl', 'wb') as files:
    pickle.dump(regr, files)

# 3D visualisation - lifespan as a function of genetic and smoking

In [None]:
# Kept on purpose: older matplotlib releases need this import to register
# projection='3d'.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
# Imported here so the cell does not depend on the undefined name
# `linear_model` (the original code raised NameError).
from sklearn.linear_model import LinearRegression

# Prepare data
# Read the BMI-enriched CSV that this notebook writes in the BMI cell. The
# previous path ('../Project/csv/data_BMI.csv') is not produced anywhere in the
# notebook, and the positional separator argument is rejected by pandas >= 2.0.
patient_DF4 = pd.read_csv('data/medisch_centrum_randstad_BMI.csv')

# Two predictors (genetic, smoking) as an (n, 2) array; target is lifespan.
X1 = patient_DF4[['genetic', 'smoking']].values.reshape(-1,2)
Y1 = patient_DF4['lifespan']

# Observed coordinates for the 3D scatter
x = X1[:, 0]
y = X1[:, 1]

z = Y1

# Prediction grid spanning the observed range of each predictor, so the fitted
# plane is drawn over the data rather than over a hard-coded window.
xx_pred = np.linspace(x.min(), x.max(), 30)
yy_pred = np.linspace(y.min(), y.max(), 30)

# 30 x 30 grid of (genetic, smoking) pairs, flattened to rows for predict()
xx_pred, yy_pred = np.meshgrid(xx_pred, yy_pred)
model_viz = np.array([xx_pred.flatten(), yy_pred.flatten()]).T

# Fit a two-predictor linear regression and evaluate it on the grid
ols = LinearRegression()
model = ols.fit(X1, Y1)
predicted = model.predict(model_viz)

# Evaluate the model by its R^2 score on the data it was fitted on
r2 = model.score(X1, Y1)

# Plot model visualization
plt.style.use('classic')

fig = plt.figure(figsize=(40, 20))

# Three panels showing the same data from different viewing angles
ax1 = fig.add_subplot(131, projection='3d')
ax2 = fig.add_subplot(132, projection='3d')
ax3 = fig.add_subplot(133, projection='3d')

axes = [ax1, ax2, ax3]

for ax in axes:
    # Green dots: observed patients; blue hollow markers: fitted plane
    ax.plot(x, y, z, color='green', zorder=15, linestyle='none', marker='o', alpha=0.25)
    ax.scatter(xx_pred.flatten(),yy_pred.flatten(), predicted, facecolor=(0,0,0,0), s= 20, edgecolor='blue')
    ax.set_xlabel('genetic', fontsize=12)
    ax.set_ylabel('smoking', fontsize=12)
    ax.set_zlabel('lifespan', fontsize=12)
    ax.locator_params(nbins=4, axis='x')
    # Was axis='x' a second time, which overrode the line above and left the
    # y axis untouched.
    ax.locator_params(nbins=5, axis='y')

ax1.view_init(elev=25, azim=-60)
ax2.view_init(elev=15, azim=15)
ax3.view_init(elev=25, azim=60)

fig.suptitle('Multi-Linear Regression Model Visualization ($R^2 = %.2f$)' % r2, fontsize=15, color='b')

fig.tight_layout()