# PROJECT_DE GEZONDHEIDZORG - DATA INGESTION & EDA

In [None]:
import os
from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn3
from PIL import Image
import pandas as pd
import sqlite3
import json
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import logging
import math

# Hardening
from pathlib import Path



# DATA INGESTION
# 1.Data Extraction
# 2.Data Transformation
# 3.Data Load

# 1.Data Extraction

In [None]:
# Global configuration: DEBUG-level root logging plus the SQLite source location
logging.basicConfig(level=logging.DEBUG)
dbName = "../Project1/db.sqlite3"
tableName = "rest_api_netlify"


# Collecting the data: pull the whole source table into a dataframe
logging.info("Load transformed data from database into dataframe")

# Log only the file name of the database, not its relative path
db_file_name = Path(dbName).name
logging.info(f"Connect to {db_file_name}")

# The connection is not closed in this cell; commented-out cells further down
# still refer to dbConnection.
dbConnection = sqlite3.connect(dbName)

select_all_query = f"SELECT * FROM {tableName}"
patient_DF = pd.read_sql_query(select_all_query, dbConnection)

# Preview of the first rows, emitted at DEBUG level
first_rows = patient_DF.head()
logging.debug(first_rows)

# Summary of Dataframe

In [None]:
# Print the summary of the raw dataframe (columns, non-null counts, dtypes)
patient_DF.info()

# Dropping rows with null cells (no sorting is performed in this cell)

In [None]:
# Discard every row that has at least one missing value, then print the
# summary again so the remaining row count is visible
patient_DF = patient_DF.dropna()
patient_DF.info()

# Find duplicated records and drop them, if any

In [None]:
#To find duplicate values

#patient_DF = patient_DF[patient_DF.duplicated( keep =False)]

# Clean the records by coercing the errors

In [None]:
# Cleaning
logging.info("Preprocessing : remove rows with missing values")

# Force every column to a numeric dtype; entries that cannot be parsed become NaN
patient_DF1 = patient_DF.apply(pd.to_numeric, errors='coerce')

# Keep only the rows whose numeric values are all >= 0. NaN >= 0 evaluates to
# False, so rows holding coerced (NaN) entries are discarded here as well.
numeric_columns = patient_DF1.select_dtypes(include=[np.number])
row_is_valid = numeric_columns.ge(0).all(axis=1)
patient_DF2 = patient_DF1[row_is_valid]

logging.debug(patient_DF2.head())

# 3.Load Cleaned Data - sql and csv

In [None]:
# save to sql

# patient_DF2.to_sql('theCleanedData', dbConnection, index= False)

In [None]:
# save as csv, then reload the cleaned data from disk

patient_DF2.to_csv('cleaned_data.csv', header=True, index=False)

# NOTE(review): the file is written to the current working directory but read
# back from '../Project1/'. Both paths name the same file only when the
# notebook runs from inside Project1 -- confirm, otherwise a stale copy is loaded.
#
# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after the
# path keyword-only, so a positional ',' raises a TypeError there.
patient_DF2 = pd.read_csv('../Project1/cleaned_data.csv', sep=',')
display(patient_DF2)

# EXPLORATORY DATA ANALYSIS / Data Visualisation

# Adding the Column BMI based on length and mass

In [None]:



# BMI calculation: mass divided by the square of (length / 100)
# NOTE(review): presumably mass is in kg and length in cm -- confirm the units

length_scaled = patient_DF2['length'] / 100
patient_DF2['BMI'] = patient_DF2['mass'] / (length_scaled ** 2)
logging.debug(f"BMI : {patient_DF2['BMI']}")

# Save dataframe as new table

#patient_DF2.to_sql('BMI_table', dbConnection, index =False)

#dbConnection.close()

# Descriptive Statistics

In [None]:
# Descriptive statistics table of the cleaned dataframe
patient_DF2.describe()

# Distribution of Data-Histogram

In [None]:
#patient_DF2 = patient_DF2.drop(['length','mass'], axis =1)
# One histogram per column: 50 bins, green bars with white edges, no grid
histogram_options = dict(
    figsize=(20, 10),
    grid=False,
    bins=50,
    color="green",
    ec="white",
)
patient_DF2_hist = patient_DF2.hist(**histogram_options)

# Heat Map - Correlation between Variables

In [None]:
# Annotated heat map of the pairwise column correlations
fig = plt.figure(figsize=(8, 7))
correlation_matrix = patient_DF2.corr()
sns.heatmap(correlation_matrix, annot=True)

# Box Plot

In [None]:
# Box plot of the 'lifespan' column on a 10x10 inch figure
plt.figure(figsize=(10, 10))
#ax = sns.boxplot(data = patient_DF2)
lifespan_values = patient_DF2['lifespan']
ax = sns.boxplot(data=lifespan_values)

# Outliers

In [None]:
# to find the records that are outliers

# Q1=patient_DF2.quantile(0.25)
# Q3=patient_DF2.quantile(0.75)
# IQR = Q3 - Q1
# print(Q1)
# print(Q3)
# print(IQR)

# patient_DF2 =patient_DF2[~((patient_DF2<(Q1-1.5*IQR)) | (patient_DF2>(Q3+1.5*IQR))).any(axis=1)]
# sns.boxplot(x =patient_DF2['lifespan'])

# Scatter Plot

In [None]:
# BMI against lifespan; the marker size is taken from the 'lifespan' column
scatter_options = {
    'x': 'BMI',
    'y': 'lifespan',
    's': 'lifespan',
    'c': 'darkblue',
    'figsize': (15, 10),
    'alpha': 0.25,
}
patient_DF2.plot.scatter(**scatter_options)

# Pair Plot - bivariate distributions

In [None]:
#patient_DF3 = patient_DF2.drop(['length','mass'], axis =1)

#sns.pairplot(patient_DF3)

# Clustering using Groupby and Dendrogram

In [None]:
# Bucket BMI into the ranges (10, 20], (20, 25], (25, 30] and (30, 40];
# values outside those ranges get no group (NaN)
intervals = [10, 20, 25, 30, 40]
col = patient_DF2['BMI']
patient_DF2['BMI Groups'] = pd.cut(col, intervals)

# Show the resulting group assignment
patient_DF2['BMI Groups']


In [None]:
# Number of rows that fall into each BMI group
patient_DF2.groupby('BMI Groups')['BMI Groups'].count()

In [None]:
# One-hot encode the dataframe using get_dummies' default column selection
patient_DF2_oh = pd.get_dummies(patient_DF2)
 # Display the one-hot encoded dataframe
patient_DF2_oh

In [None]:
import scipy.cluster.hierarchy as shc
import matplotlib.pyplot as plt

# Prepare the figure the dendrogram is drawn into
plt.figure(figsize=(10, 7))
plt.title("Dendrogram")

# The columns at positions 7, 8 and 9 of the one-hot encoded frame feed the
# clustering.
# NOTE(review): positional selection -- presumably this covers 'lifespan' and
# 'BMI', which a later cell plots from selected_data; confirm the column order.
selected_data = patient_DF2_oh.iloc[:, 7:10]

# Hierarchical linkage: Ward method on Euclidean distances
ward_settings = {'method': 'ward', 'metric': "euclidean"}
clusters = shc.linkage(selected_data, **ward_settings)

shc.dendrogram(Z=clusters)
plt.show()

In [None]:
# Same dendrogram again, with a horizontal red line drawn at height 125
plt.figure(figsize=(10, 7))
plt.title(" Dendrogram with line")

linkage_options = {'method': 'ward', 'metric': "euclidean"}
clusters = shc.linkage(selected_data, **linkage_options)

shc.dendrogram(clusters)
plt.axhline(y=125, color='r', linestyle='-')


In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering (Ward linkage, Euclidean metric) into 4 clusters
clustering_model = AgglomerativeClustering(
    n_clusters=4,
    metric='euclidean',
    linkage='ward',
)
clustering_model.fit(selected_data)

# Show the cluster label assigned to each row
clustering_model.labels_

In [None]:
# Scatter of BMI against lifespan, coloured by the cluster label of each row
data_labels = clustering_model.labels_
cluster_plot = sns.scatterplot(
    data=selected_data,
    x='BMI',
    y='lifespan',
    hue=data_labels,
    palette="rainbow",
)
cluster_plot.set_title('Data')

# Pair plot

In [None]:
# Pair plot over the dataframe without the raw 'length' and 'mass' columns
patient_DF3 = patient_DF2.drop(columns=['length', 'mass'])

sns.pairplot(patient_DF3)
display(patient_DF3)

# Summary - OLS Regression

In [None]:
# Predictors for the OLS fit, cast to float, with a constant (intercept) column added
predictor_columns = ['exercise', 'genetic', 'smoking', 'alcohol', 'sugar', 'BMI']
predictors = patient_DF2[predictor_columns].astype(float)
X = sm.add_constant(predictors)

# Response variable as a float Series
y = patient_DF2["lifespan"].astype(float)

# Baseline results - model / fit / summarize, lots of bad Pvalue>0.05
model = sm.OLS(y, X)
results = model.fit()
results.summary()

In [None]:
from scipy import stats

# Pearson correlation between BMI and lifespan: r is the correlation
# coefficient, p the p-value returned alongside it.
r, p = stats.pearsonr(patient_DF2.BMI, patient_DF2.lifespan)

# Print each value with a label so the output is self-explanatory and carries
# no unrelated numbers.
print(f"p-value: {p}")
print(f"Pearson r: {r}")

# Regression- Polynomial and Training

In [None]:
# NOTE(review): preprocessing and svm are not used in this cell; kept in case
# a later cell relies on them.
from sklearn import preprocessing, svm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# loading library
import pickle

# Separating the data into independent (X) and dependent (y) variables.
# Both stay pandas objects; they are passed to scikit-learn as they are.

X = patient_DF2[['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI']]

y = patient_DF2['lifespan']

# Splitting the data into training (75%) and testing (25%) data.
# A fixed random_state makes the split reproducible, so the scores printed
# below and the pickled model are the same on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Linear regression fitted on the training part

regr = LinearRegression()

regr.fit(X_train, y_train)

# Score (R^2) on the held-out test part
print(regr.score(X_test, y_test))

# Predicted values for the test part

y_pred = regr.predict(X_test)
print(y_pred)



print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))

# Persist the fitted model to the file 'model_pkl', opened in binary write mode.
# NOTE: only unpickle this file when it comes from a trusted source --
# unpickling can execute arbitrary code.
with open('model_pkl', 'wb') as model_file:
    pickle.dump(regr, model_file)

# 