<H1><B>10 PIPELINE</B></H1>

In [32]:
# Imports
import os
import csv
import json

In [33]:
#!/usr/bin/env python

# Imports
import logging
import pandas as pd
import sqlite3

import numpy as np
import scipy as cp

# Hardening
from pathlib import Path

# Global configuration
logging.basicConfig(level=logging.DEBUG)
dbName = 'rest_server/medisch_centrum_randstad/db.sqlite3'
tableName = 'rest_api_netlify'

# Collecting the data: read the whole source table into a DataFrame.
# NOTE: dbConnection is intentionally left open here; a later cell reuses it
# to write the transformed table and then closes it.
logging.info('Load transformed data from database into dataframe')
logging.info(f"Connect to {Path(dbName).name}")
dbConnection = sqlite3.connect(dbName)
patient_DF = pd.read_sql_query(f"SELECT * FROM {tableName}", dbConnection)
logging.debug(patient_DF.head())

# Step 1: drop rows that already contain missing values.
logging.info('Preprocessing : remove rows with missing values')
dfCleanFromDB = patient_DF.dropna()

# Step 2: force every column to a numeric dtype. Values that cannot be parsed
# (wrong data types) become NaN instead of raising.
dfCleanFromDB2 = dfCleanFromDB.apply(pd.to_numeric, errors='coerce')

# Step 3: keep only rows where every value is a non-negative number.
# The mask is built from the *coerced* frame so that columns stored as text
# are validated too. NaN >= 0 evaluates to False, so rows whose values could
# not be converted in step 2 are removed here as well.
# NOTE(review): this assumes every column of the table is meant to be numeric
# (as in the sample output); a genuine text column would drop every row.
dfCleanFromDB3 = dfCleanFromDB2[dfCleanFromDB2.ge(0).all(axis=1)]

logging.debug(dfCleanFromDB3.head())

# Make sure the output directory exists before writing the cleaned data.
Path('data').mkdir(parents=True, exist_ok=True)
dfCleanFromDB3.to_csv('data/data_cleaned.csv', header=True, index=False)

# Outliers stay in
# Transforming in next step using CSV

# print(dfCleanFromDB3.info())
# dfCleanFromDB3.shape

INFO:root:Load transformed data from database into dataframe
INFO:root:Connect to db.sqlite3
DEBUG:root:   id  genetic  length   mass  exercise  smoking  alcohol  lifespan  sugar
0   1     73.9   185.0   99.7       0.9      0.0      2.4      73.1    6.9
1   2     86.0   172.0  105.4       1.8      8.1      0.4      85.0    4.2
2   3     83.3   176.0  111.4       1.1      0.8      4.6      81.6    7.5
3   4     82.8   164.0  111.4       4.7     11.8      1.0      81.0    2.9
4   5     78.7   178.0   71.6       1.5      8.3      4.9      75.0    5.5
INFO:root:Preprocessing : remove rows with missing values
DEBUG:root:   id  genetic  length   mass  exercise  smoking  alcohol  lifespan  sugar
0   1     73.9   185.0   99.7       0.9      0.0      2.4      73.1    6.9
1   2     86.0   172.0  105.4       1.8      8.1      0.4      85.0    4.2
2   3     83.3   176.0  111.4       1.1      0.8      4.6      81.6    7.5
3   4     82.8   164.0  111.4       4.7     11.8      1.0      81.0    2.9
4 

In [34]:
# Adding BMI
# Reload the cleaned data written by the preprocessing step.
patient_DF = pd.read_csv('data/data_cleaned.csv')

# BMI divides by length squared, so a length of 0 would yield an infinite BMI
# that poisons the saved data and any model fitted on it. Drop such rows and
# report how many were removed.
hasValidLength = patient_DF["length"] > 0
if not hasValidLength.all():
    logging.warning(
        'Dropping %d row(s) with non-positive length before computing BMI',
        int((~hasValidLength).sum()),
    )
    patient_DF = patient_DF[hasValidLength].copy()

# BMI = mass / length^2; the factor 10000 rescales the squared length.
# NOTE(review): presumably mass is in kg and length in cm - confirm units.
patient_DF["BMI"] = (patient_DF["mass"]/patient_DF["length"]**2)*10000

try:
    # Saving transformed DataFrame including BMI
    patient_DF.to_csv('data/medisch_centrum_randstad_BMI.csv', header=True, index=False)
    # Storing DataFrame as a table in the SQLite database (replaces any existing table)
    patient_DF.to_sql('medisch_centrum_randstad_BMI', dbConnection, if_exists='replace', index=False)
finally:
    # Always release the connection opened in the loading cell, even if a
    # write above fails.
    dbConnection.close()

In [35]:
# Imports
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [36]:
# Regression: fit a linear model that predicts lifespan from lifestyle features.
# Fixed seed so the train/test split (and therefore the coefficients and
# error metrics below) is reproducible across kernel restarts.
RANDOM_STATE = 42

patient_DF2 = pd.read_csv('data/medisch_centrum_randstad_BMI.csv')
X = patient_DF2[['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI']]
y = patient_DF2['lifespan']

# Hold out 25% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

regr = LinearRegression()
regr.fit(X_train, y_train)

# Predict lifespan for the held-out rows
y_pred = regr.predict(X_test)

# sq is the root mean squared error (square root of the MSE);
# mae is the mean absolute error rounded to two decimals.
sq = (mean_squared_error(y_test, y_pred))**0.5
mae = round((mean_absolute_error(y_test, y_pred)),2)

# Keep the fitted parameters for the prognosis formula.
# regCoef follows the column order of X:
# genetic, exercise, smoking, alcohol, sugar, BMI.
regCoef = regr.coef_
regInter = regr.intercept_

In [37]:
# This input is limited to 'genetic', BMI and the daily intake of 'exercise', 'smoking', 'alcohol' and 'sugar'.
# gen = float(input("Please input your genetic: "))
# bmi = float(input("Please input your BMI: "))
# exer = float(input("Please input your daily exercise: "))
# smok = float(input("Please input your daily smoking habit: "))
# alco = float(input("Please input your daily alcohol intake: "))
# sug = float(input("Please input your daily sugar intake: "))

# One-line lifespan prognosis from the fitted coefficients (disabled until the end-user interface is available)
# lifespan_prog = (regCoef[0]*gen)+(regCoef[1]*exer)+(regCoef[2]*smok)+(regCoef[3]*alco)+(regCoef[4]*sug)+(regCoef[5]*bmi)+regInter
# print(f'Your Lifespan Prognosis: ',lifespan_prog)