In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Directory that holds the input CSV. The default keeps the original hard-coded
# location; set the CASE_PRB_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get('CASE_PRB_DATA_DIR', 'C:')
os.chdir(DATA_DIR)

# Let matplotlib adjust subplot spacing automatically (tight layout) for every figure.
plt.rcParams["figure.autolayout"] = True

def getHighestDate(data):
    """Keep only the last row of every consecutive run of equal ``location`` values.

    A row is dropped when the row directly after it has the same ``location``.
    Presumably the rows are ordered by date within each location, so the row
    that survives is the most recent one -- verify against the input file.

    Rows are compared by position, so the frame does not need a default
    0..n-1 index, and an empty frame yields an empty result instead of raising.

    Args:
        data: DataFrame with a ``location`` column.

    Returns:
        A new DataFrame with the surviving rows. The index is reset and the
        previous index labels are kept in a leading ``index`` column.
    """
    locations = data['location']
    # A row closes its run when the next row holds a different location.
    # shift(-1) puts a missing value after the final row, so that row is always kept.
    isRunEnd = (locations != locations.shift(-1)).to_numpy(dtype=bool)
    return data.loc[isRunEnd].reset_index()

def removeNanCat(data, cats):
    """Drop every row that lacks a finite number in any of the columns ``cats``.

    A value counts as valid when it is an integer or floating-point number
    (Python or NumPy) that is neither NaN nor infinite. Strings, ``None``,
    booleans and any other objects make the row invalid.

    Rows are selected by position, so the frame does not need a default
    0..n-1 index.

    Args:
        data: DataFrame to filter.
        cats: Names of the columns that must contain finite numbers.

    Returns:
        A new DataFrame with the valid rows. The index is reset and the
        previous index labels are kept in a leading ``index`` column.
    """
    def isFiniteNumber(value):
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(value, (bool, np.bool_)):
            return False
        if not isinstance(value, (int, float, np.integer, np.floating)):
            return False
        return bool(np.isfinite(value))

    keep = np.ones(len(data), dtype=bool)
    for cat in cats:
        # A row survives only if it is valid in every requested column.
        keep &= data[cat].map(isFiniteNumber).to_numpy(dtype=bool)
    return data.loc[keep].reset_index()

def makeLog(data, cats):
    """Return a copy of ``data`` with the columns ``cats`` replaced by their natural log.

    Logarithms that come out infinite (for example ``log(0) == -inf``) are
    stored as 0. Negative inputs yield NaN, as ``np.log`` does.

    The input frame is left untouched. Whole columns are assigned at once
    because element-wise chained assignment (``frame[col][i] = value``) is
    unreliable and is silently ignored under pandas copy-on-write.

    Args:
        data: DataFrame holding the columns to transform.
        cats: Names of the numeric columns to log-transform.

    Returns:
        A new DataFrame; the transformed columns are float, all others are unchanged.
    """
    newData = data.copy()
    for cat in cats:
        values = data[cat].to_numpy(dtype=float)
        # log(0) is expected and handled just below, so silence only that warning.
        with np.errstate(divide='ignore'):
            logged = np.log(values)
        logged[np.isinf(logged)] = 0.0
        newData[cat] = logged
    return newData

def plotMultiple(data, dataX, dataY, titles, lablesX, lablesY):
    """Draw one scatter plot per ``(dataX[i], dataY[i])`` column pair, stacked in one figure.

    The figure is 20 wide and ``len(dataX) / 3 * 20`` high. The size is passed
    to this figure only and is no longer written to the global ``rcParams``.
    A single pair is supported, and an empty ``dataX`` draws nothing.

    Args:
        data: DataFrame that holds all plotted columns.
        dataX: Column names for the x axes.
        dataY: Column names for the y axes, aligned with ``dataX``.
        titles: Subplot titles, aligned with ``dataX``.
        lablesX: x-axis labels, aligned with ``dataX``.
        lablesY: y-axis labels, aligned with ``dataX``.
    """
    plotCount = len(dataX)
    if plotCount == 0:
        return
    # squeeze=False always returns a 2-D array of Axes, so indexing also works
    # when only one subplot is requested.
    fig, axs = plt.subplots(plotCount, squeeze=False, figsize=(20, plotCount / 3 * 20))
    axs = axs.ravel()
    for i, x in enumerate(dataX):
        ax = axs[i]
        ax.set_title(titles[i])
        ax.scatter(data[x], data[dataY[i]])
        ax.set(xlabel=lablesX[i], ylabel=lablesY[i])
    plt.show()

def plotLinearRegression(data, dataX, dataY, titles, lablesX, lablesY, random_state=None):
    """Fit and plot a one-variable linear regression for each ``(dataX[i], dataY[i])`` pair.

    For every pair the rows are split 70/30 into train and test sets, a
    ``LinearRegression`` is fitted on the training part, and the subplot shows
    all points together with the fitted line. The mean squared error and the
    R2 score (as a percentage) on the test part are printed.

    The figure is 20 wide and ``len(dataX) / 3 * 20`` high. The size is passed
    to this figure only and is no longer written to the global ``rcParams``.
    A single pair is supported, and an empty ``dataX`` draws nothing.

    Args:
        data: DataFrame that holds all used columns.
        dataX: Column names of the explanatory variables.
        dataY: Column names of the target variables, aligned with ``dataX``.
        titles: Subplot titles, aligned with ``dataX``.
        lablesX: x-axis labels, aligned with ``dataX``.
        lablesY: y-axis labels, aligned with ``dataX``.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split and the printed scores are reproducible. The default
            ``None`` keeps the previous unseeded behaviour.
    """
    plotCount = len(dataX)
    if plotCount == 0:
        return
    # squeeze=False always returns a 2-D array of Axes, so indexing also works
    # when only one subplot is requested.
    fig, axs = plt.subplots(plotCount, squeeze=False, figsize=(20, plotCount / 3 * 20))
    axs = axs.ravel()
    for i, x in enumerate(dataX):
        ax = axs[i]
        y = dataY[i]
        x_train, x_test, y_train, y_test = train_test_split(
            data[x], data[y], test_size=0.3, random_state=random_state)
        model = linear_model.LinearRegression()
        # scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
        model.fit(x_train.values.reshape(-1, 1), y_train)
        ax.scatter(data[x], data[y])
        ax.plot(data[x], model.predict(data[x].values.reshape(-1, 1)))
        # Predict on the test part once and reuse it for both metrics.
        y_pred = model.predict(x_test.values.reshape(-1, 1))
        print("Mean squared error: %f" % mean_squared_error(y_test, y_pred))
        print("R2 score percentage: %f" % (r2_score(y_test, y_pred) * 100))
        ax.set_title(titles[i])
        ax.set(xlabel=lablesX[i], ylabel=lablesY[i])
    plt.show()

In [None]:
# Columns that are filtered for usable values and then log-transformed.
analysisColumns = [
    'life_expectancy',
    'human_development_index',
    'total_cases_per_million',
    'total_deaths_per_million',
    'male_smokers',
    'cardiovasc_death_rate',
]

# Load the CSV, reduce it with getHighestDate, drop rows that removeNanCat
# rejects for the analysis columns, then log-transform those columns.
data = (
    pd.read_csv('Case_PRB.csv')
    .pipe(getHighestDate)
    .pipe(removeNanCat, analysisColumns)
    .pipe(makeLog, analysisColumns)
)

# One scatter plot per x/y column pair.
plotMultiple(
    data,
    dataX=['life_expectancy', 'total_cases_per_million', 'male_smokers'],
    dataY=['human_development_index', 'total_deaths_per_million', 'cardiovasc_death_rate'],
    titles=['Figuur 1', 'Figuur 2', 'Figuur 3'],
    lablesX=['Life expectancy', 'Total cases per million', 'Male smokers'],
    lablesY=['Human development index', 'Total deaths per million', 'Cardiovascular death rate'],
)

In [None]:
# Fit and plot a linear regression for each x/y column pair.
# The third pair regresses total_deaths_per_million on male_smokers, so its
# y-axis label names that column (it previously read "Cardiovascular death rate").
plotLinearRegression(
    data,
    ['life_expectancy', 'total_cases_per_million', 'male_smokers'],
    ['human_development_index', 'total_deaths_per_million', 'total_deaths_per_million'],
    ['Figuur 1', 'Figuur 2', 'Figuur 3'],
    ['Life expectancy', 'Total cases per million', 'Male smokers'],
    ['Human development index', 'Total deaths per million', 'Total deaths per million'],
)