# 0. Import packages

In [1]:
import numpy as np #for vectorized numerical calculations
import pandas as pd #for data handling
import seaborn as sns #for data visualization 
import matplotlib.pyplot as plt #for data visualization
from sklearn import metrics #for metrics to evaluate the models
import torch #for tensor, gradient and neural network calculations

# 1. Get Data

In [2]:
#available at my drive (link is public) --> https://drive.google.com/file/d/1XFffn1YC91wvoDvnYjs3NIRSFF5zFPiK/view?usp=sharing
!gdown --id 1XFffn1YC91wvoDvnYjs3NIRSFF5zFPiK

Downloading...
From: https://drive.google.com/uc?id=1XFffn1YC91wvoDvnYjs3NIRSFF5zFPiK
To: D:\Praktisi Mengajar\3 Universitas Muhammadiyah Semarang\1 Courses\Life Expectancy Data.csv

  0%|          | 0.00/333k [00:00<?, ?B/s]
100%|##########| 333k/333k [00:00<00:00, 787kB/s]
100%|##########| 333k/333k [00:00<00:00, 787kB/s]


Read Data

In [None]:
life_exp = pd.read_csv('Life Expectancy Data.csv')

Explore Data

In [None]:
life_exp.shape

In [None]:
life_exp.head() #shows first 5 elements

In [None]:
life_exp.info() #gives information about the fields

In [None]:
life_exp.describe()

In [None]:
sns.set(rc = {'figure.figsize':(15,10)})
# Correlate the numeric columns only. On pandas >= 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises if any are present
# (NOTE(review): "Country" / "Status" are presumably text columns -- confirm).
numeric_corr = life_exp.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm") #visualizing the correlation between the variables (how do they influence each other)

Handling data

In [None]:
# Rename columns for convenience: remove stray spaces and use underscores.
# NOTE(review): rename() skips keys that are not present in the frame, so a
# mistyped key fails silently -- confirm the keys match the CSV header exactly.
life_exp.rename(
  columns={
    " BMI ": "BMI",
    "Life expectancy ": "Life_expectancy",
    "Adult Mortality": "Adult_mortality",
    "infant deaths": "Infant_deaths",
    "percentage expenditure": "Percentage_expenditure",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    "under-five deaths ": "Under_five_deaths",
    "Total expenditure": "Total_expenditure",
    "Diphtheria ": "Diphtheria",
    " thinness 1–19 years": "Thinness_1–19_years",
    " thinness 5–9 years": "Thinness_5–9_years",
    " HIV/AIDS": "HIV/AIDS",
    "Income composition of resources": "Income_composition_of_resources",
  },
  inplace=True,
)

In [None]:
# Target is the life-expectancy column; the features are every remaining
# column except "Status" and "Country".
Y = life_exp["Life_expectancy"]
X = life_exp.drop(columns=["Status", "Country", "Life_expectancy"])

In [None]:
X.head()

In [None]:
Y.head()

Handling missing values

In [None]:
X.isnull().sum() #alias for isna -- checks the number of empty, nan, nat, ... elements

In [None]:
#There are many ways to fill missing values: https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/
def fillmissing(df, feature, method):
  """Fill the missing values of one column of ``df`` in place.

  df      -- DataFrame to modify (the column is reassigned on this object).
  feature -- name of the column to impute.
  method  -- "mode" or "median"; any other value falls back to the mean.

  Returns None.
  """
  column = df[feature]
  if method == "mode":
    replacement = column.mode()[0]  # first of the most frequent values
  elif method == "median":
    replacement = column.median()
  else:
    replacement = column.mean()
  df[feature] = column.fillna(replacement)

# Impute every feature column that still contains missing values with its mean.
features_missing = X.columns[X.isna().any()]
for feature in features_missing:
  fillmissing(X, feature= feature, method= "mean")

# Impute the target with its median. The result is reassigned instead of being
# filled in place: Y was selected as a column of life_exp, and whether an
# in-place fill on such a selection writes through to life_exp depends on the
# pandas version / copy-on-write mode.
Y = Y.fillna(Y.median())

In [None]:
Y = np.array(Y)

In [None]:
X.isnull().sum() #no more missing values

# **LINEAR REGRESSION**

# **2. Linear Regression with Tim**

In [None]:
tim_data = X['BMI']
tim_data.head()

Visualizing Life expectancy vs BMI plot

In [None]:
# Scatter of life expectancy against the single BMI feature.
fig, ax = plt.subplots()
ax.scatter(x=tim_data, y=Y)
ax.set_xlabel('BMI')
ax.set_ylabel('Life expectancy')

Preprocessing data

In [None]:
#tim_data.values.shape
tim_data.values.reshape(-1,1).shape

In [None]:
tim_data.values

In [None]:
np.ones(len(tim_data)).reshape(-1,1).shape

In [None]:
np.ones(len(tim_data)).reshape(-1,1)

In [None]:
def preprocess(X):
    """Build a two-column design matrix from a single feature.

    ``X`` must expose ``.values`` (the notebook passes a pandas Series). Its
    values are reshaped into one column and a column of ones (the bias term)
    is placed in front of it.

    Returns a 2-D NumPy array whose first column is all ones.
    """
    feature_column = X.values.reshape(-1, 1)
    bias_column = np.ones(len(feature_column)).reshape(-1, 1)
    return np.concatenate((bias_column, feature_column), axis=1)

tim_data_ready = preprocess(tim_data)
tim_data_ready.shape

In [None]:
tim_data_ready

In [None]:
def normal_equation(X,Y):
    """Least-squares coefficients from the normal equations.

    Solves ``(X^T X) theta = X^T Y`` for ``theta``.

    X -- NumPy array used as the design matrix (one row per sample; include a
         column of ones if an intercept is wanted).
    Y -- NumPy array of targets, with the same dtype as ``X``.

    Returns ``theta`` as a torch tensor (one entry per column of ``X``).
    Raises a RuntimeError (from torch) if ``X^T X`` is singular.
    """
    X = torch.from_numpy(X)
    Y = torch.from_numpy(Y)
    gram = torch.matmul(X.t(), X)      # X^T X
    moment = torch.matmul(X.t(), Y)    # X^T Y
    # Solve the linear system directly rather than forming the explicit
    # inverse: it is faster and numerically more stable, which matters when
    # feature scales differ widely.
    return torch.linalg.solve(gram, moment)

# Closed-form fit on the BMI design matrix. The ones column comes first, so
# theta[0] is the intercept (b) and theta[1] the slope (m).
tim_theta = normal_equation(tim_data_ready, Y)
print('y = mx + b')
print('y = {}x + {}'.format(tim_theta[1], tim_theta[0]))

In [None]:
# Fitted line (red) drawn over the raw BMI / life-expectancy scatter.
plt.figure(figsize=(10,6))
plt.scatter(tim_data, Y, marker='o')
plt.plot(tim_data, np.dot(tim_data_ready, tim_theta), color='r')
plt.title('Life expectancy regression using $\\theta_0$ = {} , $\\theta_1$ = {}'.format(tim_theta[0], tim_theta[1]))
plt.show()

In [None]:
def life_expectancy(X, theta):
    """Predict life expectancy for one sample with a linear model.

    X     -- 1-D sequence of feature values, without the bias term.
    theta -- coefficients, intercept first, then one per feature.

    Returns the prediction rounded to one decimal place.
    """
    sample = np.concatenate([np.ones(1), np.asarray(X)])  # prepend the bias term
    prediction = np.dot(sample, theta)
    return round(prediction, 1)

height = 1.82 #float(input("Please input your height (in meters): ")) #1.82
weight = 80 #float(input("Please input your weight (in kilograms): ")) #80
bmi = weight / height**2
print("BMI:",bmi)
life_exp_tim = life_expectancy([bmi], tim_theta)
print("Tim's life expectancy is", life_exp_tim , "years.")

In [None]:
# Tom's measurements, scored with Tim's BMI-only model (tim_theta).
# Swap in the commented input() calls to enter the values interactively.
height = 1.80 #float(input("Please input your height (in meters): "))
weight = 73 #float(input("Please input your weight (in kilograms): "))
bmi = weight / height**2
print("BMI:",bmi)
life_exp_tim_tom = life_expectancy([bmi], tim_theta)
print("Tom's life expectancy predicted by Tim's model is", life_exp_tim_tom , "years.")

# **3. Linear Regression with Tom**

In [None]:
tom_data = X['GDP']
tom_data.head()

Visualizing Life expectancy vs GDP plot

In [None]:
# Scatter of life expectancy against the single GDP feature.
fig, ax = plt.subplots()
ax.scatter(x=tom_data, y=Y)
ax.set_xlabel('GDP')
ax.set_ylabel('Life expectancy')

Preprocessing data

In [None]:
tom_data_ready = preprocess(tom_data)
tom_data_ready.shape

In [None]:
# Closed-form fit on the GDP design matrix; theta[0] multiplies the leading
# ones column (intercept b), theta[1] is the slope m.
tom_theta = normal_equation(tom_data_ready, Y)
print('y = mx + b')
print('y = {}x + {}'.format(tom_theta[1], tom_theta[0]))

In [None]:
# Fitted line (red) drawn over the raw GDP / life-expectancy scatter.
plt.figure(figsize=(10,6))
plt.scatter(tom_data, Y, marker='o')
plt.plot(tom_data, np.dot(tom_data_ready, tom_theta), color='r')
plt.title('Life expectancy regression using $\\theta_0$ = {} , $\\theta_1$ = {}'.format(tom_theta[0], tom_theta[1]))
plt.show()

In [None]:
# Reuses life_expectancy(X, theta) from the "Linear Regression with Tim"
# section; it is defined once there so the helper cannot drift between copies.
gdp = 8500 #float(input("Please input the GDP of your country: ")) #8500
life_exp_tom = life_expectancy([gdp], tom_theta)
print("Tom's life expectancy is", life_exp_tom , "years.")

In [None]:
# Example GDP for Tim, scored with Tom's GDP-only model (tom_theta).
gdp = 5000 #float(input("Please input the GDP of your country: "))
life_exp_tom_tim = life_expectancy([gdp], tom_theta)
print("Tim's life expectancy predicted by Tom is", life_exp_tom_tim , "years.")

# **MULTIPLE REGRESSION**

# **4. Multiple Regression with Tim**

In [None]:
tim_data = X[['BMI', 'Adult_mortality', 'HIV/AIDS']]
tim_data.head()

In [None]:
def add_ones(X):
    """Prepend a column of ones (the bias term) to a 2-D feature table.

    ``X`` is any 2-D array-like with ``len(X)`` rows (the notebook passes a
    DataFrame slice). Returns a NumPy array with one extra leading column.
    """
    bias_column = np.ones((len(X), 1))
    return np.concatenate((bias_column, X), axis=1)

tim_data_ready = add_ones(tim_data)
tim_data_ready.shape

In [None]:
tim_theta = normal_equation(tim_data_ready, Y)
print(f'theta = {tim_theta}')

In [None]:
def RMSE(y_pred, y):
    """Root-mean-squared error between predictions and targets.

    Note the argument order: predictions ``y_pred`` first, ground truth ``y``
    second.
    """
    mean_sq_err = metrics.mean_squared_error(y_true=y, y_pred=y_pred)
    return np.sqrt(mean_sq_err)

tim_y_pred = np.dot(tim_data_ready, tim_theta)
tim_rmse = RMSE(tim_y_pred, Y)

print("Tim's RMSE: ", tim_rmse)

In [None]:
# life_expectancy(X, theta) from the single-variable section is reused here.
# It prepends the bias term itself, so the feature list only has to follow the
# column order used to fit tim_theta: BMI, Adult_mortality, HIV/AIDS.

height = 1.82 #float(input("Please input your height (in meters): "))
weight = 80 #float(input("Please input your weight (in kilograms): "))
bmi = weight / height**2
print("BMI:",bmi)
adult_mortality = 53
hiv = 0.1
life_exp_tim = life_expectancy([bmi,adult_mortality,hiv], tim_theta)
print("Tim's life expectancy is", life_exp_tim , "years.")

In [None]:
height = 1.80 #float(input("Please input your height (in meters): "))
weight = 73 #float(input("Please input your weight (in kilograms): "))
bmi = weight / height**2
print("BMI:",bmi)
adult_mortality = 70
hiv = 0.1
life_exp_tim_tom = life_expectancy([bmi,adult_mortality,hiv], tim_theta)
print("Tom's life expectancy predicted by Tim is", life_exp_tim_tom , "years.")

# **5. Multiple Regression with Tom**

In [None]:
tom_data = X[['GDP', 'Income_composition_of_resources', 'Schooling']]
tom_data.head()

In [None]:
tom_data_ready = add_ones(tom_data)
tom_data_ready.shape

In [None]:
tom_theta = normal_equation(tom_data_ready, Y)
print(f'theta = {tom_theta}')

In [None]:
# Reuses RMSE(y_pred, y) from the "Multiple Regression with Tim" section; it
# is defined once there so both models are scored by the same function.
tom_y_pred = np.dot(tom_data_ready, tom_theta)
tom_rmse = RMSE(tom_y_pred, Y)

print("Tom's RMSE: ", tom_rmse)

In [None]:
gdp = 8500
income = 0.8
school = 16.5
life_exp_tom = life_expectancy([gdp,income,school], tom_theta)
print("Tom's life expectancy is", life_exp_tom , "years.")

In [None]:
gdp = 5000
income = 0.77
school = 15.5
life_exp_tom_tim = life_expectancy([gdp,income,school], tom_theta)
print("Tim's life expectancy predicted by Tom is", life_exp_tom_tim , "years.")

# **Now it's your turn: try using all 6 variables, or choose variables of your own**

In [None]:
##CODE HERE