## AT1B - Experiment multivariate linear regression

In [1]:
# pandas provides the DataFrame API used below to read and modify the dataset
import pandas as pd
# Suppress every warning for the rest of the session so they do not clutter cell output.
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings raised by the
# libraries used later in this notebook — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training and testing files of the cancer mortality dataset.
# The training file is read first, then the testing file.
cancer_data_train, cancer_data_test = map(
    pd.read_csv,
    ("cancer_us_county-training.csv", "cancer_us_county-testing.csv"),
)

In [3]:
# Show (rows, columns) for the training split followed by the testing split
print(*(split.shape for split in (cancer_data_train, cancer_data_test)))

(2438, 35) (609, 35)


In [4]:
# The "Id" column is treated as having no use for modelling, so it is removed from both splits.
# NOTE: re-running this cell without reloading the data raises KeyError because "Id" is already gone.
cancer_data_train = cancer_data_train.drop("Id", axis="columns")
cancer_data_test = cancer_data_test.drop("Id", axis="columns")

Missing values in both the training and test sets are replaced with the mean or median computed from the training set only. Filling the test set with a different value (for example, a statistic computed from the test set itself) could lead to biased results.

In [5]:
# PctSomeCol18_24 should be eliminated as seen in Part A
cancer_data_train = cancer_data_train.drop(columns=['PctSomeCol18_24'])
cancer_data_test = cancer_data_test.drop(columns=['PctSomeCol18_24'])

# Fill values are computed from the TRAINING split only and then reused for both splits.
# PctPrivateCoverageAlone has no skew and is normally distributed -> fill nulls with the mean
mean_val = cancer_data_train['PctPrivateCoverageAlone'].mean()
# PctEmployed16_Over has outliers and is left skewed -> fill nulls with the median
median_val = cancer_data_train['PctEmployed16_Over'].median()

# Apply the same two training-derived fill values to the train and test frames
for split_frame in (cancer_data_train, cancer_data_test):
    split_frame['PctPrivateCoverageAlone'] = split_frame['PctPrivateCoverageAlone'].fillna(mean_val)
    split_frame['PctEmployed16_Over'] = split_frame['PctEmployed16_Over'].fillna(median_val)

# Training multivariate linear regression

In [6]:
# numpy supplies the np.number dtype selector used when picking numeric feature columns
import numpy as np
# LinearRegression is the scikit-learn linear regression estimator used for this experiment
from sklearn.linear_model import LinearRegression 
# Create the (still unfitted) estimator with default settings; it is fitted in a later cell
reg = LinearRegression()

In [7]:
# Build the independent (X) and dependent (y) variables for both splits.
# Features are restricted to numeric columns, with the target column removed from them;
# the target itself is taken as a plain array of TARGET_deathRate values.
X_train = (
    cancer_data_train
    .select_dtypes(include=np.number)
    .drop(columns='TARGET_deathRate')
)
y_train = cancer_data_train['TARGET_deathRate'].values

X_test = (
    cancer_data_test
    .select_dtypes(include=np.number)
    .drop(columns='TARGET_deathRate')
)
y_test = cancer_data_test['TARGET_deathRate'].values

In [8]:
# importing mean squared error from scikit learn metrics
from sklearn.metrics import mean_squared_error as mse
# Fit the regression model on the training features and target
reg.fit(X_train, y_train)

# Calculating MSE for train set.
# The `squared` keyword is deliberately not passed: plain (non-root) MSE is the default
# result, and scikit-learn deprecated that keyword in 1.4 and removed it in 1.6, where
# passing it raises a TypeError. The printed value is the same as with squared=True.
y_train_pred = reg.predict(X_train)
print(mse(y_train, y_train_pred))

355.58533385247364


In [9]:
# importing altair library
import altair as alt

# Plot the relationship between actual and predicted target - train set.
# Green line: every actual value plotted against itself (the reference line).
actual_y_train = (
    alt.Chart(pd.DataFrame({'target': y_train, 'preds': y_train}))
    .mark_line(color='green')
    .encode(x='target', y='preds')
)

# Orange line: the model's training predictions plotted against the actual values.
pred_y_train = (
    alt.Chart(pd.DataFrame({'target': y_train, 'preds': y_train_pred}))
    .mark_line(color='orange')
    .encode(x='target', y='preds')
)

# Layer both charts; the last expression of the cell is what gets rendered
pred_y_train + actual_y_train

In [10]:
# Calculating MSE for test set, using the model fitted on the training data.
# The `squared` keyword is deliberately not passed: plain (non-root) MSE is the default
# result, and scikit-learn deprecated that keyword in 1.4 and removed it in 1.6, where
# passing it raises a TypeError. The printed value is the same as with squared=True.
y_test_pred = reg.predict(X_test)
print(mse(y_test, y_test_pred))

423.62053769994736


In [11]:
# Plot the relationship between actual and predicted target - test set.
# Green line: every actual value plotted against itself (the reference line).
actual_y_test = (
    alt.Chart(pd.DataFrame({'target': y_test, 'preds': y_test}))
    .mark_line(color='green')
    .encode(x='target', y='preds')
)

# Orange line: the model's test predictions plotted against the actual values.
pred_y_test = (
    alt.Chart(pd.DataFrame({'target': y_test, 'preds': y_test_pred}))
    .mark_line(color='orange')
    .encode(x='target', y='preds')
)

# Layer both charts; the last expression of the cell is what gets rendered
pred_y_test + actual_y_test