# Simple Regression Example

### This demonstration uses synthetic test data, generated from England census data.

In [None]:
import pandas as pd # for dataframe manipulation
import numpy as np # for vectorising and working with arrays

# sklearn models
from sklearn.linear_model import LinearRegression # Linear Model
from sklearn.dummy import DummyRegressor # Dummy models to test if a regression is worth it
# sklearn metrics
from sklearn.metrics import accuracy_score 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
# sklearn splitter for splitting data prior to testing
from sklearn.model_selection import train_test_split

# plotting and charting results
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the tab-separated, UTF-8 encoded demo file into a DataFrame
df = pd.read_csv(
    'demo.csv',
    sep='\t',
    encoding='utf-8',
)

In [None]:
# Summarise the frame: column names, dtypes and non-null counts per column
df.info()

In [None]:
# Preview the first 5 rows (5 is the default row count for head)
df.head()

In [None]:
# Check the dataset size; shape is a (rows, columns) tuple
df.shape

In [None]:
# Show the first few rows that contain at least one missing value
has_missing = df.isna().any(axis="columns")
df.loc[has_missing].head()

In [None]:
 # delete column 'Unnamed: 0' as looks like an index with no bearing on task.
# Drop via drop(..., errors='ignore') rather than `del` so the cell is idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

## Linear regression to predict the number of A&E attendances unpaid carers might have in a 12 month period.

### Exploratory Data Analysis

In [None]:
# Set up plot array (2*2): one box plot per numerical column
fig, axes = plt.subplots(2, 2, figsize=(10,8))
fig.suptitle('box plots for numerical column distributions')

# (column, panel title, colour) for each panel, listed in row-major grid order
# so they line up with axes.flat: [0, 0], [0, 1], [1, 0], [1, 1].
boxplot_panels = [
    ('age', 'Age', 'lavender'),
    ('unpaid_care_hours_week', 'Unpaid care hours per week', 'turquoise'),
    ('care_months', 'Months of unpaid caring', 'coral'),
    ('ae_atts_last_12m', 'A&E attendances in last 12 months', 'plum'),
]

# Plotting the charts: one loop instead of four copy-pasted stanzas
for ax, (column, title, colour) in zip(axes.flat, boxplot_panels):
    sns.boxplot(data=df[column], ax=ax, color=colour)
    ax.set_title(title)
    ax.set(ylabel=None)  # the panel title already names the variable

# Render explicitly so the cell shows the figure, not the repr of the last call
plt.show()

In [None]:
# Map raw dataset column names to plain English labels used on the charts
col_map = dict([
    ('age', 'Age'),
    ('care_months', 'Months Caring'),
    ('unpaid_care_hours_week', 'Weekly Hours on Care'),
    ('ae_atts_last_12m', 'Emergency attendances'),
])

In [None]:
# Build a streamlined copy of the numeric columns, relabelled for charting.
# rename() returns a new frame, so the original df is left untouched.
df_hm = (
    df[['age', 'care_months', 'unpaid_care_hours_week', 'ae_atts_last_12m']]
    .rename(columns=col_map)
)

In [None]:
# Scatter each feature against the target to eyeball how they relate
g = sns.pairplot(
    data=df_hm,
    x_vars=['Age', 'Months Caring','Weekly Hours on Care'],
    y_vars='Emergency attendances',
    height=4,
    aspect=1,
    kind='scatter')
# Use the grid's `figure` property; `fig` is the deprecated alias for it.
# y > 1 places the title above the panels so it does not overlap them.
g.figure.suptitle("Figure 1: Relationship of features to Emergency Attendances", y=1.08)

plt.show()

In [None]:
# Heatmap of pairwise correlations between the charting columns
corr_matrix = df_hm.corr()

ax = plt.axes()
sns.heatmap(corr_matrix, ax=ax, annot=True, cmap="YlGnBu")
ax.set(title='Figure 2: strength of relationships')

plt.show()

From the above heatmap it can be seen that Months Caring has the strongest correlation with Emergency attendances, closely followed by Weekly Hours on Care. Age has the weakest correlation, at just below 0.5, but it is not far behind the other two.

### Building a model

In [None]:
# Columns used as model inputs, and the column to be predicted
feature_cols = ['age', 'care_months', 'unpaid_care_hours_week']
target_col = 'ae_atts_last_12m'

# Feature Data (DataFrame) and Target Data (Series)
X = df[feature_cols]
y = df[target_col]

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=324,
)

In [None]:
# Define the Dummy Model
# Baseline for comparison: strategy="mean" always predicts the mean of the
# training targets, so a useful regression should score better than this.
dummy_model = DummyRegressor(strategy="mean")
dummy_model.fit(X_train,y_train)

In [None]:
# Create baseline (dummy) predictions for the held-out test features
y_pred_dummy = dummy_model.predict(X_test)

In [None]:
# Define the Linear Regression Model and fit it on the training split
regressor = LinearRegression()
regressor.fit(X_train,y_train)

In [None]:
# Inspect the fitted line: Y intercept first, then one coefficient per feature
# in the order 'age', 'care_months', 'unpaid_care_hours_week'
## note this is not possible with DummyRegressor
print(regressor.intercept_, regressor.coef_, sep='\n')

In [None]:
# Create linear model predictions for the held-out test features
y_pred = regressor.predict(X_test)

In [None]:
# Score the dummy model to see baseline performance on the test set.
# The RMSE is normalised by the spread of the test targets.
y_range_dummy = np.max(y_test) - np.min(y_test)

r2_dummy = r2_score(y_test, y_pred_dummy)
rmse_dummy = root_mean_squared_error(y_test, y_pred_dummy)
nrmse_dummy = rmse_dummy / y_range_dummy

dummy_report = [
    "Dummy scores",
    f'R squared score: {r2_dummy}',
    f'Root Mean Squared Error: {rmse_dummy}',
    f'y_test range:{y_range_dummy}',
    f'Normalised root Mean Squared Error: {nrmse_dummy}',
]
print("\n".join(dummy_report))

In [None]:
# Score the linear model on the test set using the same metrics as the baseline.
# The RMSE is normalised by the spread of the test targets.
y_range = np.max(y_test) - np.min(y_test)

r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
nrmse = rmse / y_range

linear_report = [
    "Linear scores",
    f'R squared score: {r2}',
    f'Root Mean Squared Error: {rmse}',
    f'y_test range:{y_range}',
    f'Normalised root Mean Squared Error: {nrmse}',
]
print("\n".join(linear_report))

## Let's test the regressor

In [None]:
# Get a test row from the test set
# NOTE: position 100 assumes the test split holds at least 101 rows.
test = X_test.iloc[100]
expected = y_test.iloc[100]
# Look each value up by column label rather than by position, so every label in
# the message is paired with its own column (care_months previously printed the
# unpaid_care_hours_week value).
print(f'Using the values Age: {test["age"]}, care_months: {test["care_months"]}, unpaid_care_hours_week: {test["unpaid_care_hours_week"]} we expect the result {expected} emergency attendances')

In [None]:
# Turn the single row back into a one-row frame (features as columns) and predict
test_frame = test.to_frame().T
t_pred = regressor.predict(test_frame)
print(f"Predicted {t_pred} emergency attendances")