### Hamoye Data Science Internship Stage B
These codes were used to derive the answers to the Hamoye Data Science Internship Quiz for the "Machine Learning: Regression - Predicting Energy Efficiency of Buildings" course

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv')
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
from sklearn.preprocessing import MinMaxScaler
new_df = df.drop(columns = ['date', 'lights'])

In [4]:
#normalise the dataset to a common scale using the min max scaler
scaler = MinMaxScaler()

In [5]:
normalised_df = pd.DataFrame(scaler.fit_transform(new_df), columns = new_df.columns)

Q12 - From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value in two D.P?\
What is the Mean Absolute Error (in two decimal places)?\
What is the Residual Sum of Squares (in two decimal places)?\
What is the Root Mean Squared Error (in three decimal places)?\
What is the Coefficient of Determination (in two decimal places)?\

In [6]:
#separate T2 and T6 into 2 columns: x and y
x_feature = normalised_df['T2']
y_feature = normalised_df['T6']

In [7]:
#split the data into test and train 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_feature, y_feature, test_size = 0.3, random_state = 42)

In [11]:
#reshape the feature variables to 2D array
x_test = x_test.values.reshape(-1,1)
x_train = x_train.values.reshape(-1,1)

In [12]:
#fit the data
from sklearn.linear_model  import LinearRegression
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

LinearRegression()

In [13]:
predicted_values = linear_model.predict(x_test)

In [14]:
predicted_values

array([0.23928945, 0.46794238, 0.23108472, ..., 0.3001772 , 0.4297256 ,
       0.3217686 ])

In [15]:
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, predicted_values)
round(r2_score,2)

0.64

In [16]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, predicted_values)
round(mae, 2)

0.08

In [17]:
rss = np.sum(np.square(y_test - predicted_values))
round(rss,2)

66.12

In [18]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse,2)

0.11

Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [19]:
features_df = normalised_df.drop(columns = ['Appliances'])

In [20]:
Appliances_target = normalised_df['Appliances']

In [21]:
from sklearn.model_selection import train_test_split
x_train_new, x_test_new, y_train_new, y_test_new = train_test_split(features_df, Appliances_target, test_size = 0.3, random_state = 42)

In [23]:
#train the whole dataset with Appliances as the target variable
from sklearn.linear_model  import LinearRegression
linear_model = LinearRegression()
linear_model.fit(x_train_new, y_train_new)

LinearRegression()

In [24]:
predicted_values_new = linear_model.predict(x_test_new)

In [25]:
predicted_values_new

array([0.03322207, 0.24411599, 0.03400024, ..., 0.06844707, 0.10032325,
       0.05722198])

In [26]:
#this function returns the weight of every feature
def get_weights_df(model, feat, col_name):
  weights = pd.Series(model.coef_, feat.columns).sort_values()
  weights_df = pd.DataFrame(weights).reset_index()
  weights_df.columns = ['Features', col_name]
  weights_df[col_name].round(3)
  return weights_df


In [27]:
linear_model_weights = get_weights_df(linear_model, x_train_new, 'Linear_Model_Weight')

In [28]:
linear_model_weights

Unnamed: 0,Features,Linear_Model_Weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [29]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(x_train_new, y_train_new)

Ridge(alpha=0.4)

In [30]:
predicted_values_ridge = ridge_reg.predict(x_test_new)
predicted_values_ridge

array([0.03321872, 0.24043824, 0.03461337, ..., 0.06872351, 0.10025536,
       0.05851175])

In [31]:
#rmse from ridge regression
from sklearn.metrics import mean_squared_error
rmse_ridge = np.sqrt(mean_squared_error(y_test_new, predicted_values_ridge))
rmse_ridge

0.08753385704628003

In [32]:
#rmse from linear model
from sklearn.metrics import mean_squared_error
rmse_linear = np.sqrt(mean_squared_error(y_test_new, predicted_values_new))
rmse_linear

0.08751444947661711

Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [33]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train_new, y_train_new)

Lasso(alpha=0.001)

In [34]:
predicted_values_lasso = lasso_reg.predict(x_test_new)

In [35]:
predicted_values_lasso

array([0.07370267, 0.08143458, 0.07716072, ..., 0.07792848, 0.09034412,
       0.08359255])

In [38]:
lasso_weights_df = get_weights_df(lasso_reg, x_train_new, 'Lasso_weight')
lasso_weights_df

Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.049557
1,RH_8,-0.00011
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


What is the new RMSE with the Lasso Regression (in 3 decimal places)?

In [39]:
from sklearn.metrics import mean_squared_error
rmse_lasso = np.sqrt(mean_squared_error(y_test_new, predicted_values_lasso))
round(rmse_lasso,3)

0.094