In [68]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [69]:
# Import the Excel as a pandas dataframe
air = pd.read_excel("AirQualityUCI.xlsx")

In [70]:
# Report on the number of rows and columns in the dataframe
print (air.shape)

(9357, 15)


In [71]:
# Print the top five rows of the dataframe
print(air.head)

<bound method NDFrame.head of            Date      Time  CO(GT)  PT08.S1(CO)  NMHC(GT)   C6H6(GT)  \
0    2004-03-10  18:00:00     2.6      1360.00       150  11.881723   
1    2004-03-10  19:00:00     2.0      1292.25       112   9.397165   
2    2004-03-10  20:00:00     2.2      1402.00        88   8.997817   
3    2004-03-10  21:00:00     2.2      1375.50        80   9.228796   
4    2004-03-10  22:00:00     1.6      1272.25        51   6.518224   
...         ...       ...     ...          ...       ...        ...   
9352 2005-04-04  10:00:00     3.1      1314.25      -200  13.529605   
9353 2005-04-04  11:00:00     2.4      1162.50      -200  11.355157   
9354 2005-04-04  12:00:00     2.4      1142.00      -200  12.374538   
9355 2005-04-04  13:00:00     2.1      1002.50      -200   9.547187   
9356 2005-04-04  14:00:00     2.2      1070.75      -200  11.932060   

      PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  \
0           1045.50    166.0       1056.25    11

In [72]:
# Print the data types of the columns
print (air.dtypes)

Date             datetime64[ns]
Time                     object
CO(GT)                  float64
PT08.S1(CO)             float64
NMHC(GT)                  int64
C6H6(GT)                float64
PT08.S2(NMHC)           float64
NOx(GT)                 float64
PT08.S3(NOx)            float64
NO2(GT)                 float64
PT08.S4(NO2)            float64
PT08.S5(O3)             float64
T                       float64
RH                      float64
AH                      float64
dtype: object


In [73]:
# fill the NaN values in each column for the column mean
air.fillna(air.mean())

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.00,150,11.881723,1045.50,166.0,1056.25,113.0,1692.00,1267.50,13.600,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.300,47.700000,0.725487
2,2004-03-10,20:00:00,2.2,1402.00,88,8.997817,939.25,131.0,1140.00,114.0,1554.50,1074.00,11.900,53.975000,0.750239
3,2004-03-10,21:00:00,2.2,1375.50,80,9.228796,948.25,172.0,1092.00,122.0,1583.75,1203.25,11.000,60.000000,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.50,131.0,1205.00,116.0,1490.00,1110.00,11.150,59.575001,0.788794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,2005-04-04,10:00:00,3.1,1314.25,-200,13.529605,1101.25,471.7,538.50,189.8,1374.25,1728.50,21.850,29.250000,0.756824
9353,2005-04-04,11:00:00,2.4,1162.50,-200,11.355157,1027.00,353.3,603.75,179.2,1263.50,1269.00,24.325,23.725000,0.711864
9354,2005-04-04,12:00:00,2.4,1142.00,-200,12.374538,1062.50,293.0,603.25,174.7,1240.75,1092.00,26.900,18.350000,0.640649
9355,2005-04-04,13:00:00,2.1,1002.50,-200,9.547187,960.50,234.5,701.50,155.7,1041.00,769.75,28.325,13.550000,0.513866


In [74]:
#  Find the Correlation

corr = air.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
CO(GT),1.0,0.0414149,0.128351,-0.0313773,0.0299394,0.52645,-0.0899806,0.67114,-0.0737205,0.0803156,-0.0689517,-0.0482306,-0.0458922
PT08.S1(CO),0.0414149,1.0,0.170009,0.852659,0.933101,0.278029,0.0869312,0.154058,0.845133,0.892436,0.754806,0.745344,0.764866
NMHC(GT),0.128351,0.170009,1.0,0.0373289,0.110097,-0.00441289,0.0488322,0.103345,0.162689,0.101189,-7.57769e-06,0.00828792,0.0125001
C6H6(GT),-0.0313773,0.852659,0.0373289,1.0,0.767401,-0.00116298,0.512154,-0.0109714,0.774649,0.641306,0.97137,0.925068,0.984556
PT08.S2(NMHC),0.0299394,0.933101,0.110097,0.767401,1.0,0.331331,-0.0737483,0.176569,0.874761,0.909909,0.668984,0.585775,0.646535
NOx(GT),0.52645,0.278029,-0.00441289,-0.00116298,0.331331,1.0,-0.436083,0.817138,0.0355795,0.461916,-0.138457,-0.0530081,-0.0958406
PT08.S3(NOx),-0.0899806,0.0869312,0.0488322,0.512154,-0.0737483,-0.436083,1.0,-0.256217,0.122672,-0.208935,0.588061,0.573513,0.621576
NO2(GT),0.67114,0.154058,0.103345,-0.0109714,0.176569,0.817138,-0.256217,1.0,-0.0220925,0.253469,-0.0840845,-0.0812997,-0.0604231
PT08.S4(NO2),-0.0737205,0.845133,0.162689,0.774649,0.874761,0.0355795,0.122672,-0.0220925,1.0,0.72367,0.755053,0.640685,0.691889
PT08.S5(O3),0.0803156,0.892436,0.101189,0.641306,0.909909,0.461916,-0.208935,0.253469,0.72367,1.0,0.503659,0.524922,0.519432


C6H6(GT) and AH are one of the least correlated variables


In [75]:
linreg = linear_model.LinearRegression()

In [101]:
# Create the dataseries for Realtive Humidity
air_rh = air['RH']
air_C6H6 = air['C6H6(GT)']
air_PT08 = air['PT08.S1(CO)']

# for non correlated
air_C6H6 = air['NMHC(GT)']
air_PT08 = air['NOx(GT)']

# divide the values by 100 to make the numbers smaller
air_C6H6 = air_C6H6.apply(lambda x: x/100) 
air_PT08 = air_PT08.apply(lambda x: x/100)
air_rh = air_rh.apply(lambda x: x/100) 

# Create an array for the dependent variables, as we are running a multivaraint regression
x = np.array([air_C6H6, air_PT08]).T


# Create an arrray for independent Variable, relative humidity
y = np.array(air_rh).T

# Generate the Training Set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=76)

# Calculate the Regression
linreg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [107]:
print(linreg.coef_) 
print(linreg.intercept_) # Y Intercept


[ 1.31535818 -0.02531375]
0.6362571355450263


In [108]:
# Predict Y
yhat = linreg.predict(x_test)


In [109]:
# The Mean Squared Error
print(np.mean((yhat-y_test)**2))

# r-squared error of the Training Data
print(linreg.score(x_train, y_train))
# r-squared error of the Test data
print(linreg.score(x_test, y_test))



0.03570495192394486
0.8574461756715541
0.8801637089608847
