In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

dataset = pd.read_csv('weatherHistory.csv')

X = dataset.iloc[:,1:4]
X1 = dataset.iloc[:,5:]
y = dataset['Apparent Temperature (C)'].values

In [2]:
for x in X1.columns:
    X[x] = X1[x]

X.drop(['Loud Cover','Precip Type','Daily Summary'], axis = 1, inplace = True)

In [3]:
print(X.columns)

Index(['Summary', 'Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)'],
      dtype='object')


In [4]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
# check for missing data in df and use imputer accordingly
if(X.isna().any().any()):
    for i in X.loc[:, X.isnull().any()].columns.values:
        print(X[i].dtype)
X = X.values

In [5]:
print(X[:,0])

['Partly Cloudy' 'Partly Cloudy' 'Mostly Cloudy' ... 'Partly Cloudy'
 'Partly Cloudy' 'Partly Cloudy']


In [6]:
# Encode Categorical data (Summary)\
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,0] = le.fit_transform(X[:,0])

# X['Summary'].value_counts()
# from sklearn.preprocessing import LabelBinarizer
# lb = LabelBinarizer()

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [30]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [31]:
y_pred = regressor.predict(X_test)

In [40]:
## R-Squared Score
from sklearn.metrics import r2_score
print(r2_score(y_test,y_pred))

0.9898164686464459


In [37]:
## Adjusted R-Squared Score
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

model = sm.OLS(y_test.astype('float64'),y_pred.astype('float64'))
res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                          3.821e+06
Date:                Wed, 12 May 2021   Prob (F-statistic):                        0.00
Time:                        21:06:03   Log-Likelihood:                         -28871.
No. Observations:               19291   AIC:                                  5.774e+04
Df Residuals:                   19290   BIC:                                  5.775e+04
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [41]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_pred)

0.8480055227750488

In [43]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

1.1683252971250142

In [51]:
## Compare actual and predicted values
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)), axis = 1, out=None))

[[19.16 18.94]
 [24.58 24.12]
 [19.97 20.09]
 ...
 [ 3.62  2.64]
 [11.78 13.89]
 [13.35 14.1 ]]
