In [2]:
# import the packages
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.formula.api as smf
import numpy as np

In [3]:
# Read the Worcester temperature records and preview the first rows
weather = pd.read_csv(filepath_or_buffer='worcester.csv')
weather.head(n=5)

Unnamed: 0,Index,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN
0,0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27
1,1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28
2,2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35
3,3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50
4,4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41


In [4]:
# Summarise the frame: row count, column dtypes and non-null counts
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25188 entries, 0 to 25187
Data columns (total 9 columns):
Index      25188 non-null int64
STATION    25188 non-null object
NAME       25188 non-null object
Month      25188 non-null int64
Date       25188 non-null int64
Year       25188 non-null int64
Season     25188 non-null object
TMAX       25188 non-null int64
TMIN       25188 non-null int64
dtypes: int64(6), object(3)
memory usage: 1.7+ MB


In [3]:
# bucket the day of the month into one of four "weeks"
def Week(df):
    """Return the week-of-month bucket (1-4) for a single record.

    `df` is one record (for example a DataFrame row) that exposes the day of
    the month under the key 'Date'.  Days 1-7 map to 1, days 8-15 to 2,
    days 16-23 to 3, and every other value falls through to 4.
    """
    day = df['Date']
    # (first day, last day) of buckets 1, 2 and 3 -- both ends inclusive
    bounds = ((1, 7), (8, 15), (16, 23))
    for bucket, (first, last) in enumerate(bounds, start=1):
        if first <= day <= last:
            return bucket
    # days 24 and later, plus anything that matched none of the ranges above
    return 4

In [4]:
# Compute the week bucket row by row with Week, then attach it as a new column
week_labels = weather.apply(Week, axis=1)
weather['Week'] = week_labels
# Preview the first eight rows
weather.head(8)

Unnamed: 0,Index,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
5,5,USW00094746,"WORCESTER, MA US",1,6,1950,Spring,42,34,1
6,6,USW00094746,"WORCESTER, MA US",1,7,1950,Spring,34,21,1
7,7,USW00094746,"WORCESTER, MA US",1,8,1950,Spring,22,2,2


In [5]:
# Keep only the rows labelled 'Spring' and renumber them 0..n-1, so the row
# index can be used as a running position by the cells that follow
spring = weather.loc[weather['Season'] == 'Spring'].reset_index(drop=True)
# Display the frame without the 'Index' column; the result is not assigned,
# so `spring` itself still carries that column
spring.drop(columns=['Index'])

Unnamed: 0,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
...,...,...,...,...,...,...,...,...,...
6217,USW00094746,"WORCESTER, MA US",3,27,2018,Spring,45,25,4
6218,USW00094746,"WORCESTER, MA US",3,28,2018,Spring,53,34,4
6219,USW00094746,"WORCESTER, MA US",3,29,2018,Spring,51,39,4
6220,USW00094746,"WORCESTER, MA US",3,30,2018,Spring,56,39,4


In [15]:
# check the correlation: each temperature series against the row index,
# then the two temperature series against each other
a, b = stats.pearsonr(spring.index, spring['TMIN'])
c, d = stats.pearsonr(spring.index, spring['TMAX'])
e, f = stats.pearsonr(spring['TMAX'], spring['TMIN'])
# one report line per (coefficient, p-value) pair, in the order computed above
for coefficient, p_value, pair in (
    (a, b, 'between TMIN and the index'),
    (c, d, 'between TMAX and the index'),
    (e, f, 'between TMAX and TMIN'),
):
    print('The correlation coefficient is', coefficient, 'The p-value is', p_value, pair)

The correlation coefficient is 0.07289946444985573 The p-value is 8.572862892623352e-09 between TMIN and the index
The correlation coefficient is 0.05464088950847254 The p-value is 1.6148446644473456e-05 between TMAX and the index
The correlation coefficient is 0.8371084646909295 The p-value is 0.0 between TMAX and TMIN


In [7]:
# Ordinary least squares of TMIN on the row position of the spring frame.
# NOTE(review): `spring.index` is not a column of `spring`; the formula parser
# appears to resolve it from the notebook namespace -- confirm against patsy docs
spring_ols = smf.ols(formula='TMIN ~ spring.index', data=spring)
model_fit = spring_ols.fit()
# Render the regression summary table
model_fit.summary()

0,1,2,3
Dep. Variable:,TMIN,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,33.23
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,8.57e-09
Time:,14:12:30,Log-Likelihood:,-23730.0
No. Observations:,6222,AIC:,47460.0
Df Residuals:,6220,BIC:,47480.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,18.7508,0.278,67.429,0.000,18.206,19.296
spring.index,0.0004,7.74e-05,5.765,0.000,0.000,0.001

0,1,2,3
Omnibus:,78.395,Durbin-Watson:,0.542
Prob(Omnibus):,0.0,Jarque-Bera (JB):,78.094
Skew:,-0.255,Prob(JB):,1.1e-17
Kurtosis:,2.799,Cond. No.,7180.0


In [8]:
# Keep only the January rows (Month == 1) and renumber them 0..n-1, so the
# row index can be used as a running position by the cells that follow
jan = weather.loc[weather['Month'] == 1].reset_index(drop=True)
# Display the frame without the 'Index' column; the result is not assigned,
# so `jan` itself still carries that column
jan.drop(columns=['Index'])

Unnamed: 0,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
...,...,...,...,...,...,...,...,...,...
2132,USW00094746,"WORCESTER, MA US",1,27,2018,Spring,49,23,4
2133,USW00094746,"WORCESTER, MA US",1,28,2018,Spring,49,36,4
2134,USW00094746,"WORCESTER, MA US",1,29,2018,Spring,38,23,4
2135,USW00094746,"WORCESTER, MA US",1,30,2018,Spring,30,17,4


In [9]:
# check the correlation: each temperature series against the row index,
# then the two temperature series against each other
a, b = stats.pearsonr(jan.index, jan['TMIN'])
c, d = stats.pearsonr(jan.index, jan['TMAX'])
e, f = stats.pearsonr(jan['TMAX'], jan['TMIN'])
# one report line per (coefficient, p-value) pair, in the order computed above
for coefficient, p_value, pair in (
    (a, b, 'for TMIN'),
    (c, d, 'for TMAX'),
    (e, f, 'for TMAX and TMIN'),
):
    print('The correlation coefficient is', coefficient, 'The p-value is', p_value, pair)

The correlation coefficient is 0.056439792560570634 The p-value is 0.009063878669843053 for TMIN
The correlation coefficient is 0.010833459082919267 The p-value is 0.6167036706394822 for TMAX
The correlation coefficient is 0.8386162486637306 The p-value is 0.0 for TMAX and TMIN


In [10]:
# Ordinary least squares of TMIN on the row position of the January frame.
# NOTE(review): `jan.index` is not a column of `jan`; the formula parser
# appears to resolve it from the notebook namespace -- confirm against patsy docs
jan_ols = smf.ols(formula='TMIN ~ jan.index', data=jan)
model_fit = jan_ols.fit()
# Render the regression summary table
model_fit.summary()

0,1,2,3
Dep. Variable:,TMIN,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,6.823
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,0.00906
Time:,14:12:30,Log-Likelihood:,-8143.0
No. Observations:,2137,AIC:,16290.0
Df Residuals:,2135,BIC:,16300.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,15.4043,0.473,32.571,0.000,14.477,16.332
jan.index,0.0010,0.000,2.612,0.009,0.000,0.002

0,1,2,3
Omnibus:,19.921,Durbin-Watson:,0.648
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15.128
Skew:,-0.109,Prob(JB):,0.000519
Kurtosis:,2.65,Cond. No.,2470.0


In [11]:
# Keep only the January rows whose Week bucket is 1 and renumber them 0..n-1
week1 = jan.loc[jan['Week'] == 1].reset_index(drop=True)
# Display the frame without the 'Index' column; the result is not assigned,
# so `week1` itself still carries that column
week1.drop(columns=['Index'])

Unnamed: 0,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
...,...,...,...,...,...,...,...,...,...
478,USW00094746,"WORCESTER, MA US",1,3,2018,Spring,23,8,1
479,USW00094746,"WORCESTER, MA US",1,4,2018,Spring,26,15,1
480,USW00094746,"WORCESTER, MA US",1,5,2018,Spring,17,1,1
481,USW00094746,"WORCESTER, MA US",1,6,2018,Spring,6,-4,1


In [12]:
# check the correlation: each temperature series against the row index,
# then the two temperature series against each other
a, b = stats.pearsonr(week1.index, week1['TMIN'])
c, d = stats.pearsonr(week1.index, week1['TMAX'])
e, f = stats.pearsonr(week1['TMAX'], week1['TMIN'])
# one report line per (coefficient, p-value) pair, in the order computed above
for coefficient, p_value, pair in (
    (a, b, 'for TMIN'),
    (c, d, 'for TMAX'),
    (e, f, 'for TMAX and TMIN'),
):
    print('The correlation coefficient is', coefficient, 'The p-value is', p_value, pair)

The correlation coefficient is -0.0033870650765646347 The p-value is 0.9408148428220696 for TMIN
The correlation coefficient is 0.00537460810498756 The p-value is 0.9062150682571997 for TMAX
The correlation coefficient is 0.8578061278108411 The p-value is 3.884868850316602e-141 for TMAX and TMIN
