In [41]:
# import the packages
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.formula.api as smf
import numpy as np

In [42]:
# read the Worcester temperature records from disk and preview the first five rows
weather = pd.read_csv(filepath_or_buffer='worcester.csv')
weather.head(n=5)

Unnamed: 0,Index,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN
0,0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27
1,1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28
2,2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35
3,3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50
4,4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41


In [43]:
# helper used to build the week-of-month column
def Week(df):
    """Return the week bucket (1-4) for one record's day of the month.

    Parameters
    ----------
    df : mapping-like
        A single record exposing a 'Date' entry. Despite the name, the
        value is compared against scalars, so this is one row rather than
        a whole table.

    Returns
    -------
    int
        1 for days 1-7, 2 for days 8-15, 3 for days 16-23, and 4 for
        every other value.
    """
    day = df['Date']
    if 1 <= day <= 7:
        return 1
    if 8 <= day <= 15:
        return 2
    if 16 <= day <= 23:
        return 3
    # anything outside the three ranges above falls into the last bucket
    return 4

In [44]:
# run Week once per row to label each record, then preview the first 8 rows
weather['Week'] = weather.apply(func=Week, axis='columns')
weather.head(n=8)

Unnamed: 0,Index,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
5,5,USW00094746,"WORCESTER, MA US",1,6,1950,Spring,42,34,1
6,6,USW00094746,"WORCESTER, MA US",1,7,1950,Spring,34,21,1
7,7,USW00094746,"WORCESTER, MA US",1,8,1950,Spring,22,2,2


In [45]:
# keep only the rows whose Season is 'Spring' and renumber them 0..n-1
spring = weather.loc[weather['Season'] == 'Spring'].reset_index(drop=True)
# display the subset without the 'Index' column; drop() returns a new frame
# and the result is not assigned, so `spring` itself still has that column
spring.drop(columns=['Index'])

Unnamed: 0,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
...,...,...,...,...,...,...,...,...,...
6218,USW00094746,"WORCESTER, MA US",3,28,2018,Spring,53,34,4
6219,USW00094746,"WORCESTER, MA US",3,29,2018,Spring,51,39,4
6220,USW00094746,"WORCESTER, MA US",3,30,2018,Spring,56,39,4
6221,USW00094746,"WORCESTER, MA US",3,31,2018,Spring,52,33,4


In [46]:
# check the correlation: row position vs. TMIN, row position vs. TMAX,
# and TMAX vs. TMIN (each call yields a coefficient and a p-value)
a, b = stats.pearsonr(x=spring.index, y=spring['TMIN'])
c, d = stats.pearsonr(x=spring.index, y=spring['TMAX'])
e, f = stats.pearsonr(x=spring['TMAX'], y=spring['TMIN'])
# report the three results in the same order they were computed
print('The correlation coefficient is', a, 'The p-value is', b, 'for TMIN')
print('The correlation coefficient is', c, 'The p-value is', d, 'for TMAX')
print('The correlation coefficient is', e, 'The p-value is', f, 'for TMAX and TMIN')

The correlation coefficient is 0.07310234267859185 The p-value is 7.772567991143916e-09 for TMIN
The correlation coefficient is 0.054984727779913305 The p-value is 1.4252374250815771e-05 for TMAX
The correlation coefficient is 0.8371189981804991 The p-value is 0.0 for TMAX and TMIN


In [47]:
# fit an ordinary least squares model with TMIN as the response and the
# expression `spring.index` (the row position) as the only predictor
model_fit = smf.ols(formula='TMIN ~ spring.index', data=spring).fit()
# render the regression summary as the cell output
model_fit.summary()

0,1,2,3
Dep. Variable:,TMIN,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,33.42
Date:,"Wed, 13 Nov 2019",Prob (F-statistic):,7.77e-09
Time:,13:41:10,Log-Likelihood:,-23733.0
No. Observations:,6223,AIC:,47470.0
Df Residuals:,6221,BIC:,47480.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,18.7484,0.278,67.428,0.000,18.203,19.293
spring.index,0.0004,7.74e-05,5.781,0.000,0.000,0.001

0,1,2,3
Omnibus:,78.53,Durbin-Watson:,0.542
Prob(Omnibus):,0.0,Jarque-Bera (JB):,78.243
Skew:,-0.256,Prob(JB):,1.02e-17
Kurtosis:,2.799,Cond. No.,7180.0


In [48]:
# keep only the rows whose Month is 1 (January) and renumber them 0..n-1
jan = weather.loc[weather['Month'] == 1].reset_index(drop=True)
# display the subset without the 'Index' column; drop() returns a new frame
# and the result is not assigned, so `jan` itself still has that column
jan.drop(columns=['Index'])

Unnamed: 0,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
...,...,...,...,...,...,...,...,...,...
2133,USW00094746,"WORCESTER, MA US",1,28,2018,Spring,49,36,4
2134,USW00094746,"WORCESTER, MA US",1,29,2018,Spring,38,23,4
2135,USW00094746,"WORCESTER, MA US",1,30,2018,Spring,30,17,4
2136,USW00094746,"WORCESTER, MA US",1,31,2018,Spring,27,12,4


In [54]:
# check the correlation: row position vs. TMIN, row position vs. TMAX,
# and TMAX vs. TMIN (each call yields a coefficient and a p-value)
a, b = stats.pearsonr(x=jan.index, y=jan['TMIN'])
c, d = stats.pearsonr(x=jan.index, y=jan['TMAX'])
e, f = stats.pearsonr(x=jan['TMAX'], y=jan['TMIN'])
# report the three results in the same order they were computed
print('The correlation coefficient is', a, 'The p-value is', b, 'for TMIN')
print('The correlation coefficient is', c, 'The p-value is', d, 'for TMAX')
print('The correlation coefficient is', e, 'The p-value is', f, 'for TMAX and TMIN')

The correlation coefficient is 0.057309356980670474 The p-value is 0.008036602046245222 for TMIN
The correlation coefficient is 0.012278917224222535 The p-value is 0.5704096977055302 for TMAX
The correlation coefficient is 0.8386864006773814 The p-value is 0.0 for TMAX and TMIN


In [56]:
# fit an ordinary least squares model with TMIN as the response and the
# expression `jan.index` (the row position) as the only predictor
model_fit = smf.ols(formula='TMIN ~ jan.index', data=jan).fit()
# render the regression summary as the cell output
model_fit.summary()

0,1,2,3
Dep. Variable:,TMIN,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,7.039
Date:,"Wed, 13 Nov 2019",Prob (F-statistic):,0.00804
Time:,13:50:03,Log-Likelihood:,-8146.8
No. Observations:,2138,AIC:,16300.0
Df Residuals:,2136,BIC:,16310.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,15.3936,0.473,32.555,0.000,14.466,16.321
jan.index,0.0010,0.000,2.653,0.008,0.000,0.002

0,1,2,3
Omnibus:,20.086,Durbin-Watson:,0.649
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15.254
Skew:,-0.11,Prob(JB):,0.000487
Kurtosis:,2.649,Cond. No.,2470.0


In [51]:
# keep only the January rows whose Week is 1 and renumber them 0..n-1
week1 = jan.loc[jan['Week'] == 1].reset_index(drop=True)
# display the subset without the 'Index' column; drop() returns a new frame
# and the result is not assigned, so `week1` itself still has that column
week1.drop(columns=['Index'])

Unnamed: 0,STATION,NAME,Month,Date,Year,Season,TMAX,TMIN,Week
0,USW00094746,"WORCESTER, MA US",1,1,1950,Spring,40,27,1
1,USW00094746,"WORCESTER, MA US",1,2,1950,Spring,37,28,1
2,USW00094746,"WORCESTER, MA US",1,3,1950,Spring,51,35,1
3,USW00094746,"WORCESTER, MA US",1,4,1950,Spring,60,50,1
4,USW00094746,"WORCESTER, MA US",1,5,1950,Spring,58,41,1
...,...,...,...,...,...,...,...,...,...
479,USW00094746,"WORCESTER, MA US",1,4,2018,Spring,26,15,1
480,USW00094746,"WORCESTER, MA US",1,5,2018,Spring,17,1,1
481,USW00094746,"WORCESTER, MA US",1,6,2018,Spring,6,-4,1
482,USW00094746,"WORCESTER, MA US",1,7,2018,Spring,13,-9,1


In [55]:
# check the correlation: row position vs. TMIN, row position vs. TMAX,
# and TMAX vs. TMIN (each call yields a coefficient and a p-value)
a, b = stats.pearsonr(x=week1.index, y=week1['TMIN'])
c, d = stats.pearsonr(x=week1.index, y=week1['TMAX'])
e, f = stats.pearsonr(x=week1['TMAX'], y=week1['TMIN'])
# report the three results in the same order they were computed
print('The correlation coefficient is', a, 'The p-value is', b, 'for TMIN')
print('The correlation coefficient is', c, 'The p-value is', d, 'for TMAX')
print('The correlation coefficient is', e, 'The p-value is', f, 'for TMAX and TMIN')

The correlation coefficient is 0.000430825345262488 The p-value is 0.992457190831633 for TMIN
The correlation coefficient is 0.011982852208178556 The p-value is 0.7925887048393911 for TMAX
The correlation coefficient is 0.8578250627684285 The p-value is 1.936359285324353e-141 for TMAX and TMIN
