# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine, func


In [2]:
#   "tobs" is "temperature observations"
#   read the 'measurements' table and store as a dataframe
df = pd.read_csv('Resources/hawaii_measurements.csv')
df.head()


Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
#   Convert the date column format from string to datetime
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df


Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,,73
...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71
19546,USC00516128,2017-08-20,,78
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


In [4]:
#   Set the date column as the DataFrame index
df1 = df.set_index('date')
df1


Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [5]:
#   replace NaN values with zeroes
df1 = df1.fillna(0)
df1


Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,0.00,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,0.00,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [6]:
#   Drop the date column 
#df1.reset_index(drop=True, inplace=True)
#df1

#   there was no need to do this - it had already been dropped when the index was set to 'date'


### Compare June and December data across all years 

In [7]:
import scipy
from scipy import stats
from scipy.stats import ttest_ind


In [8]:
#   check the dataframe
df1


Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,0.00,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,0.00,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [9]:
#   identify the average temperature for all stations for all years
#   sort the dataframe into ascending sequence

#   there was no need to sort - it is already in ascending sequence   *** CHECK THIS OUT TIME PERMITTING   ****

#   set the start and end date of the dataset

start_date = '2010-01-01'
end_date = '2017-08-23'
df1[start_date:end_date]
df1

#print(f"Start of Date Range: {start_date} , End of Date Range: {end_date}")


Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,0.00,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,0.00,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [10]:
# Identify the average temperature for June
#   first get all 'tobs' for June for the entire period of observations
df1['month'] = pd.DatetimeIndex(df['date']).month
df1[df1['month'] == 6]


Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-01,USC00519397,0.00,78,6
2010-06-02,USC00519397,0.01,76,6
2010-06-03,USC00519397,0.00,78,6
2010-06-04,USC00519397,0.00,76,6
2010-06-05,USC00519397,0.00,77,6
...,...,...,...,...
2017-06-26,USC00516128,0.02,79,6
2017-06-27,USC00516128,0.10,74,6
2017-06-28,USC00516128,0.02,74,6
2017-06-29,USC00516128,0.04,76,6


In [11]:
#   then get the average of 'tobs' for June
jun_temps = df1[df1['month'] == 6]
avg_jun_temps = jun_temps['tobs'].mean()
avg_jun_temps


74.94411764705882

In [12]:
# Identify the average temperature for December
#   first get all 'tobs' for December for the entire period of observations
df1['month'] = pd.DatetimeIndex(df['date']).month
df1[df1['month'] == 12]


Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-01,USC00519397,0.04,76,12
2010-12-03,USC00519397,0.00,74,12
2010-12-04,USC00519397,0.00,74,12
2010-12-06,USC00519397,0.00,64,12
2010-12-07,USC00519397,0.00,64,12
...,...,...,...,...
2016-12-27,USC00516128,0.14,71,12
2016-12-28,USC00516128,0.14,71,12
2016-12-29,USC00516128,1.03,69,12
2016-12-30,USC00516128,2.37,65,12


In [13]:
#   then get the average of 'tobs' for December
#   then get the average of 'tobs' for June
dec_temps = df1[df1['month'] == 6]
avg_dec_temps = dec_temps['tobs'].mean()
avg_dec_temps



74.94411764705882

In [14]:
# Create collections of temperature data
#   created 'jun_temps' and 'dec_temps' as above

In [15]:
# Run unpaired t-test
scipy.stats.ttest_ind(jun_temps['tobs'], dec_temps['tobs'], equal_var=False)


Ttest_indResult(statistic=0.0, pvalue=1.0)

### Temperature Analysis - June versus December

The unpaired t-test is used to compare the means of two samples when each individual in one sample is independent of every individual in the other sample. 

As opposed to the paired t-test which is used to compare the means of two samples when each individual in one sample also appears in the other sample. 

    The results of the unpaired t-test indicate the following : 

    The t-value of 0.0 indicates that the two samples are statistically very similar.
    The p-value of 1.0 indicates that there is no difference statistically between the two samples.
