# Bonus: Temperature Analysis I

In [2]:
import pandas as pd
from datetime import datetime as dt

In [3]:
# "tobs" is "temperature observations"
# Load the raw measurements CSV into 'df' and preview its first rows
measurements_path = 'Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_path)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Parse the string-valued 'date' column into pandas datetime values.
# The result is kept as a separate Series; 'df' itself is not modified here.
date_strings = df['date']
converted_to_dt = pd.to_datetime(date_strings)
converted_to_dt.head()

0   2010-01-01
1   2010-01-02
2   2010-01-03
3   2010-01-04
4   2010-01-06
Name: date, dtype: datetime64[ns]

In [5]:
# Attach the parsed dates as a 'date_formated' column, then promote that
# column to the DataFrame index (set_index removes it from the columns)
df = df.assign(date_formated=converted_to_dt).set_index('date_formated')
df.head()

Unnamed: 0_level_0,station,date,prcp,tobs
date_formated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65
2010-01-02,USC00519397,2010-01-02,0.0,63
2010-01-03,USC00519397,2010-01-03,0.0,74
2010-01-04,USC00519397,2010-01-04,0.0,76
2010-01-06,USC00519397,2010-01-06,,73


In [6]:
# Remove the original string 'date' column; the parsed dates now live in the index
df = df.drop(columns=['date'])
df.head()

Unnamed: 0_level_0,station,prcp,tobs
date_formated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


### Compare June and December data across all years 

In [7]:
from scipy import stats

In [8]:
# Keep only the rows whose index date falls in June (6) or December (12)
is_jun_or_dec = df.index.month.isin([6, 12])
jun_dec_filtered_df = df.loc[is_jun_or_dec, :]
jun_dec_filtered_df.tail(90)

Unnamed: 0_level_0,station,prcp,tobs
date_formated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-12-26,USC00516128,0.81,69
2015-12-27,USC00516128,2.21,71
2015-12-29,USC00516128,,67
2015-12-30,USC00516128,0.03,71
2015-12-31,USC00516128,0.10,69
...,...,...,...
2017-06-26,USC00516128,0.02,79
2017-06-27,USC00516128,0.10,74
2017-06-28,USC00516128,0.02,74
2017-06-29,USC00516128,0.04,76


In [9]:
# Identify the average temperature for June
# Average each numeric column per calendar month (group keys are 6 and 12).
# numeric_only=True skips the text 'station' column; without it, pandas >= 2.0
# raises a TypeError when it tries to average strings.
jun_dec_avg = jun_dec_filtered_df.groupby(by=[jun_dec_filtered_df.index.month]).mean(numeric_only=True)

# Look the value up by column label instead of by position, so the result does
# not depend on column order or on deprecated positional Series indexing.
jun_avg = jun_dec_avg.loc[6, 'tobs']
print(f'Average June Temperature: {jun_avg}')

Average June Temperature: 74.94411764705882


In [10]:
# Identify the average temperature for December
# Select the 'tobs' mean by label rather than by position ([1]): positional
# integer lookup on a label-indexed Series is deprecated and silently depends
# on the order of the averaged columns.
dec_avg = jun_dec_avg.loc[12, 'tobs']
print(f'Average December Temperature: {dec_avg}')

Average December Temperature: 71.04152933421226


In [14]:
# Collect the 'tobs' readings of all June rows and all December rows as plain lists
month_of_row = df.index.month
jun_temp = df.loc[month_of_row == 6, 'tobs'].to_list()
dec_temp = df.loc[month_of_row == 12, 'tobs'].to_list()

In [20]:
# Run an unpaired (independent two-sample) t-test on the June vs. December readings.
# stats.ttest_ind treats the two lists as independent samples, and equal_var=False
# selects Welch's t-test, which does not assume equal population variances.
# NOTE: this is not a paired test; a paired test (stats.ttest_rel) would need the
# two lists matched element-for-element, which nothing in this notebook guarantees.
stats.ttest_ind(jun_temp, dec_temp, equal_var=False)

Ttest_indResult(statistic=31.355036920962423, pvalue=4.193529835915755e-187)

### Analysis

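We ran an unpaired (Welch's) t-test on the June and December temperature observations and obtained a p-value of **4.193529835915755e-187** (t-statistic ≈ 31.36). This tells us that the difference between the mean June temperature (≈ 74.94) and the mean December temperature (≈ 71.04) in Hawaii is statistically significant. 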

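Statistical significance is usually expressed as a p-value between 0 and 1: the probability of observing a difference at least this large if the two months actually had the same mean temperature. The lower the p-value, the stronger the evidence that the means really differ (typically a p-value < 0.05 is considered statistically significant). 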