# Bonus: Temperature Analysis I

In [13]:
import pandas as pd
from datetime import datetime as dt
from scipy import stats

In [3]:
# "tobs" is "temperature observations"
# Load the raw station measurements; the path is relative to the notebook's working directory
measurements_csv = 'Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_csv)
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Parse the 'date' strings into datetime64 values (the later cells rely on the .dt accessor)
df = df.assign(date=pd.to_datetime(df['date']))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [5]:
# Promote 'date' from an ordinary column to the row index.
# set_index returns a new frame, so `df` keeps its 'date' column for the cells below.
df_indexed = df.set_index(keys='date')
df_indexed.head(n=5)

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [6]:
# Discard the date index altogether (drop=True means it is NOT restored as a column);
# the rows fall back to a default integer index.
df_indexed = df_indexed.reset_index(drop=True)
df_indexed.head(n=5)

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.0,63
2,USC00519397,0.0,74
3,USC00519397,0.0,76
4,USC00519397,,73


### Compare June and December data across all years 

In [7]:
from scipy import stats

In [8]:
# Re-inspect the working frame (it still carries 'date' as a column) before filtering by month
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [9]:
# Keep only the June (6) and December (12) observations, across every year in the data.
# Note: December data ends in 2016, so the tail of the printout below shows June 2017 rows instead.
df_Jun_Dec = df.loc[df['date'].dt.month.isin([6, 12])]
print(df_Jun_Dec)

           station       date  prcp  tobs
133    USC00519397 2010-06-01  0.00    78
134    USC00519397 2010-06-02  0.01    76
135    USC00519397 2010-06-03  0.00    78
136    USC00519397 2010-06-04  0.00    76
137    USC00519397 2010-06-05  0.00    77
...            ...        ...   ...   ...
19492  USC00516128 2017-06-26  0.02    79
19493  USC00516128 2017-06-27  0.10    74
19494  USC00516128 2017-06-28  0.02    74
19495  USC00516128 2017-06-29  0.04    76
19496  USC00516128 2017-06-30  0.20    75

[3217 rows x 4 columns]


In [20]:
# Average the June temperature observations.
# Selecting with ['tobs'] keeps a one-column frame, so .mean() yields a one-element Series.
june_rows = df_Jun_Dec['date'].dt.month.eq(6)
June_temp = df_Jun_Dec.loc[june_rows, ['tobs']].mean()

print(f"The mean temperature in Hawaii for the month of June is: {June_temp.values[0]:.2f}")

The mean temperature in Hawaii for the month of June is: 74.94


In [21]:
# Average the December temperature observations.
# Selecting with ['tobs'] keeps a one-column frame, so .mean() yields a one-element Series.
december_rows = df_Jun_Dec['date'].dt.month.eq(12)
Dec_temp = df_Jun_Dec.loc[december_rows, ['tobs']].mean()

print(f"The mean temperature in Hawaii for the month of December is: {Dec_temp.values[0]:.2f}")

The mean temperature in Hawaii for the month of December is: 71.04


In [12]:
# Split the temperature observations into one single-column ('tobs') frame per month
obs_month = df_Jun_Dec['date'].dt.month
June_collect = df_Jun_Dec.loc[obs_month == 6, ['tobs']]
Dec_collect = df_Jun_Dec.loc[obs_month == 12, ['tobs']]

In [16]:
# Independent (unpaired) two-sample t-test comparing June and December temperatures.
# equal_var=True is SciPy's default, spelled out here: the test pools the two sample variances.
stats.ttest_ind(June_collect, Dec_collect, equal_var=True)

Ttest_indResult(statistic=array([31.60372399]), pvalue=array([3.9025129e-191]))

### Analysis

The unpaired t-test returned a very large test statistic (t ≈ 31.60) and an extremely small p-value (p ≈ 3.9 × 10⁻¹⁹¹), which indicates a statistically significant difference between the mean temperature in June (74.94) and in December (71.04) in Hawaii.