In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import wrangle as w
import model as m

from importlib import reload

from scipy import stats

In [2]:
df = w.get_explore_data()

In [3]:
df = w.get_contest_data(df)

In [4]:
df.head()

Unnamed: 0,region,elevation,lat,lon,startdate,potential_evap,precip,barometric_pressure,all_atmos_precip,relative_humidity,...,height_500_mb,height_850_mb,zonal_wind_250mb,zonal_wind_925mb,long_wind_250mb,long_wind_925mb,elevation_range,region_bins,month,season
0,BSh,200.0,0.0,0.833333,2014-09-01,237.0,94.31,98644.97,42.45,81.72,...,5899.66,1535.52,-2.56,-5.22,-3.52,4.41,bottom_low,Dry,9,Autumn
1,BSh,200.0,0.0,0.833333,2014-09-02,228.9,100.85,98686.8,42.66,82.56,...,5901.03,1538.0,-2.39,-5.2,-4.49,3.74,bottom_low,Dry,9,Autumn
2,BSh,200.0,0.0,0.833333,2014-09-03,220.69,101.25,98712.85,43.23,83.29,...,5902.18,1540.32,-2.76,-5.0,-5.44,3.4,bottom_low,Dry,9,Autumn
3,BSh,200.0,0.0,0.833333,2014-09-04,225.28,101.9,98711.7,43.11,83.26,...,5903.07,1541.1,-3.0,-4.61,-5.76,3.29,bottom_low,Dry,9,Autumn
4,BSh,200.0,0.0,0.833333,2014-09-05,237.24,82.95,98686.46,42.98,82.5,...,5903.36,1539.73,-3.4,-4.25,-6.09,3.27,bottom_low,Dry,9,Autumn


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375734 entries, 0 to 375733
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   region               375734 non-null  object        
 1   elevation            375734 non-null  float64       
 2   lat                  375734 non-null  float64       
 3   lon                  375734 non-null  float64       
 4   startdate            375734 non-null  datetime64[ns]
 5   potential_evap       375734 non-null  float64       
 6   precip               375734 non-null  float64       
 7   barometric_pressure  375734 non-null  float64       
 8   all_atmos_precip     375734 non-null  float64       
 9   relative_humidity    375734 non-null  float64       
 10  sea_level_press      375734 non-null  float64       
 11  mean_temp            375734 non-null  float64       
 12  height_10_mb         375734 non-null  float64       
 13  height_100_mb 

# splitting into TVT so conclusions reached in exploration can be immediately modeled on

In [6]:
df.shape

(375734, 24)

In [7]:
train, validate, test = w.split_data(df)

In [8]:
train.shape, validate.shape, test.shape

((244226, 24), (75147, 24), (56361, 24))

# season bins already created, so making individual dfs based on season for further exploration

In [9]:
def autumn(df):
    df = df[(df.season == 'Autumn')]
    return df

def spring(df):
    df = df[(df.season == 'Spring')]
    return df

def summer(df):
    df = df[(df.season == 'Summer')]
    return df

def winter(df):
    df = df[(df.season == 'Winter')]
    return df



In [10]:
autumn = autumn(train)
spring = spring(train)
summer = summer(train)
winter = winter(train)

In [11]:
autumn.shape, spring.shape, summer.shape, winter.shape

((60642, 24), (61759, 24), (61481, 24), (60344, 24))

### running value counts to ensure they align with original train df
### Autumn: check, Spring: check, Summer: check, Winter: check

In [12]:
train.value_counts(train.season == 'Autumn')

season
False    183584
True      60642
dtype: int64

# Now that we have our data isolated by season, let's see other trends 

# 1) Autumn
### Hypotheses:
- strong correlation between precipitation and temp
- region will be less signiicant than it is for the og df
- elevation and elevation range will be more significant than for the ogdf

In [13]:
corrs = list(autumn.columns)
corrs.remove('mean_temp')
corrs.remove('startdate')

In [14]:
def make_metric(metric_df, col, p):
    if metric_df.size ==0:
        metric_df = pd.DataFrame(data=[
            {
                'Column_name': col, 
                'p-value': p
            }])
        return metric_df
    else:
        return metric_df.append(
            {
                'column_name': col, 
                'p-value': p
            }, ignore_index=True)

In [15]:
metric_df = pd.DataFrame()
for col in corrs:
    corr, p = stats.spearmanr(autumn[col], autumn['mean_temp'])
    metric_df = make_metric(metric_df, col, p)

metric_df

Unnamed: 0,Column_name,p-value,column_name
0,region,0.0,
1,,0.0,elevation
2,,0.0,lat
3,,7.751143e-234,lon
4,,0.0,potential_evap
5,,1.674732e-07,precip
6,,0.0,barometric_pressure
7,,0.0,all_atmos_precip
8,,0.0,relative_humidity
9,,0.0,sea_level_press


In [16]:

#Winter: 
w_corrs = list(winter.columns)
w_corrs.remove('mean_temp')
w_corrs.remove('startdate')

In [18]:
metric_df = pd.DataFrame()
for col in w_corrs:
    corr, p = stats.spearmanr(winter[col], winter['mean_temp'])
    metric_df = make_metric(metric_df, col, p)

metric_df

Unnamed: 0,Column_name,p-value,column_name
0,region,0.0,
1,,0.0,elevation
2,,0.0,lat
3,,7.338413e-42,lon
4,,0.0,potential_evap
5,,1.195705e-21,precip
6,,0.0,barometric_pressure
7,,0.0,all_atmos_precip
8,,0.0,relative_humidity
9,,0.0,sea_level_press


In [19]:
# Spring

sp_corrs = list(spring.columns)
sp_corrs.remove('mean_temp')
sp_corrs.remove('startdate')

In [20]:
metric_df = pd.DataFrame()
for col in sp_corrs:
    corr, p = stats.spearmanr(spring[col], spring['mean_temp'])
    metric_df = make_metric(metric_df, col, p)

metric_df

Unnamed: 0,Column_name,p-value,column_name
0,region,0.0,
1,,0.0,elevation
2,,0.0,lat
3,,0.0,lon
4,,0.0,potential_evap
5,,5.061901e-160,precip
6,,0.0,barometric_pressure
7,,0.0,all_atmos_precip
8,,0.0,relative_humidity
9,,0.0,sea_level_press


In [21]:
# Summer

sm_corrs = list(summer.columns)
sm_corrs.remove('mean_temp')
sm_corrs.remove('startdate')

In [22]:
metric_df = pd.DataFrame()
for col in sm_corrs:
    corr, p = stats.spearmanr(summer[col], summer['mean_temp'])
    metric_df = make_metric(metric_df, col, p)

metric_df

Unnamed: 0,Column_name,p-value,column_name
0,region,0.0,
1,,0.0,elevation
2,,0.0,lat
3,,0.0,lon
4,,0.0,potential_evap
5,,3.0709659999999997e-87,precip
6,,0.0,barometric_pressure
7,,0.0,all_atmos_precip
8,,0.5481968,relative_humidity
9,,0.0,sea_level_press
