In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import sklearn.metrics as metric
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.preprocessing import PolynomialFeatures
import sklearn.preprocessing as pre
from sklearn.ensemble import HistGradientBoostingRegressor

import warnings
warnings.filterwarnings('ignore')

import wrangle1 as w
import model as m

from importlib import reload

## Prepping Data with all features

In [2]:
train_df = pd.read_csv('train_data.csv').drop('index', axis='columns')
test_df = pd.read_csv('test_data.csv')
train_df.shape, test_df.shape

((375734, 245), (31354, 245))

In [21]:
test_df

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,375734,0.0,0.833333,11/1/22,339.88,30.88,30.92,29.17,31.02,29.47,...,-19.28,-39.77,-29.25,40.88,-8.31,14.91,-24.62,31.05,-23.69,6.27
1,375735,0.0,0.833333,11/2/22,334.63,30.88,30.92,29.17,31.02,29.47,...,-19.58,-43.14,-28.62,45.37,-5.42,16.97,-23.94,28.84,-20.61,14.16
2,375736,0.0,0.833333,11/3/22,337.83,30.88,30.92,29.17,31.02,29.47,...,-13.73,-44.22,-27.67,49.76,-1.31,21.44,-19.06,26.85,-16.78,13.42
3,375737,0.0,0.833333,11/4/22,345.81,30.88,30.92,29.17,31.02,29.47,...,-7.97,-49.47,-19.32,52.62,-0.44,21.65,-23.12,23.70,-18.62,10.69
4,375738,0.0,0.833333,11/5/22,357.39,30.88,30.92,29.17,31.02,29.47,...,-0.80,-56.07,-9.89,51.23,-7.57,19.86,-30.56,20.66,-25.08,19.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31349,407083,1.0,0.866667,12/27/22,62.72,4.60,8.71,6.05,10.08,6.39,...,74.96,-8.49,32.39,38.82,7.42,11.75,-23.62,-0.24,-5.94,51.23
31350,407084,1.0,0.866667,12/28/22,73.41,4.60,8.71,6.05,10.08,6.39,...,88.57,0.83,26.23,37.64,13.01,17.84,-22.05,-3.03,1.31,51.45
31351,407085,1.0,0.866667,12/29/22,70.00,4.60,8.71,6.05,10.08,6.39,...,99.43,10.90,21.06,36.53,14.15,23.12,-25.60,-5.88,9.32,45.32
31352,407086,1.0,0.866667,12/30/22,79.81,4.60,8.71,6.05,10.08,6.39,...,109.39,21.37,20.42,36.05,6.38,29.00,-27.06,-1.42,16.06,31.88


In [3]:
test_df.shape[0] / train_df.shape[0]

0.08344733242134063

In [5]:
train_df = w.rename_data(train_df)

In [5]:
test_df = w.rename_data(test_df)

In [6]:
train, validate, test = w.split_data(train_df)

In [7]:
train.shape, validate.shape, test.shape

((244226, 245), (75147, 245), (56361, 245))

In [3]:
def filter_na_cols(df):
    count_na_df = df.isna().sum() 
    if count_na_df[count_na_df > 0].tolist():
        return count_na_df[count_na_df > 0]
    else:
        return 'Clean dataset'

In [None]:
count_na_df = df.isna().sum() 
count_na_df[count_na_df > 0].tolist()

In [7]:
nulls = filter_na_cols(train_df)

In [8]:
null_cols = list(nulls.index)
null_cols

['nmme0-tmp2m-34w__ccsm30',
 'nmme-tmp2m-56w__ccsm3',
 'nmme-prate-34w__ccsm3',
 'nmme0-prate-56w__ccsm30',
 'nmme0-prate-34w__ccsm30',
 'nmme-prate-56w__ccsm3',
 'nmme-tmp2m-34w__ccsm3',
 'ccsm30']

According to the data description:

```nmme0-tmp2m-34w```: file containing most recent monthly NMME model forecasts for tmp2m (cancm30,cancm40, ccsm30, ccsm40, cfsv20, gfdlflora0, gfdlflorb0, gfdl0, nasa0, nmme0mean) and average forecast across those models (nmme0mean)"
=> This means:

```nmme0mean``` = (cancm30 + cancm40 + x +ccsm30 + ccsm40 + cfsv20 + gfdlflora0 + gfdlflorb0 + gfdl0 + nasa0) / 9

In [19]:
calc_1 = ['nmme0-tmp2m-34w__cancm30',
'nmme0-tmp2m-34w__cancm40',
'nmme0-tmp2m-34w__ccsm30',
'nmme0-tmp2m-34w__ccsm40',
'nmme0-tmp2m-34w__cfsv20',
'nmme0-tmp2m-34w__gfdlflora0',
'nmme0-tmp2m-34w__gfdlflorb0',
'nmme0-tmp2m-34w__gfdl0',
'nmme0-tmp2m-34w__nasa0']

In [20]:
df[calc_1].mean(axis=1)

0         30.454444
1         30.454444
2         30.454444
3         30.454444
4         30.454444
            ...    
375729    22.551111
375730    22.551111
375731    22.551111
375732    22.551111
375733    22.551111
Length: 375734, dtype: float64

In [24]:
df['calculated_nmme0-tmp2m-34w__nmme0mean'] = df[calc_1].mean(axis=1)

In [25]:
df['calculated_nmme0-tmp2m-34w__nmme0mean']

0         30.454444
1         30.454444
2         30.454444
3         30.454444
4         30.454444
            ...    
375729    22.551111
375730    22.551111
375731    22.551111
375732    22.551111
375733    22.551111
Name: calculated_nmme0-tmp2m-34w__nmme0mean, Length: 375734, dtype: float64

In [12]:
null_cols

['nmme0-tmp2m-34w__ccsm30',
 'nmme-tmp2m-56w__ccsm3',
 'nmme-prate-34w__ccsm3',
 'nmme0-prate-56w__ccsm30',
 'nmme0-prate-34w__ccsm30',
 'nmme-prate-56w__ccsm3',
 'nmme-tmp2m-34w__ccsm3',
 'ccsm30']

In [9]:
g_means =  ['nmme0-tmp2m-34w__nmme0mean', 
 'nmme-tmp2m-56w__nmmemean', 
 'nmme-prate-34w__nmmemean', 
 'nmme0-prate-56w__nmme0mean', 
 'nmme0-prate-34w__nmme0mean', 
 'nmme-prate-56w__nmmemean', 
 'nmme-tmp2m-34w__nmmemean']


g_1 = ['nmme0-tmp2m-34w__cancm30',
'nmme0-tmp2m-34w__cancm40',
'nmme0-tmp2m-34w__ccsm40',
'nmme0-tmp2m-34w__cfsv20',
'nmme0-tmp2m-34w__gfdlflora0',
'nmme0-tmp2m-34w__gfdlflorb0',
'nmme0-tmp2m-34w__gfdl0',
'nmme0-tmp2m-34w__nasa0']

g_2 = ['nmme-tmp2m-56w__cancm3',
'nmme-tmp2m-56w__cancm4',
'nmme-tmp2m-56w__ccsm4',
'nmme-tmp2m-56w__cfsv2',
'nmme-tmp2m-56w__gfdl',
'nmme-tmp2m-56w__gfdlflora',
'nmme-tmp2m-56w__gfdlflorb',
'nmme-tmp2m-56w__nasa']

g_3 = ['nmme-prate-34w__cancm3',
'nmme-prate-34w__cancm4',
'nmme-prate-34w__ccsm4',
'nmme-prate-34w__cfsv2',
'nmme-prate-34w__gfdl',
'nmme-prate-34w__gfdlflora',
'nmme-prate-34w__gfdlflorb',
'nmme-prate-34w__nasa']

g_4 = [ 'nmme0-prate-56w__cancm30',
'nmme0-prate-56w__cancm40',
'nmme0-prate-56w__ccsm40',
'nmme0-prate-56w__cfsv20',
'nmme0-prate-56w__gfdlflora0',
'nmme0-prate-56w__gfdlflorb0',
'nmme0-prate-56w__gfdl0',
'nmme0-prate-56w__nasa0']

g_5 = ['nmme0-prate-34w__cancm30',
'nmme0-prate-34w__cancm40',
'nmme0-prate-34w__ccsm40',
'nmme0-prate-34w__cfsv20',
'nmme0-prate-34w__gfdlflora0',
'nmme0-prate-34w__gfdlflorb0',
'nmme0-prate-34w__gfdl0',
'nmme0-prate-34w__nasa0']

g_6 = ['nmme-prate-56w__cancm3',
'nmme-prate-56w__cancm4',
'nmme-prate-56w__ccsm4',
'nmme-prate-56w__cfsv2',
'nmme-prate-56w__gfdl',
'nmme-prate-56w__gfdlflora',
'nmme-prate-56w__gfdlflorb',
'nmme-prate-56w__nasa']

g_7 = ['nmme-tmp2m-34w__cancm3',
'nmme-tmp2m-34w__cancm4',
'nmme-tmp2m-34w__ccsm4',
'nmme-tmp2m-34w__cfsv2',
'nmme-tmp2m-34w__gfdl',
'nmme-tmp2m-34w__gfdlflora',
'nmme-tmp2m-34w__gfdlflorb',
'nmme-tmp2m-34w__nasa']

In [30]:
df[df['nmme0-tmp2m-34w__ccsm30'].isna()]['nmme0-tmp2m-34w__ccsm30']

356      NaN
357      NaN
358      NaN
359      NaN
360      NaN
          ..
375385   NaN
375386   NaN
375387   NaN
375388   NaN
375389   NaN
Name: nmme0-tmp2m-34w__ccsm30, Length: 15934, dtype: float64

In [28]:
df[g_1].sum(axis=1)

0         244.52
1         244.52
2         244.52
3         244.52
4         244.52
           ...  
375729    182.71
375730    182.71
375731    182.71
375732    182.71
375733    182.71
Length: 375734, dtype: float64

'nmme0-tmp2m-34w__ccsm30' = ('nmme0-tmp2m-34w__nmme0mean'*9) - 

In [38]:
(df['nmme0-tmp2m-34w__nmme0mean'] *9)

0         274.14
1         274.14
2         274.14
3         274.14
4         274.14
           ...  
375729    202.95
375730    202.95
375731    202.95
375732    202.95
375733    202.95
Name: nmme0-tmp2m-34w__nmme0mean, Length: 375734, dtype: float64

In [34]:
(df['nmme0-tmp2m-34w__nmme0mean'] *9) - (df[g_1].sum(axis=1))

0         29.62
1         29.62
2         29.62
3         29.62
4         29.62
          ...  
375729    20.24
375730    20.24
375731    20.24
375732    20.24
375733    20.24
Length: 375734, dtype: float64

In [33]:
df['nmme0-tmp2m-34w__ccsm30']

0         29.57
1         29.57
2         29.57
3         29.57
4         29.57
          ...  
375729    20.25
375730    20.25
375731    20.25
375732    20.25
375733    20.25
Name: nmme0-tmp2m-34w__ccsm30, Length: 375734, dtype: float64

In [35]:
df['calc_nmme0-tmp2m-34w__ccsm30'] = (df['nmme0-tmp2m-34w__nmme0mean'] *9) - (df[g_1].sum(axis=1))

In [39]:
df[df['nmme0-tmp2m-34w__ccsm30'].isna()][['calc_nmme0-tmp2m-34w__ccsm30','nmme0-tmp2m-34w__ccsm30']]

Unnamed: 0,calc_nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm30
356,30.91,
357,30.91,
358,30.91,
359,30.91,
360,30.91,
...,...,...
375385,22.72,
375386,22.72,
375387,22.72,
375388,22.72,


In [11]:
gs = [g_1, g_2, g_3, g_4, g_5, g_6, g_7]

zip_cols = zip(null_cols, gs, g_means)
for c, g, s in zip_cols:
    train_df[c] = (train_df[s]*9) - train_df[g].sum(1)

In [13]:
filter_na_cols(train_df)

ccsm30    15934
dtype: int64

In [14]:
train_df = train_df.drop(columns='ccsm30')

In [15]:
train_df.shape

(375734, 244)

In [16]:
train, validate, _ = w.split_data(train_df)

In [17]:
train.shape

(244226, 244)

In [20]:
filter_na_cols(train)

'Clean dataset'

In [18]:
drivers2 = ['nmme0-tmp2m-34w__cancm40',
 'nmme0-tmp2m-34w__gfdlflorb0',
 'nmme-tmp2m-56w__nmmemean',
 'nmme-prate-34w__nmmemean',
 'nmme0-prate-56w__nmme0mean',
 'nmme0-prate-34w__nmme0mean',
 'sea_level_press',
 'nmme-prate-56w__nmmemean',
 'nmme-tmp2m-34w__nmmemean',
 'nmme0mean',
 'wind-hgt-100-2010-1',
 'nmme0-tmp2m-34w__nmme0mean',
 'nmme-tmp2m-56w__cancm4',
 'nmme-tmp2m-56w__ccsm3',
 'nmme-tmp2m-56w__ccsm4',
 'nmme-tmp2m-56w__cfsv2',
 'nmme-tmp2m-56w__gfdlflora',
 'nmme-tmp2m-56w__gfdlflorb',
 'nmme0-prate-56w__cancm30',
 'nmme0-prate-56w__cancm40',
 'nmme0-prate-56w__ccsm30',
 'nmme0-prate-56w__ccsm40',
 'nmme0-prate-56w__cfsv20',
 'nmme0-prate-56w__gfdlflora0',
 'nmme0-prate-56w__gfdlflorb0',
 'nmme0-prate-56w__gfdl0',
 'nmme0-prate-56w__nasa0',
 'nmme0-prate-34w__cancm30',
 'nmme0-prate-34w__cancm40',
 'nmme0-prate-34w__ccsm30',
 'nmme0-prate-34w__ccsm40',
 'nmme0-prate-34w__cfsv20',
 'nmme0-prate-34w__gfdlflora0',
 'nmme0-prate-34w__gfdlflorb0',
 'nmme0-prate-34w__gfdl0',
 'nmme0-prate-34w__nasa0',
 'nmme-prate-56w__gfdlflorb',
 'nmme-tmp2m-34w__cancm3',
 'nmme-tmp2m-34w__ccsm3',
 'nmme-tmp2m-34w__ccsm4',
 'nmme-tmp2m-34w__cfsv2',
 'nmme-tmp2m-34w__gfdlflora',
 'nmme-tmp2m-34w__gfdlflorb',
 'nmme-tmp2m-34w__nasa',
 'cancm30',
 'cancm40',
 'cfsv20',
 'gfdlflora0',
 'gfdl0',
 'nasa0',
 'wind-hgt-500-2010-1',
 'region',
 'elevation',
 'lat',
 'lon',
 'potential_evap',
 'precip',
 'barometric_pressure',
 'all_atmos_precip',
 'relative_humidity',
 'sea_level_press',
 'height_10_mb',
 'height_100_mb',
 'height_500_mb',
 'height_850_mb',
 'zonal_wind_250mb',
 'zonal_wind_925mb',
 'long_wind_250mb',
 'long_wind_925mb']

In [58]:
import model as m

In [20]:
X_train, y_train, X_validate, y_validate, X_test = m.prep_for_model(train, validate, test_df, 'mean_temp', drivers2)





KeyError: "['index', 'contest-pevpr-sfc-gauss-14d__pevpr', 'contest-wind-h10-14d__wind-hgt-10', 'contest-rhum-sig995-14d__rhum', 'contest-wind-h100-14d__wind-hgt-100', 'contest-slp-14d__slp', 'contest-wind-vwnd-925-14d__wind-vwnd-925', 'contest-pres-sfc-gauss-14d__pres', 'contest-wind-uwnd-250-14d__wind-uwnd-250', 'contest-prwtr-eatm-14d__prwtr', 'contest-wind-vwnd-250-14d__wind-vwnd-250', 'contest-precip-14d__precip', 'contest-wind-h850-14d__wind-hgt-850', 'contest-wind-uwnd-925-14d__wind-uwnd-925', 'contest-wind-h500-14d__wind-hgt-500', 'ccsm30', 'elevation__elevation'] not in index"

In [None]:
est = HistGradientBoostingRegressor(max_depth=20, min_samples_leaf=3)
est.fit(X_train, y_train)

metric_df = pd.DataFrame(data=[
        {
            'model': 'Quadratic', 
            f'RMSE_train': metric.mean_squared_error(
                y_train,
                est.predict(X_train)) ** .5,
            f'RMSE_validate': metric.mean_squared_error(
                y_validate,
                est.predict(X_validate)) ** .5
        }])

metric_df

In [63]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=2)
# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train)
# transform X_validate_scaled & X_test_scaled
X_validate_degree2 = pf.transform(X_validate)
X_test_degree2 = pf.transform(X_test)
# create the model object
lm2 = LinearRegression(normalize=True)
lm2.fit(X_train_degree2, y_train)

metric_df = pd.DataFrame(data=[
        {
            'model': 'Quadratic', 
            f'RMSE_train': metric.mean_squared_error(
                y_train,
                lm2.predict(X_train_degree2)) ** .5,
            f'RMSE_validate': metric.mean_squared_error(
                y_validate,
                lm2.predict(X_validate_degree2)) ** .5
        }])

metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate
0,Quadratic,0.748009,0.761539


In [64]:
test_pred = lm2.predict(X_test_degree2)

test_pred

array([17.83670849, -5.86949116,  7.07576412, ..., -6.21368366,
       25.46963161, 25.76183742])

In [65]:
jan_30_predictions = pd.DataFrame({'contest-tmp2m-14d__tmp2m' : test_pred, 
                     'index': test['index']})

jan_30_predictions

KeyError: 'index'

In [None]:
test_df = pd.read_csv('test_data.csv')

Explaining more detail about how I became an meteoroli

Teaching the audience what the data is.

Why am I doing this?

Run on baseline mean(

Model performs % over baseline