In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import sklearn.metrics as metric
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
import sklearn.preprocessing as pre

import warnings
warnings.filterwarnings('ignore')

import wrangle1 as w
import model as m

from importlib import reload

## Prepping Data with all features

In [2]:
df = w.get_explore_data()

In [3]:
df = w.prep_data(df)

In [4]:
df = w.rename_data(df)

In [5]:
df = w.create_elevation_bins(df)

In [6]:
df = w.create_region_bins(df)

In [None]:
df[]

In [7]:
#dropped nulls because they couldn't go into the model
df = df.dropna()

In [8]:
train, validate, test = w.split_data(df)

In [39]:
train.shape

(229192, 248)

## Splitting data for modeling

In [18]:
#drivers used are ALL columns beside startdate and mean_temp
drivers = list(train.columns)
drivers.remove('startdate')
drivers.remove('mean_temp')

In [10]:
X_train, y_train, X_validate, y_validate, X_test, y_test = m.prep_for_model(train, validate, test, 'mean_temp', drivers)

In [11]:
lm = LinearRegression(normalize=True)
lm.fit(X_train, y_train)

LinearRegression(normalize=True)

In [12]:
feature_coefs = pd.DataFrame({'feature' : lm.feature_names_in_, 
                     'weight': lm.coef_})

In [38]:
feature_coefs.sort_values(by='weight')

Unnamed: 0,feature,weight
249,region_Csb,-1.595715e+11
247,region_Cfb,-1.595715e+11
246,region_Cfa,-1.595715e+11
248,region_Csa,-1.595715e+11
252,region_Dfc,-4.209229e+10
...,...,...
43,nmme0-prate-56w__gfdlflorb0,3.472114e+01
53,nmme0-prate-34w__gfdlflorb0,3.594195e+01
52,nmme0-prate-34w__gfdlflora0,3.673249e+01
13,nmme0-tmp2m-34w__nmme0mean,6.928643e+01


In [36]:
negative_coef = list(feature_coefs[feature_coefs.weight < -200].feature)

In [37]:
negative_coef

['nmme0-prate-56w__nmme0mean',
 'nmme0-prate-34w__nmme0mean',
 'region_Cfa',
 'region_Cfb',
 'region_Csa',
 'region_Csb',
 'region_Dfa',
 'region_Dfb',
 'region_Dfc',
 'region_Dsb',
 'region_Dsc',
 'region_Dwa',
 'region_Dwb',
 'region_bins_Dry']

In [25]:
positive_coef = list(feature_coefs[feature_coefs.weight > 50].feature)

In [27]:
drivers2 = negative_coef + positive_coef + w.contest_features

In [28]:
w.contest_features

['region',
 'elevation',
 'lat',
 'lon',
 'potential_evap',
 'precip',
 'barometric_pressure',
 'all_atmos_precip',
 'relative_humidity',
 'sea_level_press',
 'height_10_mb',
 'height_100_mb',
 'height_500_mb',
 'height_850_mb',
 'zonal_wind_250mb',
 'zonal_wind_925mb',
 'long_wind_250mb',
 'long_wind_925mb',
 'elevation_range',
 'region_bins']

In [29]:
drivers2

['nmme-tmp2m-56w__nmmemean',
 'nmme0-prate-56w__nmme0mean',
 'nmme0-prate-34w__nmme0mean',
 'nmme-tmp2m-34w__nmmemean',
 'nmme0mean',
 'region_Cfa',
 'region_Cfb',
 'region_Csa',
 'region_Csb',
 'region_Dfa',
 'region_Dfb',
 'region_Dfc',
 'region_Dsb',
 'region_Dsc',
 'region_Dwa',
 'region_Dwb',
 'region_bins_Dry',
 'nmme0-tmp2m-34w__nmme0mean',
 'region_bins_Temperate',
 'region',
 'elevation',
 'lat',
 'lon',
 'potential_evap',
 'precip',
 'barometric_pressure',
 'all_atmos_precip',
 'relative_humidity',
 'sea_level_press',
 'height_10_mb',
 'height_100_mb',
 'height_500_mb',
 'height_850_mb',
 'zonal_wind_250mb',
 'zonal_wind_925mb',
 'long_wind_250mb',
 'long_wind_925mb',
 'elevation_range',
 'region_bins']

## Prep again for modeling with all contest and high coef features

In [21]:
X_train, y_train, X_validate, y_validate, X_test, y_test = m.prep_for_model(train, validate, test, 'mean_temp', drivers2)

KeyError: "['region_Cfa', 'region_Cfb', 'region_Csa', 'region_Csb', 'region_Dfa', 'region_Dfb', 'region_Dfc', 'region_Dsb', 'region_Dsc', 'region_Dwa', 'region_Dwb', 'region_bins_Dry', 'region_bins_Temperate'] not in index"

In [None]:
m.regression_models(X_train, y_train, X_validate, y_validate)

In [None]:
m.best_model(X_train, y_train, X_validate, y_validate, X_test, y_test)

In [29]:
final_test = pd.read_csv('test_data.csv')

In [30]:
final_test

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,375734,0.0,0.833333,11/1/22,339.88,30.88,30.92,29.17,31.02,29.47,...,-19.28,-39.77,-29.25,40.88,-8.31,14.91,-24.62,31.05,-23.69,6.27
1,375735,0.0,0.833333,11/2/22,334.63,30.88,30.92,29.17,31.02,29.47,...,-19.58,-43.14,-28.62,45.37,-5.42,16.97,-23.94,28.84,-20.61,14.16
2,375736,0.0,0.833333,11/3/22,337.83,30.88,30.92,29.17,31.02,29.47,...,-13.73,-44.22,-27.67,49.76,-1.31,21.44,-19.06,26.85,-16.78,13.42
3,375737,0.0,0.833333,11/4/22,345.81,30.88,30.92,29.17,31.02,29.47,...,-7.97,-49.47,-19.32,52.62,-0.44,21.65,-23.12,23.70,-18.62,10.69
4,375738,0.0,0.833333,11/5/22,357.39,30.88,30.92,29.17,31.02,29.47,...,-0.80,-56.07,-9.89,51.23,-7.57,19.86,-30.56,20.66,-25.08,19.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31349,407083,1.0,0.866667,12/27/22,62.72,4.60,8.71,6.05,10.08,6.39,...,74.96,-8.49,32.39,38.82,7.42,11.75,-23.62,-0.24,-5.94,51.23
31350,407084,1.0,0.866667,12/28/22,73.41,4.60,8.71,6.05,10.08,6.39,...,88.57,0.83,26.23,37.64,13.01,17.84,-22.05,-3.03,1.31,51.45
31351,407085,1.0,0.866667,12/29/22,70.00,4.60,8.71,6.05,10.08,6.39,...,99.43,10.90,21.06,36.53,14.15,23.12,-25.60,-5.88,9.32,45.32
31352,407086,1.0,0.866667,12/30/22,79.81,4.60,8.71,6.05,10.08,6.39,...,109.39,21.37,20.42,36.05,6.38,29.00,-27.06,-1.42,16.06,31.88


In [31]:
final_test = w.prep_data(final_test)

In [32]:
final_test = w.rename_data(final_test)

In [33]:
final_test = w.create_elevation_bins(final_test)

In [34]:
final_test = w.create_region_bins(final_test)

In [35]:
scale_features= list(final_test.select_dtypes(include=np.number).columns)
final_test_scaled = final_test.copy()
minmax = pre.MinMaxScaler()
minmax.fit(final_test[scale_features])

final_test_scaled[scale_features] = pd.DataFrame(minmax.transform(final_test[scale_features]),
                                              columns=final_test[scale_features].columns.values).set_index([final_test.index.values])

In [54]:
final_test_scaled.shape

(31354, 247)

In [55]:
X_test = final_test_scaled[drivers]

In [57]:
#make list of cat variables to make dummies for
cat_vars = list(X_test.select_dtypes(exclude=np.number).columns)

dummy_df_test = pd.get_dummies(X_test[cat_vars], dummy_na=False, drop_first=[True, True])
X_test = pd.concat([X_test, dummy_df_test], axis=1).drop(columns=cat_vars)

In [59]:
pf = PolynomialFeatures(degree=2)
# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train)
# transform X_validate_scaled & X_test_scaled
X_test_degree2 = pf.transform(X_test)
# create the model object
lm2 = LinearRegression(normalize=True)
lm2.fit(X_train_degree2, y_train)

LinearRegression(normalize=True)

In [60]:
predictions = lm2.predict(X_test_degree2)

In [58]:
X_test.shape

(31354, 44)

In [61]:
predictions.shape

(31354,)

In [62]:
jan_27_predictions = pd.DataFrame({'contest-tmp2m-14d__tmp2m' : predictions, 
                     'index': final_test['index']})

In [63]:
jan_27_predictions

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,23.776459,375734
1,23.675446,375735
2,23.680573,375736
3,23.637299,375737
4,23.747833,375738
...,...,...
31349,-7.161102,407083
31350,-6.624542,407084
31351,-8.446381,407085
31352,-8.086761,407086


In [64]:
jan_27_predictions.to_csv('jan_27_predictions.csv', index=False)