In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import sklearn.metrics as metric
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
import sklearn.preprocessing as pre

import warnings
warnings.filterwarnings('ignore')

import wrangle1 as w
import model as m

from importlib import reload

## Prepping Data with all features

In [4]:
df = w.get_explore_data()

In [5]:
df = w.prep_data(df)

In [6]:
df = w.rename_data(df)

In [7]:
#df = w.create_elevation_bins(df)

In [8]:
#df = w.create_region_bins(df)

In [9]:
#dropped nulls because they couldn't go into the model
df = df.dropna()

In [10]:
train, validate, test = w.split_data(df)

In [11]:
train.shape

(229192, 246)

## Splitting data for modeling

In [12]:
#drivers used are ALL columns beside startdate and mean_temp
drivers = list(train.columns)
drivers.remove('startdate')
drivers.remove('mean_temp')

In [13]:
X_train, y_train, X_validate, y_validate, X_test, y_test = m.prep_for_model(train, validate, test, 'mean_temp', drivers)

In [14]:
lm = LinearRegression(normalize=True)
lm.fit(X_train, y_train)

LinearRegression(normalize=True)

In [15]:
feature_coefs = pd.DataFrame({'feature' : lm.feature_names_in_, 
                     'weight': lm.coef_})

In [16]:
negative_coef = list(feature_coefs[feature_coefs.weight < -15].feature)

In [17]:
positive_coef = list(feature_coefs[feature_coefs.weight > 50].feature)

In [28]:
drivers2 = negative_coef + positive_coef + w.contest_features

In [27]:
reload(w)

<module 'wrangle1' from '/Users/fostermark/codeup-data-science/mirzakhani_shared/mark/wrangle1.py'>

In [30]:
drivers2

['nmme0-tmp2m-34w__cancm40',
 'nmme0-tmp2m-34w__gfdlflorb0',
 'nmme-tmp2m-56w__nmmemean',
 'nmme-prate-34w__nmmemean',
 'nmme0-prate-56w__nmme0mean',
 'nmme0-prate-34w__nmme0mean',
 'sea_level_press',
 'nmme-prate-56w__nmmemean',
 'nmme-tmp2m-34w__nmmemean',
 'nmme0mean',
 'wind-hgt-100-2010-1',
 'nmme0-tmp2m-34w__nmme0mean',
 'region',
 'elevation',
 'lat',
 'lon',
 'potential_evap',
 'precip',
 'barometric_pressure',
 'all_atmos_precip',
 'relative_humidity',
 'sea_level_press',
 'height_10_mb',
 'height_100_mb',
 'height_500_mb',
 'height_850_mb',
 'zonal_wind_250mb',
 'zonal_wind_925mb',
 'long_wind_250mb',
 'long_wind_925mb']

## Prep again for modeling with all contest and high coef features

In [31]:
X_train, y_train, X_validate, y_validate, X_test, y_test = m.prep_for_model(train, validate, test, 'mean_temp', drivers2)

In [41]:
X_train.shape

(229192, 43)

In [32]:
m.best_model(X_train, y_train, X_validate, y_validate, X_test, y_test)

Unnamed: 0,model,RMSE_train,RMSE_validate,RMSE_test
0,Quadratic,1.058332,1.065108,1.055392


In [33]:
final_test = pd.read_csv('test_data.csv')

In [34]:
final_test = w.prep_data(final_test)

In [35]:
final_test = w.rename_data(final_test)

In [36]:
scale_features= list(final_test.select_dtypes(include=np.number).columns)
final_test_scaled = final_test.copy()
minmax = pre.MinMaxScaler()
minmax.fit(final_test[scale_features])

final_test_scaled[scale_features] = pd.DataFrame(minmax.transform(final_test[scale_features]),
                                              columns=final_test[scale_features].columns.values).set_index([final_test.index.values])

In [37]:
final_test_scaled.shape

(31354, 245)

In [38]:
X_test = final_test_scaled[drivers2]

In [39]:
#make list of cat variables to make dummies for
cat_vars = list(X_test.select_dtypes(exclude=np.number).columns)

dummy_df_test = pd.get_dummies(X_test[cat_vars], dummy_na=False, drop_first=[True, True])
X_test = pd.concat([X_test, dummy_df_test], axis=1).drop(columns=cat_vars)

In [40]:
X_test.shape

(31354, 43)

In [42]:
pf = PolynomialFeatures(degree=2)
# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train)
# transform X_validate_scaled & X_test_scaled
X_test_degree2 = pf.transform(X_test)
# create the model object
lm2 = LinearRegression(normalize=True)
lm2.fit(X_train_degree2, y_train)

LinearRegression(normalize=True)

In [43]:
predictions = lm2.predict(X_test_degree2)

In [47]:
predictions

array([ 25.88600743,  25.78191308,  25.81701365, ..., -11.6920391 ,
       -12.9145658 , -11.21585552])

In [48]:
jan_27_3_predictions = pd.DataFrame({'index': final_test['index'], 'contest-tmp2m-14d__tmp2m' : predictions})

In [49]:
jan_27_3_predictions

Unnamed: 0,index,contest-tmp2m-14d__tmp2m
0,375734,25.886007
1,375735,25.781913
2,375736,25.817014
3,375737,25.843597
4,375738,25.929932
...,...,...
31349,407083,-9.275610
31350,407084,-9.659880
31351,407085,-11.692039
31352,407086,-12.914566


In [50]:
jan_27_3_predictions.to_csv('jan_27_3_predictions.csv', index=False)