In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import sklearn.metrics as metric
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import sklearn.preprocessing as pre

import warnings
warnings.filterwarnings('ignore')

import wrangle1 as w
import model as m

from importlib import reload

## Prepping Data with all features

In [2]:
# Load the exploration dataset through the project's wrangle1 helper.
# NOTE(review): the data source and schema live in wrangle1.py, not in this notebook.
df = w.get_explore_data()

In [3]:
# Run wrangle1's preparation step. The same helper is applied to the contest
# test file further down, so both frames go through the same preparation.
df = w.prep_data(df)

In [4]:
# Apply wrangle1's column renaming.
# Presumably the source of short names such as 'sea_level_press' used in the
# feature list below -- confirm in wrangle1.py.
df = w.rename_data(df)

In [5]:
#df = w.create_elevation_bins(df)

In [6]:
# Discard every row that has at least one missing value: rows with nulls
# cannot be passed to the model.
df = df.dropna(axis=0, how='any')

In [7]:
# split_data is unpacked into three pieces; only the first two are kept.
# The third is discarded because the `test` frame used for predictions is
# loaded separately from test_data.csv.
# NOTE(review): binding `_` shadows IPython's "last output" variable for the
# rest of the session.
train, validate, _ = w.split_data(df)

In [8]:
# Load the rows to predict on. The path is relative, so test_data.csv must sit
# in the notebook's working directory.
test = pd.read_csv('test_data.csv')

In [9]:
# Same preparation helper that was applied to the training data above.
test = w.prep_data(test)

In [10]:
# Same renaming helper that was applied to the training data above, so the
# column names match between the two frames.
test = w.rename_data(test)

In [11]:
# Feature columns handed to m.prep_for_model for the quadratic model.
# Every name appears exactly once: the original list carried
# 'sea_level_press' twice. A repeated name in a feature list is at best
# redundant and, if the list is used for column selection, produces a
# duplicated (perfectly collinear) column that the degree-2 expansion then
# multiplies further.
drivers2 = [
    'nmme0-tmp2m-34w__cancm40',
    'nmme0-tmp2m-34w__gfdlflorb0',
    'nmme-tmp2m-56w__nmmemean',
    'nmme-prate-34w__nmmemean',
    'nmme0-prate-56w__nmme0mean',
    'nmme0-prate-34w__nmme0mean',
    'sea_level_press',
    'nmme-prate-56w__nmmemean',
    'nmme-tmp2m-34w__nmmemean',
    'nmme0mean',
    'wind-hgt-100-2010-1',
    'nmme0-tmp2m-34w__nmme0mean',
    'nmme-tmp2m-56w__cancm4',
    'nmme-tmp2m-56w__ccsm3',
    'nmme-tmp2m-56w__ccsm4',
    'nmme-tmp2m-56w__cfsv2',
    'nmme-tmp2m-56w__gfdlflora',
    'nmme-tmp2m-56w__gfdlflorb',
    'nmme0-prate-56w__cancm30',
    'nmme0-prate-56w__cancm40',
    'nmme0-prate-56w__ccsm30',
    'nmme0-prate-56w__ccsm40',
    'nmme0-prate-56w__cfsv20',
    'nmme0-prate-56w__gfdlflora0',
    'nmme0-prate-56w__gfdlflorb0',
    'nmme0-prate-56w__gfdl0',
    'nmme0-prate-56w__nasa0',
    'nmme0-prate-34w__cancm30',
    'nmme0-prate-34w__cancm40',
    'nmme0-prate-34w__ccsm30',
    'nmme0-prate-34w__ccsm40',
    'nmme0-prate-34w__cfsv20',
    'nmme0-prate-34w__gfdlflora0',
    'nmme0-prate-34w__gfdlflorb0',
    'nmme0-prate-34w__gfdl0',
    'nmme0-prate-34w__nasa0',
    'nmme-prate-56w__gfdlflorb',
    'nmme-tmp2m-34w__cancm3',
    'nmme-tmp2m-34w__ccsm3',
    'nmme-tmp2m-34w__ccsm4',
    'nmme-tmp2m-34w__cfsv2',
    'nmme-tmp2m-34w__gfdlflora',
    'nmme-tmp2m-34w__gfdlflorb',
    'nmme-tmp2m-34w__nasa',
    'cancm30',
    'cancm40',
    'cfsv20',
    'gfdlflora0',
    'gfdl0',
    'nasa0',
    'wind-hgt-500-2010-1',
    'region',
    'elevation',
    'lat',
    'lon',
    'potential_evap',
    'precip',
    'barometric_pressure',
    'all_atmos_precip',
    'relative_humidity',
    'height_10_mb',
    'height_100_mb',
    'height_500_mb',
    'height_850_mb',
    'zonal_wind_250mb',
    'zonal_wind_925mb',
    'long_wind_250mb',
    'long_wind_925mb',
]

## Prep again for modeling with all contest features and high-coefficient features

In [13]:
# Re-import model.py so edits made since the first `import model as m` are
# picked up without restarting the kernel.
reload(m)

<module 'model' from '/Users/fostermark/codeup-data-science/mirzakhani_shared/mark/model.py'>

In [14]:
# Build the modeling inputs with 'mean_temp' as the target and drivers2 as the
# feature list. Five objects come back; there is no y_test, so only train and
# validate can be scored in this notebook.
# NOTE(review): any scaling/encoding happens inside model.prep_for_model --
# confirm there before relying on the feature scale.
X_train, y_train, X_validate, y_validate, X_test = m.prep_for_model(train, validate, test, 'mean_temp', drivers2)

In [15]:
# Expand the prepared features into every degree-2 polynomial term.
pf = PolynomialFeatures(degree=2)
# Learn the expansion on the training split only, then apply that same
# expansion to the validate and test splits.
X_train_degree2 = pf.fit_transform(X_train)
X_validate_degree2 = pf.transform(X_validate)
X_test_degree2 = pf.transform(X_test)

# `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and
# removed in 1.2, where it raises TypeError. The replacement documented by
# scikit-learn is a StandardScaler(with_mean=False) step ahead of a plain
# LinearRegression. `lm2` keeps the same fit/predict interface.
lm2 = make_pipeline(pre.StandardScaler(with_mean=False), LinearRegression())
lm2.fit(X_train_degree2, y_train)

# Root-mean-squared error on each split that has a known target.
rmse_train = metric.mean_squared_error(
    y_train, lm2.predict(X_train_degree2)) ** .5
rmse_validate = metric.mean_squared_error(
    y_validate, lm2.predict(X_validate_degree2)) ** .5

metric_df = pd.DataFrame(data=[
        {
            'model': 'Quadratic',
            'RMSE_train': rmse_train,
            'RMSE_validate': rmse_validate
        }])

metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate
0,Quadratic,0.739497,0.753958


In [19]:
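# Disabled: this call needs y_test, which no visible cell defines
# (m.prep_for_model above returns only X_test). The table shown under this
# cell is not produced by the code in its current form.
#m.best_model(X_train, y_train, X_validate, y_validate, X_test, y_test)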

Unnamed: 0,model,RMSE_train,RMSE_validate,RMSE_test
0,Quadratic,0.739497,0.753958,0.749123


In [16]:
# In-sample predictions from the fitted quadratic model.
train_pred = lm2.predict(X_train_degree2)

In [17]:
# Display the training predictions as a quick sanity check.
train_pred

array([23.01771536, 10.85705363, 17.22423696, ...,  4.09293563,
        5.62000389,  9.47790119])

In [19]:
# Predictions for the test rows; these are the values written to the
# predictions CSV below.
test_pred = lm2.predict(X_test_degree2)

In [20]:
# Display the test predictions as a quick sanity check.
test_pred

array([29.06642541, 29.15045878, 29.19848625, ...,  2.57065308,
        2.5588159 ,  3.58491355])

In [24]:
# Pair each test prediction with the 'index' column of the test frame.
submission_columns = {
    'contest-tmp2m-14d__tmp2m': test_pred,
    'index': test['index'],
}
jan_27_4_predictions = pd.DataFrame(submission_columns)

In [25]:
# Preview the predictions table before writing it to disk.
jan_27_4_predictions

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,29.066425,375734
1,29.150459,375735
2,29.198486,375736
3,29.296406,375737
4,29.420557,375738
...,...,...
31349,3.663703,407083
31350,3.693476,407084
31351,2.570653,407085
31352,2.558816,407086


In [26]:
# Write the predictions to a CSV in the working directory. index=False keeps
# the DataFrame's own row labels out of the file, leaving only the two columns.
jan_27_4_predictions.to_csv('jan_27_4_predictions.csv', index=False)