In [10]:
# standard and user-defined functions imports
import pandas as pd
import numpy as np
import wrangle
import explore
import regression_models as model
import os

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# stats and modeling imports
from scipy import stats
from math import sqrt
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,explained_variance_score, r2_score
from sklearn.linear_model import LinearRegression,LassoLars,TweedieRegressor
from sklearn.feature_selection import SelectKBest, RFE, f_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [2]:
# acquire time series dataframe using function from wrangle.py and save to a variable
ts_df = wrangle.ts_bee_prep()

In [4]:
# split data
train, validate, test = wrangle.ts_split(ts_df)

In [5]:
# set year column as datetime index
train = train.set_index('year').sort_index()
validate = validate.set_index('year').sort_index()
test = test.set_index('year').sort_index()

In [6]:
# numeric columns to be scaled
columns = [col for col in train.drop(columns = ["state","season","colonies_lost"])]

In [7]:
#run scale data function to scale our numeric columns
scaled_train, scaled_validate, scaled_test = model.scale_data(train,validate,test,columns)

In [8]:
#create list of  features to train the regression model with
features = ['starting_colonies_scaled', 'ending_colonies_scaled','colonies_net_gain_scaled', 'beekeeper_colony_ratio_scaled']

In [9]:
# X_train will be subset of our scaled train data with features only
X_train = scaled_train[features]
# set target
y_train = scaled_train[["colonies_lost"]]
# X_validate will be subset of our scaled validate data with features only
X_validate = scaled_validate[features]
# set target
y_validate = scaled_validate[["colonies_lost"]]
# X_test will be subset of our scaled test data with features only
X_test = scaled_test[features]
#set target
y_test = scaled_test[["colonies_lost"]]

### Linear regression

In [11]:
# specify the parameters we wish to use as a dictionary, then use that dictionary when we create the class.
params = {'normalize': [ True, False],
          'fit_intercept': [True, False]}

In [12]:
grid = GridSearchCV(LinearRegression(), params, cv=5)

In [13]:
grid.fit(X_train,y_train.colonies_lost)

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid={'fit_intercept': [True, False],
                         'normalize': [True, False]})

In [14]:
grid.best_estimator_

LinearRegression(normalize=True)

In [23]:
def best_hyperparameter(X_train,y_train):
    '''This function will input X,y and variation in hyperparameters and
    outputs best hyperparameter'''
    #define hyperparamters
    params = {'normalize': [ True, False],
          'fit_intercept': [True, False]}
    #create grid object
    grid = GridSearchCV(LinearRegression(), params, cv=5)
    #fit grid object
    grid.fit(X_train,y_train.colonies_lost)
    #get best hyperparameters
    best = grid.best_estimator_
    
    #return output
    return best
    

In [25]:
best_hyperparameter(X_train,y_train)

LinearRegression(normalize=True)

### Lassolars

In [15]:
params = {
          'normalize': [True, False],
          'fit_intercept':[True, False],
           'alpha': [1,2,3,4]
         }

In [16]:
grid = GridSearchCV(LassoLars(), params, cv=5)

In [17]:
grid.fit(X_train,y_train.colonies_lost)

GridSearchCV(cv=5, estimator=LassoLars(),
             param_grid={'alpha': [1, 2, 3, 4], 'fit_intercept': [True, False],
                         'normalize': [True, False]})

In [18]:
grid.best_estimator_

LassoLars(alpha=1, normalize=True)

In [26]:
def best_hyperparameter(X_train,y_train):
    '''This function will input X,y and variation in hyperparameters and
    outputs best hyperparameter'''
    #define hyperparamters
    params = {
          'normalize': [True, False],
          'fit_intercept':[True, False],
           'alpha': [1,2,3,4]
         }
    #create grid object
    grid = GridSearchCV(LassoLars(), params, cv=5)
    #fit grid object
    grid.fit(X_train,y_train.colonies_lost)
    #get best hyperparameters
    best = grid.best_estimator_
    
    #return output
    return best
    

In [27]:
best_hyperparameter(X_train,y_train)

LassoLars(alpha=1, normalize=True)

### Tweedie

In [19]:
params = {
          'power': [0, 1,2,3,4,5],
           'fit_intercept' : [True , False],
          'warm_start': [True, False], 
           'alpha': [1,2,3,4,5]
}

In [20]:
grid = GridSearchCV(TweedieRegressor(), params, cv=5)

In [21]:
grid.fit(X_train,y_train.colonies_lost)

GridSearchCV(cv=5, estimator=TweedieRegressor(),
             param_grid={'alpha': [1, 2, 3, 4, 5],
                         'fit_intercept': [True, False],
                         'power': [0, 1, 2, 3, 4, 5],
                         'warm_start': [True, False]})

In [22]:
grid.best_estimator_

TweedieRegressor(alpha=5, power=1, warm_start=True)

In [47]:
def best_hyperparameter(X_train,y_train):
    '''This function will input X,y and variation in hyperparameters and
    outputs best hyperparameter'''
    #define hyperparameters
    params = {
          'power': [0, 1,2,3],
           'fit_intercept' : [True , False],
          'warm_start': [True, False], 
           'alpha': [1,2,3,4,5]
}
    #create grid object
    grid = GridSearchCV(TweedieRegressor(), params, cv=5,scoring = 'neg_root_mean_squared_error')
    #fit grid object
    grid.fit(X_train,y_train.colonies_lost)
    #get best parameters
    best = grid.best_estimator_
    
    #return output
    return best

In [48]:
best_hyperparameter(X_train,y_train)

TweedieRegressor(alpha=5, power=1, warm_start=True)