In [1]:
#imports
import wrangle
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,explained_variance_score
from sklearn.linear_model import LinearRegression,LassoLars,TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error,explained_variance_score
from sklearn.feature_selection import SelectKBest, RFE, f_regression

In [2]:
df = wrangle.bee_merged()
df.head()

Unnamed: 0,state,year,season,beekeepers,total_loss,average_loss,starting_colonies,colonies_lost,ending_colonies,beekeepers_exclusive_to_state,colonies_exclusive_to_state,colonies_net_gain,beekeeper_colony_ratio,ansi,latitude,longitude
0,alabama,2022,annual,33,36.488812,34.260096,316,212,369,100.0,100.0,53,11.181818,1,32.806671,-86.79113
1,arkansas,2022,annual,17,49.411765,52.869897,148,126,129,100.0,100.0,-19,7.588235,5,34.969704,-92.373123
2,california,2022,annual,60,20.706772,46.2057,42034,11555,44248,100.0,100.0,2214,737.466667,6,36.116203,-119.681564
3,colorado,2022,annual,90,53.227771,40.620432,379,437,384,100.0,100.0,5,4.266667,8,39.059811,-105.311104
4,connecticut,2022,annual,28,33.862434,38.147562,88,64,125,100.0,100.0,37,4.464286,9,41.597782,-72.755371


In [3]:

def split_data(df):
    '''This function will input dataframe and split into train,validate,test'''
    #split dataframw into 80% train  and 20% test 
    train_validate, test = train_test_split(df, test_size=.2, random_state=825)
     #split train further into 75% train, 25% validate
    train, validate = train_test_split(train_validate, test_size=.25, random_state=825)
    
    #return train,validate,test back to function
    return train,validate,test

In [4]:
#split data
train,validate,test = split_data(df)
train.shape,validate.shape,test.shape

((291, 16), (97, 16), (98, 16))

In [5]:
def scale_data(train,validate,test,columns):
    #make the scaler
    scaler = MinMaxScaler()
    #fit the scaler at train data only
    scaler.fit(train[columns])
    #tranforrm train, validate and test
    train_scaled = scaler.transform(train[columns])
    validate_scaled = scaler.transform(validate[columns])
    test_scaled = scaler.transform(test[columns])
    
    # Generate a list of the new column names with _scaled added on
    scaled_columns = [col+"_scaled" for col in columns]
    
    #concatenate with orginal train, validate and test
    scaled_train = pd.concat([train.reset_index(drop = True),pd.DataFrame(train_scaled,columns = scaled_columns)],axis = 1)
    scaled_validate = pd.concat([validate.reset_index(drop = True),pd.DataFrame(validate_scaled, columns = scaled_columns)], axis = 1)
    scaled_test= pd.concat([test.reset_index(drop = True),pd.DataFrame(test_scaled,columns = scaled_columns)],axis = 1)
    
    return scaled_train,scaled_validate,scaled_test

In [6]:
columns = [col for col in df.drop(columns = ["state","season","colonies_lost"])]
scaled_train,scaled_validate,scaled_test = scale_data(train,validate,test,columns)

In [7]:
scaled_train.head(1)

Unnamed: 0,state,year,season,beekeepers,total_loss,average_loss,starting_colonies,colonies_lost,ending_colonies,beekeepers_exclusive_to_state,...,average_loss_scaled,starting_colonies_scaled,ending_colonies_scaled,beekeepers_exclusive_to_state_scaled,colonies_exclusive_to_state_scaled,colonies_net_gain_scaled,beekeeper_colony_ratio_scaled,ansi_scaled,latitude_scaled,longitude_scaled
0,michigan,2016,annual,192,52.263736,58.693341,1088,1189,1086,100.0,...,0.673037,0.021437,0.022121,0.0,0.0,0.474747,0.008215,0.462963,0.792495,0.712385


In [8]:
def select_kbest(X,y,k):
    #make the thing
    kbest = SelectKBest(f_regression, k=k)
    #fit the thing
    kbest.fit(X,y)
    features = X.columns[kbest.get_support()]
    return features
    

In [9]:
X= scaled_train[[col for col in scaled_train.columns if col.endswith("scaled")]]
y = scaled_train[["colonies_lost"]]

In [10]:
select_kbest(X,y, k =5)

Index(['beekeepers_scaled', 'starting_colonies_scaled',
       'ending_colonies_scaled', 'colonies_net_gain_scaled',
       'beekeeper_colony_ratio_scaled'],
      dtype='object')

In [11]:
def select_rfe(X,y,  n_features_to_select = 5):
    #create the model
    rfe=RFE(LinearRegression(), n_features_to_select = n_features_to_select) 
    #fit the model
    rfe.fit(X,y)
    #use get_support()
    return X.columns[rfe.get_support()]

In [12]:
features = select_rfe(X,y,  n_features_to_select = 5)
features

Index(['beekeepers_scaled', 'starting_colonies_scaled',
       'ending_colonies_scaled', 'colonies_net_gain_scaled',
       'beekeeper_colony_ratio_scaled'],
      dtype='object')

In [13]:
X_train = scaled_train[features]
y_train = scaled_train[["colonies_lost"]]
X_validate = scaled_validate[features]
y_validate = scaled_validate[["colonies_lost"]]
X_test = scaled_test[features]
y_test = scaled_test[["colonies_lost"]]

In [14]:
X_train.head(2)

Unnamed: 0,beekeepers_scaled,starting_colonies_scaled,ending_colonies_scaled,colonies_net_gain_scaled,beekeeper_colony_ratio_scaled
0,0.276758,0.021437,0.022121,0.474747,0.008215
1,0.059633,0.024104,0.019189,0.441524,0.036673


In [15]:
def get_baseline_RMSE(y_train,y_validate):
    '''
    this function will calculate baseline mean and baseline median and calculate RMSE from mean and median
    '''
    #get mean of target from train
    y_train["baseline_mean"] = y_train.colonies_lost.mean()
    #get median of target from train
    y_train["baseline_median"] =y_train.colonies_lost.median()
    #get mean of target from validate
    y_validate["baseline_mean"] = y_validate.colonies_lost.mean()
    #get median from target from validate
    y_validate["baseline_median"] =y_validate.colonies_lost.median()
    
    #calculate RMSE 
    RMSE_train_mean=mean_squared_error(y_train.colonies_lost,y_train.baseline_mean, squared = False)
    RMSE_validate_mean=mean_squared_error(y_validate.colonies_lost,y_validate.baseline_mean, squared = False)

    print("RMSE using Mean on \nTrain: ", round(RMSE_train_mean,2), "\nValidate: ", round(RMSE_validate_mean,2))
    print()

    #calculate RMSE
    RMSE_train_median= mean_squared_error(y_train.colonies_lost,y_train.baseline_median, squared = False)
    RMSE_validate_median= mean_squared_error(y_validate.colonies_lost,y_validate.baseline_median, squared = False)

    print("RMSE using Median on \nTrain: ", round(RMSE_train_median,2), "\nValidate: ", round(RMSE_validate_median,2))

In [16]:
get_baseline_RMSE(y_train,y_validate)

RMSE using Mean on 
Train:  2794.49 
Validate:  1715.12

RMSE using Median on 
Train:  2867.69 
Validate:  1837.86


In [19]:
def RMSE(X_train,y_train, X_validate, y_validate):
    '''
    this function will calculate baseline mean and baseline median and calculate RMSE from mean and median
    '''
    #get mean of target from train
    y_train["baseline_mean"] = y_train.colonies_lost.mean()
    #get median of target from train
    y_train["baseline_median"] =y_train.colonies_lost.median()
    #get mean of target from validate
    y_validate["baseline_mean"] = y_validate.colonies_lost.mean()
    #get median from target from validate
    y_validate["baseline_median"] =y_validate.colonies_lost.median()
    
    #calculate RMSE 
    RMSE_train_mean=mean_squared_error(y_train.colonies_lost,y_train.baseline_mean, squared = False)
    RMSE_validate_mean=mean_squared_error(y_validate.colonies_lost,y_validate.baseline_mean, squared = False)

    #print("RMSE using Mean on \nTrain: ", round(RMSE_train_mean,2), "\nValidate: ", round(RMSE_validate_mean,2))
    #print()

    #calculate RMSE
    RMSE_train_median= mean_squared_error(y_train.colonies_lost,y_train.baseline_median, squared = False)
    RMSE_validate_median= mean_squared_error(y_validate.colonies_lost,y_validate.baseline_median, squared = False)

    #print("RMSE using Median on \nTrain: ", round(RMSE_train_median,2), "\nValidate: ", round(RMSE_validate_median,2))
    
    #make a dataframe to capture model and RMSE 
    metric_df = pd.DataFrame(data=[
            {
                'model': 'Baseline', 
                'RMSE_train': RMSE_train_mean,
                'RMSE_validate': RMSE_validate_mean
                }
            ])
    
   
    
    # create the model object
    lm = LinearRegression(normalize = True)
    # Fit the model
    lm.fit(X_train, y_train.colonies_lost)
    # Predict y on train
    y_train['colonies_lost_pred_lm'] = lm.predict(X_train)
    # predict validate
    y_validate['colonies_lost_pred_lm'] = lm.predict(X_validate)
    
    # evaluate: train rmse
    rmse_train_lm= round(mean_squared_error(y_train.colonies_lost, y_train.colonies_lost_pred_lm,squared = False), 2)
    # evaluate: validate rmse
    rmse_validate_lm= round(mean_squared_error(y_validate.colonies_lost, y_validate.colonies_lost_pred_lm,squared = False),2)

    #append model and RMSE from OLS model to metric dataframe
    metric_df = metric_df.append({
    'model': 'OLS Regressor', 
    'RMSE_train': rmse_train_lm,
    'RMSE_validate': rmse_validate_lm,
    }, ignore_index=True)
    
    
    # create the model object
    lars = LassoLars(alpha=1)
    # fit the model.
    lars.fit(X_train, y_train.colonies_lost)
    # predict train
    y_train['colonies_lost_pred_lars'] = lars.predict(X_train)
    # predict validate
    y_validate['colonies_lost_pred_lars'] = lars.predict(X_validate)
    # evaluate: train rmse
    rmse_train_lars = round(mean_squared_error(y_train.colonies_lost, y_train.colonies_lost_pred_lars, squared = False),2)
    # evaluate: validate rmse
    rmse_validate_lars= round(mean_squared_error(y_validate.colonies_lost, y_validate.colonies_lost_pred_lars,squared = False),2)

    #append model and RMSE from LASSOLARS model to metric dataframe
    metric_df = metric_df.append({
    'model': 'LASSOLARS(alpha = 1)', 
    'RMSE_train': rmse_train_lars,
    'RMSE_validate': rmse_validate_lars,
    }, ignore_index=True)
    
    
    
    # create the model object
    glm = TweedieRegressor(power=1, alpha=0)
    # fit the model to our training data.
    glm.fit(X_train, y_train.colonies_lost)
    # predict train
    y_train['colonies_lost_pred_glm'] = glm.predict(X_train)
    # predict validate
    y_validate['colonies_lost_pred_glm'] = glm.predict(X_validate)
    # evaluate: train rmse
    rmse_train_tw = round(mean_squared_error(y_train.colonies_lost, y_train.colonies_lost_pred_glm,squared = False),2)
    # evaluate: validate rmse
    rmse_validate_tw= round(mean_squared_error(y_validate.colonies_lost, y_validate.colonies_lost_pred_glm, squared = False),2)

    #append model and RMSE from GLM model to metric dataframe
    metric_df = metric_df.append({
    'model': 'Tweedie Regressor(power=1, alpha=0)', 
    'RMSE_train': rmse_train_tw,
    'RMSE_validate': rmse_validate_tw,
    }, ignore_index=True)
    
    
    
    
    #create model object
    pf= PolynomialFeatures(degree= 2)
    # fit and transform X_train_scaled
    X_train_degree5 = pf.fit_transform(X_train)
    # transform X_validate_scaled 
    X_validate_degree5 = pf.transform(X_validate)

    # create the model object
    lm5 = LinearRegression(normalize=True)
    # fit the model to our training data. We must specify the column in y_train,  
    lm5.fit(X_train_degree5, y_train.colonies_lost)
    # predict train
    y_train['colonies_lost_pred_lm5'] = lm5.predict(X_train_degree5)
    # predict validate
    y_validate['colonies_lost_pred_lm5'] = lm5.predict(X_validate_degree5)

    # evaluate: train rmse
    rmse_train_py= round(mean_squared_error(y_train.colonies_lost, y_train.colonies_lost_pred_lm5, squared = False),2)
    # evaluate: validate rmse
    rmse_validate_py= round(mean_squared_error(y_validate.colonies_lost, y_validate.colonies_lost_pred_lm5, squared = False) , 2)

    #append model and RMSE from Polynomial Regression model to metric dataframe
    metric_df = metric_df.append({
    'model': 'Polynomial Regression(degree = 2)', 
    'RMSE_train': rmse_train_py,
    'RMSE_validate': rmse_validate_py,
    }, ignore_index=True)
    
    
    print(metric_df)

In [20]:
RMSE(X_train,y_train, X_validate, y_validate)

                                 model   RMSE_train  RMSE_validate
0                             Baseline  2794.490927    1715.116523
1                        OLS Regressor  1212.600000     900.120000
2                 LASSOLARS(alpha = 1)  1213.010000     882.290000
3  Tweedie Regressor(power=1, alpha=0)  1557.260000    1614.440000
4    Polynomial Regression(degree = 2)   394.350000    3512.810000
