In [3]:
# import libraries for class
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline


In [5]:
# import sklearn models needed for class
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold

In [87]:
class full_linear:
    """A class which automatically does all of the linear regression for you. 
    Defaults are test size 0.15, folds=6, n_alphas = 66 for lasso, logspace =66 for ridge, and l1 = 20 steps
    Created by LukeBetham"""        
    
    def __init__(self, X, y, test_size=0.15, folds=6, shuffle=True):
         
        #Set up the KFolds
        self.folds = folds
        self.shuffle = shuffle
    
        #Add option for train-test if selected
        self.X = X
        self.y = y
        self.test = test_size
        if self.test != 0:
            self.X, self.X_test, self.y, self.y_test = train_test_split(X, y, test_size=self.test, random_state=66)
        
        #Standardise the data
        scaler = StandardScaler()
        self.X = pd.DataFrame(scaler.fit_transform(self.X), columns=X.columns)
        if self.test != 0:
            self.X_test = pd.DataFrame(scaler.transform(self.X_test), columns=X.columns)
        
        # Set up linear Regresssion       
        self.model = LinearRegression()
        self.model_fitter()
        print("Linear Regression Test\nModel R2 Score:",self.score,"\nModel Test Score:",self.test_score,
             '\nCV Fold Score:',self.cv_score)
        self.coefs()
        print("\nUse .ridge(), .lasso() and .elastic_net() to run full regularisation tests.",
              "\nRun .coefs(n) to show n top coefficients and .df to return the coef dataframe.",
              "\nRun .resid_plot() to plot residuals and .resid_df to return dataframe")
        
        #show coefs   
    def coefs(self,show=6):
        df = pd.DataFrame(self.coef_)
        df.columns = ["coefs"]
        df['abs coef']=abs(df['coefs'])
        df['columns']=self.X.columns
        self.df = df.sort_values(by='abs coef',ascending=False)
        print("Highest Coefs:\n",self.df.head(show))
        print("Amount of dropped variables:",len(df['coefs'][df['coefs']==0]),"\nPercent of X variables dropped",
              round(len(df['coefs'][df['coefs']==0])/len(df['coefs'])*100),'%')

        # Ridge regularisation
    def ridge(self, lower= -10, upper=10, log = 66):
        model = RidgeCV(np.logspace(lower,upper,log),cv=KFold(self.folds, shuffle=self.shuffle, random_state=6))
        model.fit(self.X,self.y)
        self.alpha_ = model.alpha_
        self.model = Ridge(alpha=self.alpha_)
        self.model_fitter()
        print("Ridge Regularisation Test\nAlpha log range # used for RidgeCV fit: np.logspace(",lower,upper,log,")\nAlpha:",self.alpha_,
              "\nModel R2 Score:",self.score,"\nModel Test Score:",self.test_score,
              '\nCV Fold Score:',self.cv_score)
        self.coefs()
        
        #Lasso regularisation
    def lasso(self, n_alphas = 66):
        model = LassoCV(n_alphas=n_alphas,cv=KFold(self.folds, shuffle=self.shuffle, random_state=6))
        model.fit(self.X,self.y)
        self.alpha_ = model.alpha_
        self.model = Lasso(alpha=self.alpha_)
        self.model_fitter()
        print("Lasso Regularisation Test\nNumber of alphas used for LassoCV fit:",n_alphas,"\nAlpha output:",self.alpha_,
              "\nModel R2 Score:",self.score,"\nModel Test Score:",self.test_score,
              '\nCV Fold Score:',self.cv_score)
        self.coefs()
        
        #elastic_net regularisation
    def elastic_net(self, n_alphas = 66,l1 = 20):
        model = ElasticNetCV(l1_ratio=np.linspace(0.0001,1,l1), n_alphas=n_alphas, cv=KFold(self.folds,shuffle=self.shuffle, random_state=6))
        model.fit(self.X,self.y)
        self.alpha_ = model.alpha_
        self.l1_ratio_ = model.l1_ratio_
        self.model = ElasticNet(alpha=self.alpha_,l1_ratio=self.l1_ratio_)
        self.model_fitter()
        print("Elastic Net Regularisation Test\nNumber of alphas used for CV fit:",n_alphas,"\nAlpha output:",self.alpha_,
              '\nl1 iterations: np.linspace( 0.0001, 1,',l1,")\nl1 ratio value:",self.l1_ratio_,
              "\nModel R2 Score:",self.score,"\nModel Test Score:",self.test_score,'\nCV Fold Score:',self.cv_score)
        self.coefs()
        
    def model_fitter(self):
        self.model.fit(self.X, self.y)
        self.y_pred = self.model.predict(self.X)
        self.cv_score = (np.mean(cross_val_score(self.model, self.X, self.y, cv=KFold(self.folds, shuffle=self.shuffle, random_state=6))))
        self.score = self.model.score(self.X, self.y)
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        self.resids = self.y - self.y_pred
        self.resid_df = pd.DataFrame({'resids':self.resids, 'y_pred':self.y_pred, 'y':self.y},index=list(self.y.index))
        if self.test != 0:
            self.test_score = self.model.score(self.X_test, self.y_test)
        else:
            self.test_score = "None"
        
#     this isn't working yet
    def resid_plot(self):
        f, ax = plt.subplots(figsize=(8,8))
        ax = sns.scatterplot(self.y_pred,self.y,color="indigo");
        ax = sns.lineplot(x=(0,self.y.max()),y=(0,self.y.max()),color='thistle');
        ax = plt.xlabel("Predicted Y Values")
        ax = plt.ylabel("Actual Y Values")
        return ax


In [None]:
kobe = pd.read_csv('resource-datasets/kobe_bryant/kobe_superwide_games.csv')
y = kobe.pop('SHOTS_MADE')
X = kobe


In [90]:
nm = full_linear(X,y)


Linear Regression Test
Model R2 Score: 0.7835726764598443 
Model Test Score: -5.36452230583681e+25 
CV Fold Score: -1.7854337350623109e+28
Highest Coefs:
             coefs      abs coef                            columns
262  5.824193e+13  5.824193e+13        SEASON_OPPONENT:mil:2001-02
538 -4.902604e+13  4.902604e+13                     SEASON:2004-05
549 -4.673741e+13  4.673741e+13                     SEASON:2015-16
532 -4.433678e+13  4.433678e+13                     SEASON:1998-99
557  4.291533e+13  4.291533e+13            SHOT_ZONE_RANGE:24+_ft.
561 -4.277801e+13  4.277801e+13  SHOT_ZONE_BASIC:above_the_break_3
Amount of dropped variables: 1 
Percent of X variables dropped 0 %

Use .ridge(), .lasso() and .elastic_net() to run full regularisation tests. 
Run .coefs(n) to show n top coefficients and .df to return the coef dataframe. 
Run .resid_plot() to plot residuals and .resid_df to return dataframe


In [89]:
nm.resid_plot

<bound method full_linear.resid_plot of <__main__.full_linear object at 0x1a19348128>>

In [26]:
nm.lasso()

  tol, rng, random, positive)


Lasso Regularisation Test
Number of alphas used for LassoCV fit: 66 
Alpha output: 0.10960240490911179 
Model R2 Score: 0.6861277829847843 
Model Test Score: 0.6141667121969712 
CV Fold Score: 0.6499534123048463
Highest Coefs:
         coefs  abs coef                            columns
579  1.288412  1.288412       COMBINED_SHOT_TYPE:jump_shot
574  0.919108  0.919108           SHOT_TYPE:2pt_field_goal
566  0.556024  0.556024    SHOT_ZONE_BASIC:restricted_area
577  0.325345  0.325345            COMBINED_SHOT_TYPE:dunk
611 -0.234934  0.234934              ACTION_TYPE:jump_shot
561  0.200475  0.200475  SHOT_ZONE_BASIC:above_the_break_3
Amount of dropped variables: 587 
Percent of X variables dropped 91 %


In [27]:
nm.ridge(-5,5)

Ridge Regularisation Test
Alpha log range # used for RidgeCV fit: np.logspace( -5 5 66 )
Alpha: 1425.1026703030022 
Model R2 Score: 0.765379128249869 
Model Test Score: 0.5868594332279788 
CV Fold Score: 0.6233122944049794
Highest Coefs:
         coefs  abs coef                       columns
574  0.244078  0.244078      SHOT_TYPE:2pt_field_goal
579  0.219806  0.219806  COMBINED_SHOT_TYPE:jump_shot
582  0.214363  0.214363             SECONDS_REMAINING
569  0.213853  0.213853      SHOT_ZONE_AREA:center(c)
584  0.206771  0.206771                        PERIOD
577  0.197239  0.197239       COMBINED_SHOT_TYPE:dunk
Amount of dropped variables: 21 
Percent of X variables dropped 3 %


In [28]:
nm.elastic_net()

  tol, rng, random, positive)
  tol, rng, random, positive)


Elastic Net Regularisation Test
Number of alphas used for CV fit: 66 
Alpha output: 0.10960240490911179 
l1 iterations: np.linspace( 0.0001, 1, 20 )
l1 ratio value: 1.0 
Model R2 Score: 0.6861277829847843 
Model Test Score: 0.6141667121969712 
CV Fold Score: 0.6499534123048463
Highest Coefs:
         coefs  abs coef                            columns
579  1.288412  1.288412       COMBINED_SHOT_TYPE:jump_shot
574  0.919108  0.919108           SHOT_TYPE:2pt_field_goal
566  0.556024  0.556024    SHOT_ZONE_BASIC:restricted_area
577  0.325345  0.325345            COMBINED_SHOT_TYPE:dunk
611 -0.234934  0.234934              ACTION_TYPE:jump_shot
561  0.200475  0.200475  SHOT_ZONE_BASIC:above_the_break_3
Amount of dropped variables: 587 
Percent of X variables dropped 91 %
