## Regularization Methods in Python 

In [1]:
# Load the student maths dataset and print its column index and (rows, columns)
# shape as a quick sanity check that the file was read as expected.
import pandas as pd 
df = pd.read_csv('data/student_math.csv')
print(df.columns, df.shape)

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2',
       'Final_Grade', 'school_MS', 'sex_M', 'address_U', 'famsize_LE3',
       'Pstatus_T', 'Mjob_health', 'Mjob_other', 'Mjob_services',
       'Mjob_teacher', 'Fjob_health', 'Fjob_other', 'Fjob_services',
       'Fjob_teacher', 'reason_home', 'reason_other', 'reason_reputation',
       'guardian_mother', 'guardian_other', 'schoolsup_yes', 'famsup_yes',
       'paid_yes', 'activities_yes', 'nursery_yes', 'higher_yes',
       'internet_yes', 'romantic_yes'],
      dtype='object') (395, 42)


In [2]:
from sklearn.model_selection import train_test_split

# Target is the final grade; every remaining column is used as a predictor.
X = df.drop(columns=['Final_Grade'])
y = df['Final_Grade']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [3]:
from sklearn.linear_model import Lasso

# Fit an L1-regularised linear regression on the training split only.
# alpha is the regularisation strength; 0.05 is a hand-picked starting value
# (alpha is tuned with a grid search further down in the notebook).
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

In [4]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the fitted Lasso on the data it was trained on ...
pred_train = lasso.predict(X_train)
training_mse = mean_squared_error(y_train, pred_train)

# ... and on the held-out split; a noticeably larger value suggests overfitting.
pred_test = lasso.predict(X_test)
test_mse = mean_squared_error(y_test, pred_test)

print('Training error:', training_mse)
print('Test error:', test_mse)

Training error: 2.813207583885141
Test error: 4.474769444129441


## Tuning alpha with grid search (Lasso)

In [6]:
import numpy as np

# Candidate regularisation strengths: 100 log-spaced alphas from 1e-6 up to 1.0.
alpha_array = np.logspace(start=-6, stop=0, num=100)

# GridSearchCV expects a mapping (or list of mappings) from parameter name to
# the values to try; here the only tuned parameter is alpha.
tuned_parameters = [dict(alpha=alpha_array)]

In [7]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the alpha grid. Scores are negated MSE
# (scikit-learn maximises scores), and training scores are kept as well so
# they can be compared with the validation scores afterwards.
model = GridSearchCV(
    estimator=Lasso(),
    param_grid=tuned_parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

model.fit(X, y)

In [8]:
# Per-alpha scores averaged over the 5 CV folds. Because the search was scored
# with 'neg_mean_squared_error', both arrays hold negated MSE values (closer to
# zero is better). Train scores exist only because return_train_score=True.
test_scores = model.cv_results_['mean_test_score']
train_scores = model.cv_results_['mean_train_score']

In [9]:
# Best alpha found by the search and its mean cross-validated score; the score
# is negated MSE, so flip the sign to read it as an MSE.
print(model.best_params_, model.best_score_)

{'alpha': 0.12328467394420659} -3.7432949015591155


## Ridge

In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Reload the data so this section can be run on its own.
df = pd.read_csv('data/student_math.csv')
print(df.shape)

# Final grade is the target; all other columns are predictors.
X = df.drop(columns=['Final_Grade'])
y = df['Final_Grade']

# 100 log-spaced candidate alphas between 0.01 and 10000.
alpha_array = np.logspace(start=-2, stop=4, num=100)

# Parameter grid with alpha as the only tuned hyperparameter.
tuned_parameters = [dict(alpha=alpha_array)]

# 5-fold cross-validated grid search for Ridge, scored by negated MSE.
model = GridSearchCV(
    estimator=Ridge(),
    param_grid=tuned_parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
model.fit(X, y)

# Report the selected alpha and its mean cross-validated score.
print(model.best_params_, model.best_score_)


(395, 42)
{'alpha': 151.99110829529332} -3.8086096318338347


## Regularization on a logistic regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Unregularised logistic regression. The string 'none' was deprecated in
# scikit-learn 1.2 and removed in 1.4 (it now fails parameter validation when
# the model is fitted); the supported spelling is the Python object None.
logistic_no_regularization = LogisticRegression(penalty=None)

In [None]:
# L1 (lasso-style) penalty. The 'liblinear' solver is chosen because the default
# solver does not support L1. C is the inverse of the regularisation strength,
# so a smaller C means a stronger penalty.

logistic_lasso = LogisticRegression(penalty= 'l1', solver = 'liblinear', C = 0.1)

In [13]:
# L2 (ridge-style) penalty: no penalty argument is passed, so scikit-learn's
# default penalty applies — 'l2' per the LogisticRegression docs.

logistic_ridge = LogisticRegression(C = 0.1) 

In [14]:
# Elastic net: a mix of L1 and L2 penalties. l1_ratio sets the share of the L1
# term (0.2 here); the 'saga' solver is used because it supports this penalty.

logistic_elasticnet = LogisticRegression(penalty = 'elasticnet', solver = 'saga', C = 0.1, l1_ratio = 0.2 )

In [15]:
# 100 log-spaced candidate values of C between 0.001 and 100.
C_array = np.logspace(start=-3, stop=2, num=100)

# Parameter grid handed to GridSearchCV: C is the only tuned hyperparameter.
tuning_C = dict(C=C_array)

# L1-penalised logistic regression, searched with 5-fold CV scored on accuracy.
# The search is only configured here; nothing is fitted in this cell.
clf = LogisticRegression(penalty='l1', solver='liblinear')
gs = GridSearchCV(
    estimator=clf,
    param_grid=tuning_C,
    scoring='accuracy',
    cv=5,
)

## Grid search on a logistic regression

In [16]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candy dataset: 'chocolate' is the target. 'competitorname' is dropped from
# the predictors as well (presumably an identifier column — confirm).
df = pd.read_csv('data/candy-data.csv')
X = df.drop(columns=['chocolate', 'competitorname'])
y = df['chocolate']

# L2-penalised logistic regression with a generous iteration cap.
model = LogisticRegression(penalty='l2', random_state=42, max_iter=10000)

# 100 log-spaced candidate values of C between 0.001 and 100.
tuning_C = dict(C=np.logspace(start=-3, stop=2, num=100))

# 5-fold cross-validated grid search scored on accuracy.
gs = GridSearchCV(estimator=model, param_grid=tuning_C, scoring='accuracy', cv=5)
gs.fit(X, y)

# Best C and the mean cross-validated accuracy it achieved.
print(gs.best_params_)
print(gs.best_score_)

{'C': 2.420128264794381}
0.8823529411764705


In [17]:
# Shorter alternative: LogisticRegressionCV runs the search over C itself.
from sklearn.linear_model import LogisticRegressionCV

model = LogisticRegressionCV(
    Cs=np.logspace(start=-3, stop=2, num=100),
    penalty='l2',
    scoring='accuracy',
    cv=5,
    random_state=42,
    max_iter=10000,
)
model.fit(X, y)

# scores_[1] holds the per-fold accuracy for every C (class label 1); average
# over folds, then take the best value to compare with the grid search above.
print(model.C_, model.scores_[1].mean(axis=0).max())

[2.42012826] 0.8823529411764705
