# Importing and Splitting Data

In [1]:
# import packages
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Create a Python list to hold the headers for the data
_headers = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'car']

In [3]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter07/Dataset/car.data'

In [4]:
# load data
df = pd.read_csv(url_path, names=_headers, index_col=None)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
# show column information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   car       1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [6]:
# split the data
df_train, df_eval = train_test_split(df, train_size=0.8, random_state=0)

# Creating a Five-Fold Cross-Validation Dataset

In [7]:
# import package
from sklearn.model_selection import KFold

In [8]:
# create an instance of the class
n_splits = 5
_kf = KFold(n_splits=n_splits)

In [9]:
# split the data
indices = _kf.split(df)

In [10]:
# Get the first set of indices
train_indices, val_indices = next(indices)

The <b>next()</b> Python function on the generator function: Using <b>next()</b> is the way that you get a generator to return results to you. You asked for five splits, so you can call <b>next()</b> five times on this particular generator. Calling <b>next()</b> a sixth time will cause the Python runtime to raise an exception.

The call to <b>next()</b> yields a tuple. In this case, it is a pair of indices. The first one contains your training indices and the second one contains your validation indices. You assign these to <b>train_indices</b> and <b>val_indices</b>

In [11]:
# Create a training dataset 
train_df = df.drop(val_indices)

In [12]:
# create a validation dataset
val_df = df.drop(train_indices)

# Creating a Five-Fold Cross-Validation Dataset Using a Loop for Calls

In [13]:
# create two lists
_t, _v = [], []

In [14]:
_indices = _kf.split(df)

In [15]:
# Iterate over the generator and create DataFrames
for i in range(n_splits):
    train_idx, val_idx = next(_indices)
    _train_df = df.drop(val_idx)
    _t.append(_train_df)
    _val_df = df.drop(train_idx)
    _v.append(_val_df)

In [16]:
# iterate over the training list
for d in _t:
    print(d.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 346 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1382 non-null   object
 1   maint     1382 non-null   object
 2   doors     1382 non-null   object
 3   persons   1382 non-null   object
 4   lug_boot  1382 non-null   object
 5   safety    1382 non-null   object
 6   car       1382 non-null   object
dtypes: object(7)
memory usage: 86.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1382 non-null   object
 1   maint     1382 non-null   object
 2   doors     1382 non-null   object
 3   persons   1382 non-null   object
 4   lug_boot  1382 non-null   object
 5   safety    1382 non-null   object
 6   car       1382 non-null   object
dtypes: object(7)
memory usage: 86.4+ KB
None
<class 'pa

In [17]:
# iterate over the validation list
for d in _v:
    print(d.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 0 to 345
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    346 non-null    object
 1   maint     346 non-null    object
 2   doors     346 non-null    object
 3   persons   346 non-null    object
 4   lug_boot  346 non-null    object
 5   safety    346 non-null    object
 6   car       346 non-null    object
dtypes: object(7)
memory usage: 21.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 346 to 691
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    346 non-null    object
 1   maint     346 non-null    object
 2   doors     346 non-null    object
 3   persons   346 non-null    object
 4   lug_boot  346 non-null    object
 5   safety    346 non-null    object
 6   car       346 non-null    object
dtypes: object(7)
memory usage: 21.6+ KB
None
<class 'pandas

# Getting the Scores from Five-Fold Cross-Validation

In [18]:
# Encode the categorical variables in the dataset
_df = pd.get_dummies(df, columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
_df.head()

Unnamed: 0,car,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,1,0
1,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,1
2,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,1,0,0
3,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
4,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,1


In [19]:
# Split the data into features and labels
features = _df.drop(['car'], axis=1).values
labels = _df[['car']].values

In [20]:
# create a model
from sklearn.linear_model import LogisticRegression
_lr = LogisticRegression()

In [21]:
# Compute the cross-validation scores
from sklearn.model_selection import cross_val_score

_scores = cross_val_score(_lr, features, labels, cv=5)

In [22]:
# display the scores
print(_scores)

[0.70231214 0.84971098 0.74566474 0.75652174 0.76231884]


# Training a Logistic Regression Model Using Cross-Validation

In [23]:
# Import logistic regression with cross-validation
from sklearn.linear_model import LogisticRegressionCV

In [24]:
# create the model
model = LogisticRegressionCV(max_iter=2000, multi_class='auto', cv=5)

In [25]:
# train the model
model.fit(features, labels.ravel())

LogisticRegressionCV(cv=5, max_iter=2000)

In [26]:
# Evaluate the training R2
print(model.score(features, labels.ravel()))

0.9456018518518519


# Using Grid Search with Cross-Validation to Find the Best Parameters for a Model

In [27]:
# import packages
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [28]:
# create the model
clf = DecisionTreeClassifier()

In [29]:
# create parameters
params = {'max_depth': np.arange(1, 8)}

In [30]:
# instantiate the grid search
clf_cv = GridSearchCV(clf, param_grid=params, cv=5)

In [31]:
# train the model
clf_cv.fit(features, labels)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7])})

In [32]:
# Print the best parameter
print(f'Tuned Decision Tree Parameters: {clf_cv.best_params_}')

Tuned Decision Tree Parameters: {'max_depth': 2}


In [33]:
# Print the best R2
print(f'Best score is {clf_cv.best_score_}')

Best score is 0.7778822149618833


In [34]:
# Access the best model
model = clf_cv.best_estimator_
model

DecisionTreeClassifier(max_depth=2)

# Using Randomized Search for Hyperparameter Tuning

In [35]:
# import packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [36]:
# create the model
clf = RandomForestClassifier()

In [37]:
# create a dictionary of parameters
params = {'n_estimators': [500, 1000, 2000], 'max_depth': np.arange(1, 8)}

In [38]:
# instantiate a randomized search
clf_cv = RandomizedSearchCV(clf, param_distributions=params, cv=5)

In [39]:
# Perform the search
clf_cv.fit(features, labels.ravel())

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': array([1, 2, 3, 4, 5, 6, 7]),
                                        'n_estimators': [500, 1000, 2000]})

In [40]:
# Print the best parameter combination
print(f'Tuned Random Forest Parameters: {clf_cv.best_params_}')

Tuned Random Forest Parameters: {'n_estimators': 500, 'max_depth': 5}


In [41]:
# Inspect the best model
model = clf_cv.best_estimator_
model

RandomForestClassifier(max_depth=5, n_estimators=500)

# Fixing Model Overfitting Using Lasso Regression

In [42]:
# import packages
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [43]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter07/Dataset/ccpp.csv'

In [44]:
# load data
_df = pd.read_csv(url_path)
_df.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [45]:
# extract features (X) and labels (y)
X = _df.drop(['PE'], axis=1).values
y = _df['PE'].values

In [46]:
# split the data
X_train, X_eval, y_train, y_eval = train_test_split(X, y, train_size=0.8, random_state=0)

In [47]:
# create the model
lr_model_1 = LinearRegression()

In [48]:
# fit the model
lr_model_1.fit(X_train, y_train)

LinearRegression()

In [49]:
# make predictions
y_pred = lr_model_1.predict(X_eval)

In [50]:
# print R2 score
print(f'lr_model_1 R^2 score: {lr_model_1.score(X_eval, y_eval)}')

lr_model_1 R^2 score: 0.9325315554761302


In [51]:
# print the Mean Squared Error (MSE)
print(f'lr_model_1 MSE: {mean_squared_error(y_eval, y_pred)}')

lr_model_1 MSE: 19.733699303497648


In [52]:
# create a list for pipeline
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures()),
    ('lr', LinearRegression())
]

In [53]:
# create a instance of a pipeline
lr_model_2 = Pipeline(steps)

In [54]:
# train the instance of the pipeline
lr_model_2.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('poly', PolynomialFeatures()),
                ('lr', LinearRegression())])

In [55]:
# print the R^2 score of the model
print(f'lr_model_2 R^2 score: {lr_model_2.score(X_eval, y_eval)}')

lr_model_2 R^2 score: 0.9421687659462381


In [56]:
# make predictions
y_pred2 = lr_model_2.predict(X_eval)

In [57]:
# print the MSE of the model
print(f'lr_model_2 MSE: {mean_squared_error(y_eval, y_pred2)}')

lr_model_2 MSE: 16.914932472815142


In [58]:
# inspect the model coefficients
print(lr_model_2[-1].coef_)

[  0.         -85.42486107 -28.97818226  19.42423838  17.28833859
  19.21121733  23.48408016   3.18187061 -17.4628396   -4.13017095
   6.18669787   2.25194198 -12.32055597 -10.29852285 -10.53464642]


In [59]:
# Check for the number of coefficients in this model
print(len(lr_model_2[-1].coef_))

15


In [60]:
# Create a steps list with PolynomialFeatures of degree 10
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=10)),
    ('lr', LinearRegression())
]

In [61]:
# create the third model
lr_model_3 = Pipeline(steps)

In [62]:
# train the model
lr_model_3.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(degree=10)),
                ('lr', LinearRegression())])

In [63]:
# print the R2 score
print(f'lr_model_3 R^2 score: {lr_model_3.score(X_eval, y_eval)}')

lr_model_3 R^2 score: 0.5683464152933574


In [64]:
# make predictions
y_pred3 = lr_model_3.predict(X_eval)

In [65]:
# print the MSE
print(f'lr_model_3 MSE: {mean_squared_error(y_eval, y_pred3)}')

lr_model_3 MSE: 126.25342267768008


In [66]:
# print the number of coefficients
print(len(lr_model_3[-1].coef_))

1001


In [67]:
# Create a list of steps for the pipeline 
steps = [('scaler', MinMaxScaler()),\
         ('poly', PolynomialFeatures(degree=10)),\
         ('lr', Lasso(alpha=0.01))]

In [68]:
# create the model
lasso_model = Pipeline(steps)

In [69]:
# train the model
lasso_model.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(degree=10)),
                ('lr', Lasso(alpha=0.01))])

In [70]:
# print the R2 score of lasso_model
print(f'lasso_model R^2 score: {lasso_model.score(X_eval, y_eval)}')

lasso_model R^2 score: 0.9418109161488483


In [71]:
# make predictions
lasso_preds = lasso_model.predict(X_eval)

In [72]:
# print the MSE of lasso_model
print(f'lasso_model MSE: {mean_squared_error(y_eval, lasso_preds)}')

lasso_model MSE: 17.019599185488644


In [73]:
# print the number of coefficients
print(len(lasso_model[-1].coef_))

1001


# Fixing Model Overfitting Using Ridge Regression

In [74]:
from sklearn.linear_model import Ridge

In [75]:
# create a model
lr_model_1 = LinearRegression()

In [76]:
# train the model
lr_model_1.fit(X_train, y_train)

LinearRegression()

In [77]:
# make predictions
y_pred = lr_model_1.predict(X_eval)

In [78]:
# print R2 score
print(f'lr_model_1 R^2 score: {lr_model_1.score(X_eval, y_eval)}')

lr_model_1 R^2 score: 0.9325315554761302


In [79]:
# print the MSE
print(f'lr_model_1 MSE: {mean_squared_error(y_eval, y_pred)}')

lr_model_1 MSE: 19.733699303497648


In [80]:
# create a list
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('lr', LinearRegression())
]

In [81]:
# create a pipeline
lr_model_2 = Pipeline(steps)

In [82]:
# train the instance of the pipeline
lr_model_2.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(degree=3)),
                ('lr', LinearRegression())])

In [83]:
# print the R2 score of the model
print(f'lr_model_2 R^2 score: {lr_model_2.score(X_eval, y_eval)}')

lr_model_2 R^2 score: 0.9443678654045207


In [84]:
# make predictions
y_pred2 = lr_model_2.predict(X_eval)

In [85]:
# print the MSE of the model
print(f'lr_model_2 MSE: {mean_squared_error(y_eval, y_pred2)}')

lr_model_2 MSE: 16.271722632207666


In [86]:
# check the number of coefficients of the model
print(len(lr_model_2[-1].coef_))

35


In [87]:
# Create a steps list with PolynomialFeatures of degree 10
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=10)),
    ('lr', LinearRegression())
]

In [88]:
# create the model
lr_model_3 = Pipeline(steps)

In [89]:
# train the model
lr_model_3.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(degree=10)),
                ('lr', LinearRegression())])

In [90]:
# print the R2 score
print(f'lr_model_3 R^2 score: {lr_model_3.score(X_eval, y_eval)}')

lr_model_3 R^2 score: 0.5683464152933574


In [91]:
# make predictions
y_pred3 = lr_model_3.predict(X_eval)

In [92]:
# print the MSE of the model
print(f'lr_model_3 MSE: {mean_squared_error(y_eval, y_pred3)}')

lr_model_3 MSE: 126.25342267768008


In [93]:
# check the number of coefficients of the model
print(len(lr_model_3[-1].coef_))

1001


In [94]:
# create a list
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=10)),
    ('lr', Ridge(alpha=0.9))
]

In [95]:
# create the model
ridge_model = Pipeline(steps)

In [96]:
# train the model
ridge_model.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(degree=10)),
                ('lr', Ridge(alpha=0.9))])

In [97]:
# print the R2 score
print(f'ridge_model R^2 score: {ridge_model.score(X_eval, y_eval)}')

ridge_model R^2 score: 0.9451949082623449


In [98]:
# make predictions
ridge_pred = ridge_model.predict(X_eval)

In [99]:
# print the MSE for the model
print(f'ridge_model MSE: {mean_squared_error(y_eval, ridge_pred)}')

ridge_model MSE: 16.029822656854975


In [100]:
# check the number of coefficients
print(len(ridge_model[-1].coef_))

1001
