# Feature Engineering Exercise (Core)

In [1]:
# imports
# for the required section
import pandas as pd
import numpy as np

# for the optional section
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

pd.set_option('display.max_columns',200)
pd.set_option("display.max_info_rows", 800)
pd.set_option('display.max_info_columns',800)

from sklearn import set_config
set_config(transform_output='pandas')

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("data/bikeshare_train.csv")
df.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32


In [3]:
df = df.drop(columns=['casual','registered'])
df.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32


In [4]:
df['datetime'] = pd.to_datetime(df['datetime'])
df['datetime'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 10886 entries, 0 to 10885
Series name: datetime
Non-Null Count  Dtype         
--------------  -----         
10886 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 85.2 KB


In [5]:
df['month'] = df['datetime'].dt.month_name()
df['day of week'] = df['datetime'].dt.day_name()
df['hour'] = df['datetime'].dt.hour.astype('object')
df.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month,day of week,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2


In [6]:
df = df.drop(columns=['datetime','season'])
df.head(3)

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month,day of week,hour
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2


In [7]:
df['temp'] = df['temp'].apply(lambda x: (x*9/5) + 32)
df['atemp'] = df['atemp'].apply(lambda x: (x*9/5) + 32)
df.head(3)

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month,day of week,hour
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday,0
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday,1
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday,2


In [8]:
df['temp variance'] = df['temp'] - df['atemp']
df = df.drop(columns='atemp')
df.head(3)

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,count,month,day of week,hour,temp variance
0,0,0,1,49.712,81,0.0,16,January,Saturday,0,-8.199
1,0,0,1,48.236,80,0.0,40,January,Saturday,1,-8.307
2,0,0,1,48.236,80,0.0,32,January,Saturday,2,-8.307


### Optional

#### Functions

In [9]:
def classification_metrics(y_true, y_pred, label='',
                           output_dict=False, figsize=(8,4),
                           normalize='true', cmap='Blues',
                           colorbar=False):
    # Get the classification report
    report = classification_report(y_true, y_pred)
    ## Print header and report
    header = "-"*70
    print(header, f" Classification Metrics: {label}", header, sep='\n')
    print(report)
    ## CONFUSION MATRICES SUBPLOTS
    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    # create a confusion matrix  of raw counts
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                normalize=None, cmap='gist_gray', colorbar=colorbar,
                ax = axes[0],);
    axes[0].set_title("Raw Counts")
    # create a confusion matrix with the test data
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                normalize=normalize, cmap=cmap, colorbar=colorbar,
                ax = axes[1]);
    axes[1].set_title("Normalized Confusion Matrix")
    # Adjust layout and show figure
    fig.tight_layout()
    plt.show()
    # Return dictionary of classification_report
    if output_dict==True:
        report_dict = classification_report(y_true, y_pred, output_dict=True)
        return report_dict

In [10]:
def evaluate_classification(model, X_train, y_train, X_test, y_test,
                         figsize=(6,4), normalize='true', output_dict = False,
                            cmap_train='Blues', cmap_test="Reds",colorbar=False):
    # Get predictions for training data
    y_train_pred = model.predict(X_train)
    # Call the helper function to obtain regression metrics for training data
    results_train = classification_metrics(y_train, y_train_pred, #verbose = verbose,
                                     output_dict=True, figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_train,
                                     label='Training Data')
    print()
    # Get predictions for test data
    y_test_pred = model.predict(X_test)
    # Call the helper function to obtain regression metrics for test data
    results_test = classification_metrics(y_test, y_test_pred, #verbose = verbose,
                                  output_dict=True,figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_test,
                                    label='Test Data' )
    if output_dict == True:
        # Store results in a dataframe if ouput_frame is True
        results_dict = {'train':results_train,
                    'test': results_test}
        return results_dict

#### Models

In [11]:
og_df = pd.read_csv("data/bikeshare_train.csv").drop(columns=['casual','registered'])
og_df.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32


In [12]:
# for engineered data
target = 'count'
X = df.drop(columns=target).copy()
y = df[target].copy()
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)
# for the og data
og_X = og_df.drop(columns=target).copy()
og_y = og_df[target].copy()
og_X_train, og_X_test, og_y_train, og_y_test = train_test_split(og_X,og_y, random_state=42)

##### Engineered Data

In [13]:
# setting the numerical part of a pipeline
num_cols = X_train.select_dtypes('number').columns
scaler = StandardScaler()
num_pipe = make_pipeline(scaler)
num_tuple = ('numeric', num_pipe, num_cols)

In [14]:
# setting ohe and scalar methods
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler = StandardScaler()

In [15]:
ohe_cols = X_train.select_dtypes('object').columns
ohe_pipe = make_pipeline(encoder, scaler)
ohe_tuple = ('categorical', ohe_pipe, ohe_cols)

In [16]:
# setting up preprocessor
preprocessor = ColumnTransformer([num_tuple, ohe_tuple],
                                    verbose_feature_names_out=False, remainder='drop')

In [17]:
# Make an instance of the model with default parameters
rf = RandomForestClassifier(random_state=42)
rf_pipe = make_pipeline(preprocessor, rf)
rf_pipe.fit(X_train, y_train)

# Evaluate the default model
evaluate_classification(rf_pipe, X_train, y_train, X_test, y_test)

----------------------------------------------------------------------
 Classification Metrics: Training Data
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        80
           2       0.99      1.00      1.00       102
           3       1.00      0.99      1.00       106
           4       1.00      1.00      1.00       110
           5       1.00      1.00      1.00       129
           6       1.00      1.00      1.00        97
           7       1.00      1.00      1.00        88
           8       1.00      1.00      1.00        72
           9       1.00      1.00      1.00        61
          10       1.00      1.00      1.00        68
          11       1.00      1.00      1.00        72
          12       1.00      1.00      1.00        57
          13       1.00      1.00      1.00        48
          14       1.00      1.00      1.00        57
        

KeyboardInterrupt: 

##### OG Data

In [None]:
# setting the numerical part of a pipeline
og_num_cols = og_X_train.select_dtypes('number').columns
scaler = StandardScaler()
og_num_pipe = make_pipeline(scaler)
og_num_tuple = ('numeric', og_num_pipe, og_num_cols)

In [None]:
og_ohe_cols = og_X_train.select_dtypes('object').columns
og_ohe_pipe = make_pipeline(encoder, scaler)
og_ohe_tuple = ('categorical', og_ohe_pipe, og_ohe_cols)

In [None]:
# setting up preprocessor
og_preprocessor = ColumnTransformer([og_num_tuple, og_ohe_tuple],
                                    verbose_feature_names_out=False, remainder='drop')

In [None]:
# Make an instance of the model with default parameters
og_rf = RandomForestClassifier(random_state=42)
og_rf_pipe = make_pipeline(og_preprocessor, og_rf)
og_rf_pipe.fit(og_X_train, og_y_train)

# Evaluate the default model
evaluate_classification(og_rf_pipe, og_X_train, og_y_train, og_X_test, og_y_test)

**Did these feature engineering choices improve your ability to predict the 'count'?**- Answer
