In [21]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler,OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn import set_config
# Ask scikit-learn transformers to output pandas objects.
set_config(transform_output='pandas')
# NOTE(review): this value is overridden by the max_columns=200 call below.
pd.set_option('display.max_columns',100)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
import datetime as dt

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Display limits for wide frames and for df.info() output.
pd.set_option('display.max_columns',200)
pd.set_option("display.max_info_rows", 800)
pd.set_option('display.max_info_columns',800)

# NOTE(review): duplicate of the set_config import/call earlier in this cell.
from sklearn import set_config
set_config(transform_output='pandas')

In [2]:
# Read the bikeshare training CSV into the working frame and preview it
raw_csv_path = "Data/bikeshare_train - bikeshare_train.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# Remove the casual and registered columns (rebinding instead of inplace=True)
df = df.drop(columns=["casual", "registered"])
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,1


# Transform to Datetime Type

In [4]:
# Parse the datetime strings into real timestamps, then confirm the dtype
parsed_timestamps = pd.to_datetime(df["datetime"])
df["datetime"] = parsed_timestamps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 10 columns):
 #   Column      Dtype         
---  ------      -----         
 0   datetime    datetime64[ns]
 1   season      int64         
 2   holiday     int64         
 3   workingday  int64         
 4   weather     int64         
 5   temp        float64       
 6   atemp       float64       
 7   humidity    int64         
 8   windspeed   float64       
 9   count       int64         
dtypes: datetime64[ns](1), float64(3), int64(6)
memory usage: 850.6 KB


In [5]:
# Derive month name, weekday name, and hour from the datetime column
timestamp_parts = df["datetime"].dt
df = df.assign(**{
    "Month": timestamp_parts.month_name(),
    "Day of Week": timestamp_parts.day_name(),
    "Hour of Day": timestamp_parts.hour,
})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 13 columns):
 #   Column       Dtype         
---  ------       -----         
 0   datetime     datetime64[ns]
 1   season       int64         
 2   holiday      int64         
 3   workingday   int64         
 4   weather      int64         
 5   temp         float64       
 6   atemp        float64       
 7   humidity     int64         
 8   windspeed    float64       
 9   count        int64         
 10  Month        object        
 11  Day of Week  object        
 12  Hour of Day  int64         
dtypes: datetime64[ns](1), float64(3), int64(7), object(2)
memory usage: 1.1+ MB


In [6]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day of Week,Hour of Day
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


In [7]:
# Drop the datetime and season columns (rebinding instead of inplace=True)
df = df.drop(columns=["datetime", "season"])
df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day of Week,Hour of Day
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


In [8]:
# Store the hour as a string (presumably so it is treated as a label, not a number)
df = df.astype({"Hour of Day": "str"})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 11 columns):
 #   Column       Dtype  
---  ------       -----  
 0   holiday      int64  
 1   workingday   int64  
 2   weather      int64  
 3   temp         float64
 4   atemp        float64
 5   humidity     int64  
 6   windspeed    float64
 7   count        int64  
 8   Month        object 
 9   Day of Week  object 
 10  Hour of Day  object 
dtypes: float64(3), int64(5), object(3)
memory usage: 935.6+ KB


In [9]:
df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day of Week,Hour of Day
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


# Lambda Function

In [10]:
# Convert the `temp` column from Celsius to Fahrenheit with a lambda.
# NOTE: not idempotent - re-running this cell converts the values a second time.
celsius_temp = df["temp"]
df["temp"] = celsius_temp.apply(lambda degrees_c: degrees_c * (9/5) + 32)

In [11]:
# Same Celsius -> Fahrenheit conversion, applied to the `atemp` column.
# NOTE: not idempotent - re-running this cell converts the values a second time.
celsius_atemp = df["atemp"]
df["atemp"] = celsius_atemp.apply(lambda degrees_c: degrees_c * (9/5) + 32)

In [12]:
df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day of Week,Hour of Day
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday,0
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday,1
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday,2
3,0,0,1,49.712,57.911,75,0.0,13,January,Saturday,3
4,0,0,1,49.712,57.911,75,0.0,1,January,Saturday,4


# Create New Column

In [13]:
# Despite the column name, this is the plain difference temp - atemp,
# not a statistical variance.
temp_v = df["temp"].sub(df["atemp"])
df = df.assign(temp_variance=temp_v)

In [14]:
df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day of Week,Hour of Day,temp_variance
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday,0,-8.199
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday,1,-8.307
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday,2,-8.307
3,0,0,1,49.712,57.911,75,0.0,13,January,Saturday,3,-8.199
4,0,0,1,49.712,57.911,75,0.0,1,January,Saturday,4,-8.199


In [16]:
# Remove atemp from the working frame (rebinding instead of inplace=True)
df = df.drop(columns="atemp")

In [17]:
df.head()

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,count,Month,Day of Week,Hour of Day,temp_variance
0,0,0,1,49.712,81,0.0,16,January,Saturday,0,-8.199
1,0,0,1,48.236,80,0.0,40,January,Saturday,1,-8.307
2,0,0,1,48.236,80,0.0,32,January,Saturday,2,-8.307
3,0,0,1,49.712,75,0.0,13,January,Saturday,3,-8.199
4,0,0,1,49.712,75,0.0,1,January,Saturday,4,-8.199


# Test Model on Original and Engineered Data

In [15]:
# Reload the same CSV into a separate frame
# (presumably the un-engineered baseline for comparison - confirm)
original_csv_path = "Data/bikeshare_train - bikeshare_train.csv"
df_orig = pd.read_csv(original_csv_path)
df_orig.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [18]:
# Remove the casual and registered columns from the reloaded frame
df_orig = df_orig.drop(columns=["casual", "registered"])
df_orig.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,1


In [None]:
# Helper Functions
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def classification_metrics(y_true, y_pred, label='',
                           output_dict=False, figsize=(8,4),
                           normalize='true', cmap='Blues',
                           colorbar=False):
    """Print a classification report and plot two confusion matrices.

    The left panel shows raw counts (always un-normalized, 'gist_gray'
    colormap); the right panel uses the caller's `normalize` and `cmap`.

    Parameters
    ----------
    y_true, y_pred : true and predicted labels, passed straight to
        `classification_report` and `ConfusionMatrixDisplay.from_predictions`.
    label : text appended to the printed banner.
    output_dict : when equal to True, the report is also returned as a dict.
    figsize : figure size handed to `plt.subplots`.
    normalize : `normalize` argument for the right-hand confusion matrix.
    cmap : colormap for the right-hand confusion matrix.
    colorbar : whether either panel draws a colorbar.

    Returns
    -------
    dict or None
        `classification_report(..., output_dict=True)` when `output_dict`
        equals True; otherwise nothing is returned.
    """
    text_report = classification_report(y_true, y_pred)

    # Banner followed by the plain-text report
    rule = "-" * 70
    print("\n".join((rule, f" Classification Metrics: {label}", rule)))
    print(text_report)

    # One row, two panels: (axis, normalize mode, colormap, title)
    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    panel_specs = (
        (axes[0], None, 'gist_gray', "Raw Counts"),
        (axes[1], normalize, cmap, "Normalized Confusion Matrix"),
    )
    for panel_ax, norm_mode, colormap, panel_title in panel_specs:
        ConfusionMatrixDisplay.from_predictions(
            y_true, y_pred,
            normalize=norm_mode, cmap=colormap, colorbar=colorbar,
            ax=panel_ax)
        panel_ax.set_title(panel_title)

    fig.tight_layout()
    plt.show()

    # Equality check kept as-is so non-bool truthy values behave as before
    if output_dict == True:
        return classification_report(y_true, y_pred, output_dict=True)


def evaluate_classification(model, X_train, y_train, X_test, y_test,
                            figsize=(6,4), normalize='true', output_dict=False,
                            cmap_train='Blues', cmap_test="Reds", colorbar=False):
    """Evaluate a fitted classifier on both the training and the test split.

    Calls `model.predict` on each split and passes the predictions to
    `classification_metrics`, which prints a report and plots confusion
    matrices for that split.

    Parameters
    ----------
    model : fitted estimator exposing `.predict`.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    figsize : figure size forwarded to `classification_metrics`.
    normalize : normalization mode forwarded to `classification_metrics`
        for the normalized confusion matrix.
    output_dict : when truthy, return the per-split report dictionaries.
    cmap_train, cmap_test : colormaps for the training / test figures.
    colorbar : whether the confusion matrices draw a colorbar.

    Returns
    -------
    dict or None
        `{'train': <report dict>, 'test': <report dict>}` when `output_dict`
        is truthy; otherwise nothing is returned.
    """
    # Get predictions and classification metrics for the training data
    y_train_pred = model.predict(X_train)
    results_train = classification_metrics(
        y_train, y_train_pred,
        output_dict=True, figsize=figsize,
        # Forward normalize: it was accepted but never passed on before,
        # so the caller's choice was silently ignored.
        normalize=normalize,
        colorbar=colorbar, cmap=cmap_train,
        label='Training Data')
    print()
    # Get predictions and classification metrics for the test data
    y_test_pred = model.predict(X_test)
    results_test = classification_metrics(
        y_test, y_test_pred,
        output_dict=True, figsize=figsize,
        normalize=normalize,
        colorbar=colorbar, cmap=cmap_test,
        label='Test Data')
    if output_dict:
        # Hand both report dictionaries back, keyed by split
        return {'train': results_train,
                'test': results_test}

### Original Data

In [19]:
## Separate features from the `count` target, then make the train/test split
# NOTE(review): this section is titled "Original Data", but the split uses the
# engineered `df`; `df_orig` is loaded above and never used - confirm intent.
X, y = df.drop(columns="count"), df["count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,Month,Day of Week,Hour of Day,temp_variance
2930,0,1,1,83.66,65,12.998,July,Monday,0,-6.975
7669,0,1,1,73.328,52,22.0028,May,Friday,22,-6.399
1346,0,1,1,54.14,61,6.0032,April,Friday,23,-6.498
9432,0,0,1,74.804,60,8.9981,September,Sunday,9,-6.291
453,0,1,3,46.76,93,12.998,February,Tuesday,23,-2.97


In [23]:

# Without PCA
# NOTE(review): fitting on X_train as-is raised the ValueError recorded below
# ("could not convert string to float: 'July'"): Month, Day of Week and
# Hour of Day are still strings and must be encoded before KNN can fit.
#knn = KNeighborsClassifier()

# Fit the classifier on the training split (no pipeline is built here)
#knn.fit(X_train, y_train)

ValueError: could not convert string to float: 'July'

In [None]:
#%%time
# Evaluate the fitted classifier on the train and test splits
# NOTE(review): X_train_tf / X_test_tf are not defined anywhere in the visible
# notebook - a preprocessing step has to create them before this can run.
#evaluate_classification(knn, X_train_tf, y_train, X_test_tf, y_test)