### Install Scikit-Learn

In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


### Import Libraries

In [2]:
import pandas as pd
import numpy as np

### Example of PCA
##### PCA only works with numeric data, so convert every column to a numeric type (integer or float) before applying it

In [3]:
# Load the feature-engineering demo dataset straight from the course repository
demo_csv_url = 'https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/module_5_feature_engineering_demo.csv'
df = pd.read_csv(demo_csv_url)

In [4]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Height,Weight,Calories_Intake,Exercise_Time,BMI,Age,Resting_Heart_Rate
0,0,187.640523,81.119254,1733.539473,4.593274,23.039374,70,54.540176
1,1,174.001572,87.849478,1644.014918,3.568722,29.015686,38,63.510746
2,2,179.78738,61.553704,2523.067529,2.885513,19.043018,76,49.502603
3,3,192.408932,72.094281,2020.81276,3.25163,19.473784,35,72.31022
4,4,188.67558,74.561067,2459.594194,1.789144,20.945017,49,67.864721


In [5]:
# Check column dtypes and non-null counts: PCA needs every column to be numeric
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1000 non-null   int64  
 1   Height              1000 non-null   float64
 2   Weight              1000 non-null   float64
 3   Calories_Intake     1000 non-null   float64
 4   Exercise_Time       1000 non-null   float64
 5   BMI                 1000 non-null   float64
 6   Age                 1000 non-null   int64  
 7   Resting_Heart_Rate  1000 non-null   float64
dtypes: float64(6), int64(2)
memory usage: 62.6 KB


In [6]:
# Drop the leftover 'Unnamed: 0' column. Reassigning instead of using
# inplace=True, together with errors='ignore', keeps this cell safe to re-run:
# a second run is a no-op rather than a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Display the frame (pandas abbreviates the middle rows) to confirm the
# 'Unnamed: 0' column is gone
df

Unnamed: 0,Height,Weight,Calories_Intake,Exercise_Time,BMI,Age,Resting_Heart_Rate
0,187.640523,81.119254,1733.539473,4.593274,23.039374,70,54.540176
1,174.001572,87.849478,1644.014918,3.568722,29.015686,38,63.510746
2,179.787380,61.553704,2523.067529,2.885513,19.043018,76,49.502603
3,192.408932,72.094281,2020.812760,3.251630,19.473784,35,72.310220
4,188.675580,74.561067,2459.594194,1.789144,20.945017,49,67.864721
...,...,...,...,...,...,...,...
995,174.128708,71.955016,3539.588332,2.496110,23.731234,36,58.531050
996,168.016011,98.030468,2046.267009,4.779559,34.726398,32,63.266300
997,170.941923,73.168677,2403.797896,2.969428,25.039637,20,52.155337
998,158.523891,47.161972,1893.742128,4.577088,18.767331,72,57.429598


In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features first so that columns measured on large scales
# (e.g. Calories_Intake) do not dominate the components
scaler = StandardScaler()
df_std = scaler.fit_transform(df)

# Fit PCA keeping every component and project the standardized data onto them
pca = PCA()
components = pca.fit_transform(df_std)

# fit_transform returns a plain array; wrap it in a dataframe with one
# PC1..PCn column per original feature
pc_names = [f'PC{k}' for k in range(1, len(df.columns) + 1)]
df_pca = pd.DataFrame(components, columns=pc_names)

# Share of the total variance captured by each component
variance_ratio = pca.explained_variance_ratio_
print('Explained variance ratio:', variance_ratio)

# Running total of the explained variance
cumsum_variance = variance_ratio.cumsum()
print('Cumulative explained variance ratio:', cumsum_variance)

# Preview the transformed data
df_pca.head()

Explained variance ratio: [0.28857346 0.18072185 0.14675488 0.14381463 0.13558216 0.10353415
 0.00101888]
Cumulative explained variance ratio: [0.28857346 0.46929531 0.61605019 0.75986482 0.89544697 0.99898112
 1.        ]


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
0,-0.395268,-1.182948,1.791319,2.336967,0.037477,0.366259,-0.018202
1,0.819082,-0.205962,2.084828,0.028116,-0.058311,0.446441,-0.061576
2,-1.171866,-0.469183,-0.324207,1.639464,-0.240676,-0.935364,0.040073
3,-1.106793,0.798769,2.060352,0.535101,1.421538,0.537776,0.082936
4,-0.7725,1.500282,0.797973,0.753864,1.115887,-0.730499,0.036682


The explained variance ratio shows that the last principal component (PC7) explains only about 0.001 (0.1%) of the total variance, so we can probably drop it: it captures almost no information and adds no useful insight to our analysis.

In [9]:
# Repeat the analysis, this time keeping only the first 2 principal components.
# n_components may instead be a float strictly between 0 and 1: the share of
# the variance that the retained components must explain.
pca = PCA(n_components = 2)
projected = pca.fit_transform(df_std)

# fit_transform returns a plain array; wrap it in a dataframe
# (columns keep pandas' default integer labels)
df_pca = pd.DataFrame(projected)

# Share of the total variance captured by each retained component
variance_ratio = pca.explained_variance_ratio_
print('Explained variance ratio:', variance_ratio)

# Running total of the explained variance
cumsum_variance = variance_ratio.cumsum()
print('Cumulative explained variance ratio:', cumsum_variance)

# Preview the transformed data
df_pca.head()

Explained variance ratio: [0.28857346 0.18072185]
Cumulative explained variance ratio: [0.28857346 0.46929531]


Unnamed: 0,0,1
0,-0.395268,-1.182948
1,0.819082,-0.205962
2,-1.171866,-0.469183
3,-1.106793,0.798769
4,-0.7725,1.500282


We can see that with 2 components, only about 47% of the variability in the data is explained.

### Example of Feature Engineering with Pandas

In [10]:
# Load the bike-sharing dataset straight from the course repository
bikes_csv_url = 'https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/bikes_sharing.csv'
bikes = pd.read_csv(bikes_csv_url)

In [11]:
# Preview the first five rows of the raw bike-sharing data
bikes.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


Registered means people have an account and rent the bicycle regularly. Casual means people don't have an account and just rent casually using their credit cards.

In [12]:
# Parse the 'datetime' column into pandas datetime values so that the
# .dt accessor can be used on it afterwards
parsed_datetime = pd.to_datetime(bikes['datetime'])
bikes['datetime'] = parsed_datetime

In [13]:
# Derive calendar features from the timestamp, starting with the hour
# (month and day follow in the next cells)
timestamps = bikes['datetime']
bikes['hour'] = timestamps.dt.hour

In [14]:
# Month number taken from the timestamp
timestamps = bikes['datetime']
bikes['month'] = timestamps.dt.month

In [15]:
# Day of the month taken from the timestamp
timestamps = bikes['datetime']
bikes['day'] = timestamps.dt.day

In [16]:
# Create function to classify amounts
def total_cat(x):
    """Bucket a rental count into a labelled range.

    Parameters
    ----------
    x : int or float
        Non-negative number of rentals.

    Returns
    -------
    str
        '0-10' for 0 <= x < 10, '10-50' for 10 <= x < 50,
        '50-100' for 50 <= x < 100 and '100+' for x >= 100.

    Raises
    ------
    ValueError
        If x is negative or NaN. Such values previously fell through to the
        final branch and were silently labelled '100+'.
    """
    # `not x >= 0` is true for negative numbers and also for NaN,
    # because every comparison with NaN is False.
    if not x >= 0:
        raise ValueError(f'rental count must be a non-negative number, got {x!r}')
    if x < 10:
        return '0-10'
    if x < 50:
        return '10-50'
    if x < 100:
        return '50-100'
    return '100+'


In [17]:
# Label every row with the rental-count range returned by total_cat()
rental_groups = bikes['count'].map(total_cat)
bikes['rental_total_group'] = rental_groups

In [18]:
# Check the new hour, month, day and rental_total_group columns
bikes.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,day,rental_total_group
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,0,1,1,10-50
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,1,10-50
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2,1,1,10-50
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,3,1,1,10-50
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,4,1,1,0-10


In [19]:
# Two-input classifier: label a day from its temperature and humidity
def good_bad(temp,hum):
    """Return a day label for the given temperature and humidity.

    'too hot'   -> temp above 25 and humidity above 70
    'so so day' -> temp at most 25 and humidity between 50 and 70 (inclusive)
    'good day'  -> every other combination
    """
    if temp > 25 and hum > 70:
        return 'too hot'
    if temp <= 25 and 50 <= hum <= 70:
        return 'so so day'
    return 'good day'


In [20]:
def application_function(x):
    """Classify one row as a day type via good_bad().

    Parameters
    ----------
    x : mapping-like row (e.g. a row passed by DataFrame.apply(axis=1))
        Must provide 'temp' and 'humidity' entries.

    Returns
    -------
    The label returned by good_bad() for that row.
    """
    # The result has to be returned explicitly; without `return` the function
    # always yields None and is useless as an apply() callback.
    return good_bad(x['temp'], x['humidity'])

In [21]:
# Classify each row (axis=1) from its temp and humidity values
day_labels = bikes.apply(lambda row: good_bad(row['temp'], row['humidity']), axis=1)
bikes['day_type'] = day_labels

In [22]:
# Check the new day_type column
bikes.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,day,rental_total_group,day_type
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,0,1,1,10-50,good day
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,1,10-50,good day
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2,1,1,10-50,good day
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,3,1,1,10-50,good day
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,4,1,1,0-10,good day


In [23]:
# Dummy variables - convert season to dummies; first - rename season

season_mapping = {1:'winter', 2:'spring', 3:'summer', 4:'fall'}
# Values that are not keys of season_mapping pass through unchanged, so
# re-running this cell after the codes were already replaced by names leaves
# the column alone instead of turning every entry into NaN.
bikes['season'] = bikes['season'].map(lambda code: season_mapping.get(code, code))


In [24]:
# Check that season now holds names instead of numeric codes
bikes.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,day,rental_total_group,day_type
0,2011-01-01 00:00:00,winter,0,0,1,9.84,14.395,81,0.0,3,13,16,0,1,1,10-50,good day
1,2011-01-01 01:00:00,winter,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,1,10-50,good day
2,2011-01-01 02:00:00,winter,0,0,1,9.02,13.635,80,0.0,5,27,32,2,1,1,10-50,good day
3,2011-01-01 03:00:00,winter,0,0,1,9.84,14.395,75,0.0,3,10,13,3,1,1,10-50,good day
4,2011-01-01 04:00:00,winter,0,0,1,9.84,14.395,75,0.0,0,1,1,4,1,1,0-10,good day


In [25]:
# Create season dummies: one indicator column per season name.
# dtype=int keeps the indicators as 0/1 on every pandas version
# (pandas >= 2.0 would otherwise produce True/False columns).
season_dummies = pd.get_dummies(bikes['season'], dtype=int)

In [26]:
# Preview the indicator columns
season_dummies.head(n=5)

Unnamed: 0,fall,spring,summer,winter
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [27]:
# Append the dummy columns to the main frame. Copies left over from a previous
# run of this cell are dropped first (errors='ignore' makes that a no-op on the
# first run), so re-running does not duplicate the season columns.
bikes = pd.concat(
    [bikes.drop(columns=season_dummies.columns, errors='ignore'), season_dummies],
    axis=1,
)

In [28]:
# Check the appended fall/spring/summer/winter indicator columns
bikes.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,count,hour,month,day,rental_total_group,day_type,fall,spring,summer,winter
0,2011-01-01 00:00:00,winter,0,0,1,9.84,14.395,81,0.0,3,...,16,0,1,1,10-50,good day,0,0,0,1
1,2011-01-01 01:00:00,winter,0,0,1,9.02,13.635,80,0.0,8,...,40,1,1,1,10-50,good day,0,0,0,1
2,2011-01-01 02:00:00,winter,0,0,1,9.02,13.635,80,0.0,5,...,32,2,1,1,10-50,good day,0,0,0,1
3,2011-01-01 03:00:00,winter,0,0,1,9.84,14.395,75,0.0,3,...,13,3,1,1,10-50,good day,0,0,0,1
4,2011-01-01 04:00:00,winter,0,0,1,9.84,14.395,75,0.0,0,...,1,4,1,1,0-10,good day,0,0,0,1
