In [3]:
#importing the necessary libraries

import pandas as pd #used to store and manipulate tabular data
import numpy as np #used to do array calculations
import matplotlib.pyplot as plt #used for visualizing

from sklearn.model_selection import train_test_split #used to split the data into training and testing sets
from sklearn.preprocessing import LabelEncoder, StandardScaler #LabelEncoder encodes categorical labels as ints; StandardScaler rescales each feature to mean = 0 and s.d. = 1 so that all features are on the same scale
from sklearn import metrics #metrics contains many evaluation functions used to evaluate the model's performance
from sklearn.svm import SVC #support vector classifier, used to classify
from sklearn.metrics import mean_absolute_error as mae #lower mean absolute error equals better model performance
from sklearn.linear_model import LinearRegression, Lasso, Ridge #linear regression models
from sklearn.ensemble import RandomForestRegressor #Ensemble is a ML technique that combines multiple ml models to produce one stronger, more accurate model

import warnings
warnings.filterwarnings('ignore') #'ignore' is the filter action: every warning is suppressed from here on (this can also hide useful deprecation notices)

from datetime import datetime

In [4]:
#reading the ola data

# Raw string (r'...') so the backslashes in the Windows path are taken literally.
# NOTE(review): this absolute path is machine-specific; the notebook will not run elsewhere as-is.
ola_csv_path = r'C:\Users\Jackson Jesse\Desktop\everything\Sem 5\MLOps\ML\Ola ride request forecasting\ola.csv'
df = pd.read_csv(ola_csv_path)
df.head()

Unnamed: 0,datetime,season,weather,temp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,3,2,7.653428,122.430885,18.783703,5,128,315.731778
1,2011-01-01 01:00:00,4,3,13.269244,101.703802,14.035245,36,184,340.457093
2,2011-01-01 02:00:00,1,3,30.886922,77.583743,45.36508,34,97,224.278595
3,2011-01-01 03:00:00,3,1,10.463377,114.925671,25.347168,6,47,132.169549
4,2011-01-01 04:00:00,3,4,30.214783,149.193592,20.288982,14,199,315.867137


In [5]:
#Feature Engineering - deriving calendar features from the raw 'datetime' column

# Parse the timestamps once with pandas instead of splitting the text by hand.
# The parsed values live in a local Series, so df['datetime'] itself is left
# untouched and this cell can be re-run whether that column currently holds
# strings or datetimes.
timestamps = pd.to_datetime(df['datetime'])

df['date'] = timestamps.dt.day.astype('int')    # day of the month
df['time'] = timestamps.dt.hour.astype('int')   # hour of the day (first two digits of HH:MM:SS)
df['year'] = timestamps.dt.year.astype('int')
df['month'] = timestamps.dt.month.astype('int')

In [6]:
#Ride frequency can differ between weekdays and weekends, so the calendar
#parts needed to tell the two apart are extracted here

stamps = pd.to_datetime(df['datetime'])
df['datetime'] = stamps  # column now holds real datetimes instead of text
for part in ('year', 'month', 'day'):
    df[part] = getattr(stamps.dt, part)

def weekend_or_weekday(year, month, day):
    """Classify a calendar date as weekday or weekend.

    Returns 1 for Monday-Friday, 0 for Saturday/Sunday, and NaN when
    ``datetime(year, month, day)`` rejects the values with a ValueError
    (for example 30 February).
    """
    try:
        day_index = datetime(year, month, day).weekday()
    except ValueError:
        return np.nan
    # weekday() numbers the days from Monday = 0 to Sunday = 6
    return 0 if day_index > 4 else 1

# Vectorised replacement for the row-wise df.apply(...): dayofweek runs from
# Monday = 0 to Sunday = 6, so values below 5 are weekdays (1) and the rest
# are weekend days (0) - the same encoding weekend_or_weekday() produces.
df['weekday'] = (pd.to_datetime(df['datetime']).dt.dayofweek < 5).astype('int64')

In [7]:
# Show the last five rows to check the newly derived columns.
df.tail(5)

Unnamed: 0,datetime,season,weather,temp,humidity,windspeed,casual,registered,count,date,time,year,month,day,weekday
10881,2012-03-29 09:00:00,4,4,,,,48,28,177.565718,29,9,2012,3,29,1
10882,2012-03-29 10:00:00,3,4,16.149249,69.572176,24.725833,4,1,113.712131,29,10,2012,3,29,1
10883,2012-03-29 11:00:00,3,3,20.303661,148.447082,40.606129,38,10,209.684018,29,11,2012,3,29,1
10884,2012-03-29 12:00:00,2,4,17.425577,135.793223,50.788697,36,15,149.063403,29,12,2012,3,29,1
10885,2012-03-29 13:00:00,2,3,36.732634,103.274013,15.918189,39,178,368.789538,29,13,2012,3,29,1


In [8]:
#flag whether an hour falls after 11 (i.e. noon onwards)
def am_or_pm(x):
    """Return 1 when ``x`` is greater than 11, otherwise 0."""
    return 1 if x > 11 else 0
# Apply the flag element-wise to the hour column, then preview the result.
df['am_or_pm'] = df['time'].map(am_or_pm)
df.head()

Unnamed: 0,datetime,season,weather,temp,humidity,windspeed,casual,registered,count,date,time,year,month,day,weekday,am_or_pm
0,2011-01-01 00:00:00,3,2,7.653428,122.430885,18.783703,5,128,315.731778,1,0,2011,1,1,0,0
1,2011-01-01 01:00:00,4,3,13.269244,101.703802,14.035245,36,184,340.457093,1,1,2011,1,1,0,0
2,2011-01-01 02:00:00,1,3,30.886922,77.583743,45.36508,34,97,224.278595,1,2,2011,1,1,0,0
3,2011-01-01 03:00:00,3,1,10.463377,114.925671,25.347168,6,47,132.169549,1,3,2011,1,1,0,0
4,2011-01-01 04:00:00,3,4,30.214783,149.193592,20.288982,14,199,315.867137,1,4,2011,1,1,0,0


In [9]:
!pip install holidays




[notice] A new release of pip is available: 24.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import holidays
# Holiday calendar for country code "IN"; is_holiday() below looks dates up in it.
india_holidays = holidays.country_holidays("IN")
def is_holiday(x):
    """Return 1 if the calendar date of ``x`` has an entry in ``india_holidays``, else 0.

    ``x`` must provide a ``.date()`` method; the lookup uses that date only.
    """
    holiday_name = india_holidays.get(x.date())
    return 1 if holiday_name else 0
# One 0/1 holiday flag per row, computed from the row's timestamp.
df['holidays'] = df['datetime'].map(is_holiday)

In [11]:
# Display only the rows flagged as holidays.
holiday_mask = df['holidays'] == 1
df.loc[holiday_mask]

Unnamed: 0,datetime,season,weather,temp,humidity,windspeed,casual,registered,count,date,time,year,month,day,weekday,am_or_pm,holidays
600,2011-01-26 00:00:00,1,2,20.208552,107.409911,43.646428,28,80,231.358440,26,0,2011,1,26,1,0,1
601,2011-01-26 01:00:00,4,1,18.585673,144.183377,43.066567,47,27,199.328597,26,1,2011,1,26,1,0,1
602,2011-01-26 02:00:00,4,1,32.974643,145.631040,18.416455,11,63,198.343445,26,2,2011,1,26,1,0,1
603,2011-01-26 03:00:00,2,2,25.602134,151.530090,56.246775,22,94,261.839586,26,3,2011,1,26,1,0,1
604,2011-01-26 04:00:00,3,4,,,,49,71,263.042359,26,4,2011,1,26,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9979,2012-02-20 19:00:00,4,2,-0.824501,71.195778,40.706095,24,9,81.824753,20,19,2012,2,20,1,1,1
9980,2012-02-20 20:00:00,4,3,25.538828,87.980406,44.385331,37,83,180.623188,20,20,2012,2,20,1,1,1
9981,2012-02-20 21:00:00,2,1,25.907977,124.649552,26.719506,24,52,111.223960,20,21,2012,2,20,1,1,1
9982,2012-02-20 22:00:00,4,2,27.222301,59.872830,27.184512,16,126,199.597787,20,22,2012,2,20,1,1,1


In [12]:
# 'date' duplicates the 'day' column, so it is removed. errors='ignore' keeps
# the cell re-runnable: a second run no longer raises KeyError once the column
# is already gone.
df = df.drop(columns='date', errors='ignore')

# Step 3
Exploratory Data Analysis - EDA
analyzing data visually

In [24]:
# Count the missing (NaN) values in each column.
df.isna().sum()

datetime         0
season           0
weather          0
temp          1632
humidity      1632
windspeed     1632
casual           0
registered       0
count            0
time             0
year             0
month            0
day              0
weekday          0
am_or_pm         0
holidays         0
dtype: int64

In [None]:
#checking the relationship between the ride request count and the day, time, or month

