In [1]:
!pip install category_encoders



In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import set_config
from sklearn.linear_model import LinearRegression

In [5]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip

--2024-08-27 16:22:06--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: 'bike+sharing+dataset.zip'

bike+sharing+datase     [  <=>               ] 273.43K  1.13MB/s    in 0.2s    

2024-08-27 16:22:06 (1.13 MB/s) - 'bike+sharing+dataset.zip' saved [279992]

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [32]:
df = pd.read_csv('hour.csv')

In [33]:
df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


# **Setting the categorical types**

In [34]:
df['day_night'] = df['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
df.drop(['instant', 'casual', 'registered'], axis=1, inplace=True)
df['dteday'] = pd.to_datetime(df.dteday)
df['season'] = df.season.astype('category')
df['holiday'] = df.holiday.astype('category')
df['weekday'] = df.weekday.astype('category')
df['weathersit'] = df.weathersit.astype('category')
df['workingday'] = df.workingday.astype('category')
df['mnth'] = df.mnth.astype('category')
df['yr'] = df.yr.astype('category')
df['hr'] = df.hr.astype('category')
df.drop(columns=['dteday'], inplace=True)

# **Creating interaction features**

In [35]:
# Creating interaction features
df['temp_hum'] = df['temp'] * df['hum']
df['temp_windspeed'] = df['temp'] * df['windspeed']
df['hum_windspeed'] = df['hum'] * df['windspeed']
# Created these features because they are weather related

In [36]:
# Separating features and target variable
X = df.drop(columns=['cnt']) # Features
y = df['cnt'] # Target

# **Checking for NaN values**

In [37]:
nan_summary = X.isna().sum()
nan_columns = nan_summary[nan_summary > 0]
print(nan_columns)

Series([], dtype: int64)


In [38]:
X

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,day_night,temp_hum,temp_windspeed,hum_windspeed
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,night,0.1944,0.000000,0.000000
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,night,0.1760,0.000000,0.000000
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,night,0.1760,0.000000,0.000000
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,night,0.1800,0.000000,0.000000
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,night,0.1800,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,night,0.1560,0.042692,0.098520
17375,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,night,0.1560,0.042692,0.098520
17376,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,night,0.1560,0.042692,0.098520
17377,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,night,0.1456,0.034918,0.075208


In [39]:
y

0         16
1         40
2         32
3         13
4          1
        ... 
17374    119
17375     89
17376     90
17377     61
17378     49
Name: cnt, Length: 17379, dtype: int64

In [41]:
categorical_features = ['season', 'weathersit', 'day_night']
X[categorical_features]

Unnamed: 0,season,weathersit,day_night
0,1,1,night
1,1,1,night
2,1,1,night
3,1,1,night
4,1,1,night
...,...,...,...
17374,1,2,night
17375,1,2,night
17376,1,1,night
17377,1,1,night


# **Target Encoding**

In [15]:
# Numerical features
numerical_features = ['temp', 'hum', 'windspeed']
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', MinMaxScaler())  # Normalize using MinMaxScaler
])

# Transforming numerical features
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])
y = y.astype(float)

# Categorical features
categorical_features = ['season', 'weathersit', 'day_night']
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent
    ('target', TargetEncoder())  # Apply TargetEncoder instead of OneHotEncoder
])

# Transforming categorical features
X_encoded = categorical_pipeline.fit_transform(X[categorical_features], y)
X_encoded.columns = ['season', 'weathersit', 'day_night']

# The categorical features are now encoded and can be directly concatenated back to the original data (if needed)
X[categorical_features] = X_encoded
X_new = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

In [16]:
X_encoded

Unnamed: 0,season,weathersit,day_night
0,111.114569,204.869272,98.894138
1,111.114569,204.869272,98.894138
2,111.114569,204.869272,98.894138
3,111.114569,204.869272,98.894138
4,111.114569,204.869272,98.894138
...,...,...,...
17374,111.114569,175.165493,98.894138
17375,111.114569,175.165493,98.894138
17376,111.114569,204.869272,98.894138
17377,111.114569,204.869272,98.894138


# **Splitting**

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

In [18]:
class Linear_Regression:
    def __init__(self, lr=0.001, n_iters=1000, random_state = 42):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        num_samples, num_features = X.shape

        # init parameters
        self.weights = np.zeros(num_features)
        self.bias = 0
        
        for _ in range(self.n_iters):
            y_pred = np.dot(X, self.weights) + self.bias

            dw = 2/(num_samples)*(np.dot(X.T, (y_pred - y)))
            db = 2/(num_samples)*(np.sum(y_pred - y))

            self.weights = self.weights - (self.lr * dw)
            self.bias = self.bias - (self.lr * db)

    def predict(self, X):
        y_pred = np.dot(X, self.weights) + self.bias
        return y_pred

In [19]:
X_train

Unnamed: 0,yr,mnth,hr,holiday,weekday,workingday,temp,atemp,hum,windspeed,temp_hum,temp_windspeed,hum_windspeed,season,weathersit,day_night
335,0,1,11,0,6,0,0.183673,0.1970,0.55,0.263195,0.1100,0.044780,0.123145,111.114569,204.869272,265.225933
7035,0,10,18,0,2,1,0.510204,0.5000,0.42,0.122840,0.2184,0.054340,0.043890,198.868856,204.869272,265.225933
8051,0,12,3,0,3,1,0.448980,0.4545,1.00,0.263195,0.4600,0.102994,0.223900,198.868856,111.579281,98.894138
2133,0,4,18,0,0,0,0.448980,0.4545,0.31,0.000000,0.1426,0.000000,0.000000,208.344069,175.165493,265.225933
8485,0,12,6,0,0,0,0.183673,0.2273,0.75,0.122840,0.1500,0.020900,0.078375,111.114569,204.869272,265.225933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,4,9,0,5,1,0.448980,0.4545,0.88,0.105325,0.4048,0.041216,0.078848,208.344069,204.869272,265.225933
11964,1,5,17,0,5,1,0.653061,0.6212,0.34,0.157870,0.2244,0.088638,0.045662,208.344069,204.869272,265.225933
5390,0,8,12,0,3,1,0.795918,0.7273,0.43,0.333373,0.3440,0.226880,0.121948,236.016237,204.869272,265.225933
860,0,2,7,0,2,1,0.224490,0.1970,0.65,0.491243,0.1560,0.100296,0.271635,111.114569,204.869272,265.225933


In [20]:
y_train

335       72.0
7035     518.0
8051       3.0
2133     172.0
8485       1.0
         ...  
11284    359.0
11964    812.0
5390     189.0
860      100.0
15795    779.0
Name: cnt, Length: 13903, dtype: float64

In [21]:
model = Linear_Regression(lr = 1e-5, n_iters = 100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [22]:
y_pred

array([-1.19449925e+11, -9.19015067e+10, -1.11071188e+11, ...,
       -1.23549271e+11, -1.49353351e+11, -1.11301143e+11])

In [23]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.6477619470821124e+22
R-squared: -5.203665063984268e+17


# **Using in-built Linear Regression**

In [24]:
model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

# **Evaluating the Model**

mse = mean_squared_error(y_test, y_pred1)
r2 = r2_score(y_test, y_pred1)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 14974.440654149745
R-squared: 0.5271041801718359


In [25]:
y_pred1

array([360.07785732, 112.3256242 , -21.33980801, ...,  91.04240451,
       267.15057364, 131.24166643])

In [26]:
final_pipeline = Pipeline([
('num_preprocess', numerical_pipeline),
('cat_preprocess', categorical_pipeline),
('model', Linear_Regression(n_iters=100, random_state=42))
])

In [28]:
from sklearn.preprocessing import OneHotEncoder

# **Using One-Hot Encoding to compare**

In [40]:
numerical_features = ['temp', 'hum', 'windspeed']
numerical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='mean')), # Impute missing values with mean
('scaler', MinMaxScaler()) # Normalize using MinMaxScaler
])
# Transforming above
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])
# Categorical features
categorical_features = ['season', 'weathersit', 'day_night']
categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(sparse_output=False, drop='first'))
])
# Transforming above
X_encoded1 = categorical_pipeline.fit_transform(X[categorical_features])
# Converting it to a dataframe

X_encoded1 = pd.DataFrame(X_encoded,
columns=categorical_pipeline.named_steps['onehot'].get_feature_names_out(categorical_features))
# Encoded categorical features + Numerical features
X_new1 = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_new1, y, test_size=0.2, random_state=42)

model2 = LinearRegression()
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

# **Evaluating the Model**

mse = mean_squared_error(y_test, y_pred2)
r2 = r2_score(y_test, y_pred2)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 14974.440654149745
R-squared: 0.5271041801718359


In [30]:
set_config(display='diagram')
final_pipeline