## Importing Libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## 1. Info of the dataset

In [11]:
df = pd.read_csv(r"C:\Users\HP\Desktop\Folders\Coding\Practice\DataScience_Tasks\Bike_sharing data\hour.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB
None


## 2. Describe of the dataset

In [12]:
print(df.describe())

          instant        season            yr          mnth            hr  \
count  17379.0000  17379.000000  17379.000000  17379.000000  17379.000000   
mean    8690.0000      2.501640      0.502561      6.537775     11.546752   
std     5017.0295      1.106918      0.500008      3.438776      6.914405   
min        1.0000      1.000000      0.000000      1.000000      0.000000   
25%     4345.5000      2.000000      0.000000      4.000000      6.000000   
50%     8690.0000      3.000000      1.000000      7.000000     12.000000   
75%    13034.5000      3.000000      1.000000     10.000000     18.000000   
max    17379.0000      4.000000      1.000000     12.000000     23.000000   

            holiday       weekday    workingday    weathersit          temp  \
count  17379.000000  17379.000000  17379.000000  17379.000000  17379.000000   
mean       0.028770      3.003683      0.682721      1.425283      0.496987   
std        0.167165      2.005771      0.465431      0.639357      0.

## 3. Find any duplicate data of the dataset

In [13]:
duplicates = df[df.duplicated()]
print(duplicates)

Empty DataFrame
Columns: [instant, dteday, season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, casual, registered, cnt]
Index: []


## 4. Slicing of the list(row and column both)

In [14]:
print("Slicing: Rows 10 to 15 and columns ['dteday', 'hr', 'temp', 'cnt']\n")

Slicing: Rows 10 to 15 and columns ['dteday', 'hr', 'temp', 'cnt']



In [15]:
print(df.loc[10:15, ['dteday', 'hr', 'temp', 'cnt']])

        dteday  hr  temp  cnt
10  2011-01-01  10  0.38   36
11  2011-01-01  11  0.36   56
12  2011-01-01  12  0.42   84
13  2011-01-01  13  0.46   94
14  2011-01-01  14  0.46  106
15  2011-01-01  15  0.44  110


## 5. Linear regression (single and multiple)

### a. Single

In [16]:
X_single = df[['temp']]
y = df['cnt']
lr_single = LinearRegression().fit(X_single, y)

print(f"  Coefficient: {lr_single.coef_[0]:.2f}, Intercept: {lr_single.intercept_:.2f}")

  Coefficient: 381.29, Intercept: -0.04


### b. Multiple

In [17]:
features_multi = ['temp', 'hum', 'windspeed', 'hr', 'weekday', 'workingday']
X_multi = df[features_multi]
y = df['cnt']
lr_multi = LinearRegression().fit(X_multi, y)

for feature, coef in zip(features_multi, lr_multi.coef_):
    print(f"  {feature}: {coef:.2f}")
print(f"  Intercept: {lr_multi.intercept_:.2f}")

  temp: 329.15
  hum: -206.34
  windspeed: -2.41
  hr: 7.49
  weekday: 1.80
  workingday: 5.06
  Intercept: 60.39


## 6. Logistic regression 

In [22]:
df['high_usage'] = (df['cnt'] > df['cnt'].mean()).astype(int)
X_log = df[features_multi]
y_log = df['high_usage']

X_train, X_test, y_train, y_test = train_test_split(X_log, y_log, test_size=0.3, random_state=42)
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)

print(f"  Logistic Regression Accuracy: {log_reg.score(X_test, y_test):.2f}")

  Logistic Regression Accuracy: 0.76


## 7. Confusion matrix

In [23]:
y_pred_log = log_reg.predict(X_test)
cm = confusion_matrix(y_test, y_pred_log)

print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[2673  521]
 [ 756 1264]]


## 8. Any error (MSE, MAE)

In [24]:
y_pred_reg = lr_multi.predict(X_multi)

mse = mean_squared_error(y, y_pred_reg)
mae = mean_absolute_error(y, y_pred_reg)

print("Error Metrics:")
print(f"  Mean Squared Error (MSE): {mse:.2f}")
print(f"  Mean Absolute Error (MAE): {mae:.2f}")

Error Metrics:
  Mean Squared Error (MSE): 22186.92
  Mean Absolute Error (MAE): 108.98


## 9. Gradient descent

In [25]:
X_gd = df['temp'].values
y_gd = df['cnt'].values
m, b = 0.0, 0.0
lr = 0.01
epochs = 1000
n = len(X_gd)

for i in range(epochs):
    y_pred = m * X_gd + b
    error = y_pred - y_gd
    m_grad = (2/n) * np.dot(error, X_gd)
    b_grad = (2/n) * np.sum(error)
    m -= lr * m_grad
    b -= lr * b_grad

print("Gradient Descent for Linear Regression (cnt ~ temp):")
print(f"  Slope (m): {m:.2f}, Intercept (b): {b:.2f}")

Gradient Descent for Linear Regression (cnt ~ temp):
  Slope (m): 214.05, Intercept (b): 85.61
