In [1]:
# 1. Load libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 2. Load dataset: read hour.csv (path is relative to the notebook's working directory) into `df`
df = pd.read_csv("hour.csv")

In [7]:
# Count the missing values in each column (all zeros means nothing to impute)
df.isna().sum()

season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64

In [21]:
def detect_outliers_iqr(df, column):
    """Flag values of one column that fall outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; it is only read, never modified.
    column : label
        Name of the column to inspect.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound, outlier_mask)`` where the bounds are
        ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` and ``outlier_mask`` is a
        boolean Series (aligned with ``df``) that is True for values strictly
        below the lower bound or strictly above the upper bound.
    """
    values = df[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1  # interquartile range

    lower_bound = q1 - 1.5 * spread
    upper_bound = q3 + 1.5 * spread

    # Values exactly on a fence are not treated as outliers.
    too_low = values < lower_bound
    too_high = values > upper_bound
    outlier_mask = too_low | too_high

    return lower_bound, upper_bound, outlier_mask

def handle_outliers(df, columns, method='remove'):
    """Remove or cap IQR outliers in the given columns, one column at a time.

    For every column the 1.5 * IQR bounds are obtained from
    ``detect_outliers_iqr`` on the *current* working frame, so in ``'remove'``
    mode rows dropped for an earlier column no longer influence the bounds
    computed for later columns. A one-line summary is printed per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. A copy is made first; the caller's frame is not modified.
    columns : iterable of str
        Column names to process, in order (``.upper()`` is called on each name
        for the printed summary).
    method : {'remove', 'cap'}, default 'remove'
        ``'remove'`` drops the rows flagged as outliers; ``'cap'`` keeps every
        row and clips the column's values to the ``[lower, upper]`` bounds.

    Returns
    -------
    pandas.DataFrame
        The processed copy. In ``'remove'`` mode the original index labels of
        the surviving rows are kept (the index is not reset).

    Raises
    ------
    ValueError
        If ``method`` is neither ``'remove'`` nor ``'cap'``.
    """
    # Validate up front so a bad `method` fails before any summary is printed
    # and is also rejected when `columns` is empty.
    if method not in ('remove', 'cap'):
        raise ValueError("Invalid method. Use 'remove' or 'cap'.")

    df_cleaned = df.copy()

    for col in columns:
        lower, upper, mask = detect_outliers_iqr(df_cleaned, col)
        print(f"\n🔍 {col.upper()} — Outliers: {mask.sum()} | Lower: {lower:.2f}, Upper: {upper:.2f}")

        if method == 'remove':
            df_cleaned = df_cleaned[~mask]
        else:
            # method == 'cap'
            # NOTE(review): the bounds come from quantiles, so capping an
            # integer column presumably upcasts it to float — confirm that is
            # acceptable for downstream use.
            df_cleaned[col] = df_cleaned[col].clip(lower=lower, upper=upper)

    return df_cleaned

# Re-read the data from disk. This rebinds `df`, so any changes earlier cells
# made to `df` are not carried into the outlier step.
df = pd.read_csv("hour.csv")

# Numeric columns to screen with the 1.5 * IQR rule.
# NOTE(review): `cnt` looks like the prediction target — confirm that capping it is intended.
columns_to_check = ['temp', 'atemp', 'hum', 'windspeed', 'cnt']

# 'cap' clips values to the IQR bounds and keeps every row;
# 'remove' would drop the flagged rows instead.
method = 'cap' 

# Apply outlier handling. handle_outliers works on a copy, so `df` keeps its original values.
df_cleaned = handle_outliers(df, columns_to_check, method=method)

# Save the processed frame without writing the index as a column.
# NOTE(review): with method='cap' the outliers are clipped, not removed, so the
# "no_outliers" part of the file name is only loosely accurate.
df_cleaned.to_csv("cleaned_hour_no_outliers.csv", index=False)
print("\n✅ Cleaned dataset saved as 'cleaned_hour_no_outliers.csv'")



🔍 TEMP — Outliers: 0 | Lower: -0.14, Upper: 1.14

🔍 ATEMP — Outliers: 0 | Lower: -0.10, Upper: 1.05

🔍 HUM — Outliers: 22 | Lower: 0.03, Upper: 1.23

🔍 WINDSPEED — Outliers: 342 | Lower: -0.12, Upper: 0.48

🔍 CNT — Outliers: 505 | Lower: -321.50, Upper: 642.50

✅ Cleaned dataset saved as 'cleaned_hour_no_outliers.csv'
