Data loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the raw CSV into `df`; the cells below all work off this frame.
df = pd.read_csv('data/hour.csv')

# Quick structural overview: dimensions and column names.
print(df.shape)
print(df.columns.tolist())

# Rich (HTML) preview of the first rows.
display(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() appends a stray "None" to the output.
df.info()


In [None]:
# Summary statistics restricted to the numeric columns of `df`.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_df = df[numeric_cols]
print(numeric_df.describe().T)

# Per-column mean, standard deviation and variance, kept as separate Series.
means, stds, vars_ = numeric_df.mean(), numeric_df.std(), numeric_df.var()

for header, values in (
    ("\nMeans:\n", means),
    ("\nStandard Deviations:\n", stds),
    ("\nVariances:\n", vars_),
):
    print(header, values)

Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt

# Parse the date column to datetime and derive the calendar year from it.
df['dteday'] = pd.to_datetime(df['dteday'])
df['year'] = df['dteday'].dt.year

# Span of the data and number of distinct dates.
dates = df['dteday']
ndays = dates.nunique()
print('Date range:', dates.min(), 'to', dates.max())
print('Unique days (records):', ndays)

# Sum of `cnt` per date, drawn as a single line over time.
daily_totals = df.sort_values('dteday').groupby('dteday')['cnt'].sum()

fig, ax = plt.subplots(figsize=(12,4))
ax.plot(daily_totals)
ax.set_title('Daily total rentals over time')
ax.set_ylabel('Total rentals per day')
ax.set_xlabel('Date')
fig.tight_layout()

In [None]:
import seaborn as sns

# Histogram with a KDE overlay for each selected column, one panel per column
# on a 2x3 grid (five columns, so the sixth slot is left empty).
plt.figure(figsize=(10,6))
for panel, col in enumerate(['temp','atemp','hum','windspeed','cnt'], start=1):
    ax = plt.subplot(2, 3, panel)
    # Pass the axes explicitly instead of relying on pyplot's "current axes".
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(col)
plt.tight_layout()

# Boxplot for cnt to identify outliers.
plt.figure(figsize=(8,3))
sns.boxplot(x=df['cnt'])
# The dash in this title had been corrupted into mojibake (an em dash decoded
# with the wrong codec). It is written as an escape so the literal survives
# any future re-encoding of the notebook file.
plt.title('Boxplot \u2014 total count (cnt)')
plt.tight_layout()

In [None]:
cat_cols = ['season','mnth','hr','weekday','weathersit','holiday','workingday']

# Number of rows per level of each categorical column. Jupyter only echoes a
# cell's final expression, so this mid-cell table must go through display();
# as a bare expression it was computed and silently thrown away.
display(df.melt(value_vars=cat_cols).groupby(['variable', 'value']).size())

# Mean `cnt` per hour and per weather situation, shown side by side.
hourly_mean = df.groupby('hr')['cnt'].mean().reset_index()
weather_mean = df.groupby('weathersit')['cnt'].mean().reset_index()

plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.barplot(x='hr', y='cnt', data=hourly_mean)
plt.title('Average rentals by hour')
plt.subplot(1,2,2)
sns.barplot(x='weathersit', y='cnt', data=weather_mean)
plt.title('Average rentals by weather situation')
plt.tight_layout()

In [None]:
# Pairwise correlations between the selected columns, drawn as an annotated
# heatmap with two-decimal cell labels.
corr_columns = ['temp','atemp','hum','windspeed','casual','registered','cnt']
corr = df[corr_columns].corr()

heat_ax = sns.heatmap(corr, annot=True, fmt='.2f', cmap='BrBG')
heat_ax.set_title('Correlation matrix (numeric features)')
heat_ax.figure.tight_layout()