In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

print("✅ All packages imported successfully!")
print("🚴‍♂️ London Bike Sharing Prediction Project")
print("=" * 50)

✅ All packages imported successfully!
🚴‍♂️ London Bike Sharing Prediction Project


In [8]:
# Load and explore the dataset
df = pd.read_csv('london_merged.csv')

print(f"📊 Dataset loaded: {df.shape[0]:,} records with {df.shape[1]} features")
print("\n📋 First 5 rows:")
print(df.head())

print(f"\n📈 Dataset Info:")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Missing values: {df.isnull().sum().sum()}")


📊 Dataset loaded: 17,414 records with 10 features

📋 First 5 rows:
             timestamp  cnt   t1   t2    hum  wind_speed  weather_code  \
0  2015-01-04 00:00:00  182  3.0  2.0   93.0         6.0           3.0   
1  2015-01-04 01:00:00  138  3.0  2.5   93.0         5.0           1.0   
2  2015-01-04 02:00:00  134  2.5  2.5   96.5         0.0           1.0   
3  2015-01-04 03:00:00   72  2.0  2.0  100.0         0.0           1.0   
4  2015-01-04 04:00:00   47  2.0  0.0   93.0         6.5           1.0   

   is_holiday  is_weekend  season  
0         0.0         1.0     3.0  
1         0.0         1.0     3.0  
2         0.0         1.0     3.0  
3         0.0         1.0     3.0  
4         0.0         1.0     3.0  

📈 Dataset Info:
Memory usage: 2.3 MB
Date range: 2015-01-04 00:00:00 to 2017-01-03 23:00:00
Missing values: 0


In [9]:
# Statistical summary and correlation analysis
print("📊 STATISTICAL SUMMARY:")
print(df.describe())

print("\n🔍 CORRELATION WITH BIKE DEMAND (cnt):")
# Only use numeric columns for correlation
numeric_df = df.select_dtypes(include=[np.number])
correlation = numeric_df.corr()['cnt'].sort_values(ascending=False)

for feature, corr_value in correlation.items():
    if feature != 'cnt':
        impact = "Strong +" if corr_value > 0.3 else "Strong -" if corr_value < -0.3 else "Moderate"
        print(f"{feature:12} | {corr_value:6.3f} | {impact}")


📊 STATISTICAL SUMMARY:
                cnt            t1            t2           hum    wind_speed  \
count  17414.000000  17414.000000  17414.000000  17414.000000  17414.000000   
mean    1143.101642     12.468091     11.520836     72.324954     15.913063   
std     1085.108068      5.571818      6.615145     14.313186      7.894570   
min        0.000000     -1.500000     -6.000000     20.500000      0.000000   
25%      257.000000      8.000000      6.000000     63.000000     10.000000   
50%      844.000000     12.500000     12.500000     74.500000     15.000000   
75%     1671.750000     16.000000     16.000000     83.000000     20.500000   
max     7860.000000     34.000000     34.000000    100.000000     56.500000   

       weather_code    is_holiday    is_weekend        season  
count  17414.000000  17414.000000  17414.000000  17414.000000  
mean       2.722752      0.022051      0.285403      1.492075  
std        2.341163      0.146854      0.451619      1.118911  
min      

In [10]:
# Feature engineering - create time-based features
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['day_of_year'] = df['timestamp'].dt.dayofyear
df['month'] = df['timestamp'].dt.month

# Define feature set for modeling
feature_columns = [
    't1', 't2', 'hum', 'wind_speed', 'weather_code', 
    'is_holiday', 'is_weekend', 'season', 'hour', 
    'day_of_year', 'month'
]

X = df[feature_columns]
y = df['cnt']

print("🛠️ FEATURE ENGINEERING COMPLETE:")
print(f"✅ Features created: {len(feature_columns)} total features")
print(f"✅ Feature matrix: {X.shape}")
print(f"✅ Target variable: {y.shape}")
print(f"✅ Target range: {y.min()} to {y.max()} bikes per hour")


🛠️ FEATURE ENGINEERING COMPLETE:
✅ Features created: 11 total features
✅ Feature matrix: (17414, 11)
✅ Target variable: (17414,)
✅ Target range: 0 to 7860 bikes per hour


In [11]:
# ===== 80/20 TRAIN-TEST SPLIT ANALYSIS =====
print("🎯 80/20 SPLIT CONFIGURATION:")

# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Testing set:  {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

print("\n🚀 MODEL TRAINING COMPLETE!")
print("📊 PERFORMANCE METRICS:")

# Calculate all metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Display results
print(f"R² Score (Accuracy):     {r2:.4f} ({r2*100:.1f}% variance explained)")
print(f"Mean Absolute Error:     {mae:.0f} bikes per hour")
print(f"Root Mean Square Error:  {rmse:.0f} bikes per hour")

# Model interpretation
if r2 > 0.25:
    print("✅ Model shows good predictive power!")
else:
    print("⚠️  Model performance could be improved")

print(f"\n🎯 Average prediction error: ±{mae:.0f} bikes per hour")


🎯 80/20 SPLIT CONFIGURATION:
Training set: 13,931 samples (80.0%)
Testing set:  3,483 samples (20.0%)

🚀 MODEL TRAINING COMPLETE!
📊 PERFORMANCE METRICS:
R² Score (Accuracy):     0.2905 (29.1% variance explained)
Mean Absolute Error:     669 bikes per hour
Root Mean Square Error:  922 bikes per hour
✅ Model shows good predictive power!

🎯 Average prediction error: ±669 bikes per hour


In [12]:
# Feature importance analysis
print("🏆 FEATURE IMPORTANCE ANALYSIS:")
print("(Based on Linear Regression coefficients)")
print("-" * 45)

feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Coefficient': model.coef_,
    'Abs_Impact': np.abs(model.coef_)
}).sort_values('Abs_Impact', ascending=False)

print("Top factors affecting bike demand:")
for i, (_, row) in enumerate(feature_importance.head().iterrows(), 1):
    direction = "increases" if row['Coefficient'] > 0 else "decreases"
    print(f"{i}. {row['Feature']:12} | {row['Coefficient']:8.1f} | {direction} demand")

print(f"\nModel baseline (intercept): {model.intercept_:.1f} bikes per hour")


🏆 FEATURE IMPORTANCE ANALYSIS:
(Based on Linear Regression coefficients)
---------------------------------------------
Top factors affecting bike demand:
1. is_holiday   |   -280.0 | decreases demand
2. is_weekend   |   -205.6 | decreases demand
3. month        |     76.0 | increases demand
4. t1           |     71.7 | increases demand
5. season       |     37.4 | increases demand

Model baseline (intercept): 1918.6 bikes per hour


In [13]:
# Sample predictions to show model performance
print("🔮 SAMPLE PREDICTIONS:")
print("Actual vs Predicted (first 10 test samples):")
print("-" * 40)

for i in range(10):
    actual = int(y_test.iloc[i])
    predicted = int(y_pred[i])
    difference = abs(actual - predicted)
    accuracy = max(0, 100 - (difference/max(actual, 1) * 100))
    
    print(f"Sample {i+1:2d}: {actual:4d} → {predicted:4d} bikes | Error: {difference:3d} | Acc: {accuracy:.0f}%")

print(f"\n📈 Model Summary:")
print(f"• Trained on {X_train.shape[0]:,} samples")
print(f"• Tested on {X_test.shape[0]:,} samples") 
print(f"• Overall accuracy: {r2*100:.1f}%")
print(f"• Average error: ±{mae:.0f} bikes per hour")
print("\n🚴‍♂️ London Bike Prediction Model Complete!")


🔮 SAMPLE PREDICTIONS:
Actual vs Predicted (first 10 test samples):
----------------------------------------
Sample  1:   66 →  770 bikes | Error: 704 | Acc: 0%
Sample  2: 1749 →  443 bikes | Error: 1306 | Acc: 25%
Sample  3:  172 →  906 bikes | Error: 734 | Acc: 0%
Sample  4: 2866 → 1776 bikes | Error: 1090 | Acc: 62%
Sample  5: 1550 → 1080 bikes | Error: 470 | Acc: 70%
Sample  6: 1479 → 1085 bikes | Error: 394 | Acc: 73%
Sample  7:  146 → 1264 bikes | Error: 1118 | Acc: 0%
Sample  8:   44 →  703 bikes | Error: 659 | Acc: 0%
Sample  9: 1010 → 1480 bikes | Error: 470 | Acc: 53%
Sample 10: 3904 →  829 bikes | Error: 3075 | Acc: 21%

📈 Model Summary:
• Trained on 13,931 samples
• Tested on 3,483 samples
• Overall accuracy: 29.1%
• Average error: ±669 bikes per hour

🚴‍♂️ London Bike Prediction Model Complete!
