<a href="https://colab.research.google.com/github/LiamDuero03/DS-Society-Project/blob/main/5-Predictive Modelling/Predictive_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# --- LFS SETUP & REPO CLONING ---
import os
import pandas as pd

!git lfs install

REPO_NAME = "DS-Society-Project"
REPO_URL = f"https://github.com/LiamDuero03/{REPO_NAME}.git"

if not os.path.exists(REPO_NAME):
    !git clone {REPO_URL}
else:
    # If it exists, pull latest changes (including your new CSV)
    %cd {REPO_NAME}
    !git pull
    %cd ..

# --- 4. READ THE PROCESSED DATA ---
# This points to the new folder/file you created in your repo
processed_data_path = f"/content/{REPO_NAME}/5-Predictive Modelling/processed_urban_weather.csv"

# Check if file exists to avoid errors
if os.path.exists(processed_data_path):
    df = pd.read_csv(processed_data_path)
    print(f"Success! Model-ready Data Shape: {df.shape}")
else:
    print("Error: Processed CSV not found in the repo. Did you push it to GitHub yet?")

Git LFS initialized.
/content/DS-Society-Project
Already up to date.
/content
Success! Model-ready Data Shape: (5000, 29)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Step 1 - keep only the rows that have a value for the target ('feels_like');
# rows with a missing target cannot be used for supervised training.
df_clean = df.dropna(subset=['feels_like'])

# Step 2 - target vector first, then the feature matrix.
# Dropped from the features:
#   * 'feels_like'  - it is the value being predicted
#   * 'City', 'temp', 'Population' - excluded on purpose (the original note
#     says a 'Population_scaled' column stands in for the raw 'Population')
y = df_clean['feels_like']
X = df_clean.drop(['City', 'temp', 'feels_like', 'Population'], axis=1)

# Step 3 - hold out 20% of the rows for testing; the fixed random_state keeps
# the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"Cleaned Dataset: {len(df_clean)} cities ready for modeling.")

Cleaned Dataset: 528 cities ready for modeling.


In [8]:
# Fit an ordinary least-squares model on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score the held-out test split.
lr_preds = lr.predict(X_test)

# Report both metrics in one call. Note: R² is the coefficient of
# determination (1.0 = perfect fit), not a classification accuracy, and
# "Avg Error" is the mean absolute error in the target's units.
print(
    "--- Linear Regression ---",
    f"R² (Accuracy): {r2_score(y_test, lr_preds):.4f}",
    f"Avg Error: {mean_absolute_error(y_test, lr_preds):.2f}°C",
    sep="\n",
)

--- Linear Regression ---
R² (Accuracy): 0.7590
Avg Error: 6.06°C


In [9]:
# Fit a 100-tree random forest on the training split; the fixed random_state
# makes the fitted forest reproducible between runs.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Score the held-out test split.
rf_preds = rf.predict(X_test)

# Report both metrics in one call. Note: R² is the coefficient of
# determination (1.0 = perfect fit), not a classification accuracy, and
# "Avg Error" is the mean absolute error in the target's units.
print(
    "--- Random Forest ---",
    f"R² (Accuracy): {r2_score(y_test, rf_preds):.4f}",
    f"Avg Error: {mean_absolute_error(y_test, rf_preds):.2f}°C",
    sep="\n",
)

--- Random Forest ---
R² (Accuracy): 0.8945
Avg Error: 3.60°C
