# City Day Data Analysis from Zip File

This notebook is designed to automatically load and process a .zip file containing air quality data. It includes steps for:

1.  Data Loading: Extracts a CSV file from a zip archive and loads it into a Pandas DataFrame.
2.  Exploratory Data Analysis (EDA): Selects the target column, prints summary statistics for every column, and lists the columns with the most missing values.
3.  Preprocessing & Model Training: Builds a scikit-learn pipeline that scales numeric features and one-hot encodes categorical ones, then trains a baseline random-forest classifier or regressor, depending on the target.
4.  Evaluation: Reports key metrics to assess the model's performance.

In [ ]:
# --- 1. Imports & Setup ---
import os
import zipfile

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Where the archive lives and which member of it holds the data.
ZIP_PATH = r"/mnt/data/archive (1).zip"
CSV_FILE_NAME = "city_day.csv"
# Column preferred as the prediction target when it exists in the data.
TARGET_FALLBACK = "AQI_Bucket"

print(f"Zip file path: {ZIP_PATH}")

In [ ]:
# --- 2. Data Loading and Extraction ---
def load_csv_from_zip(zip_path, file_name):
    """Load one CSV member of a zip archive into a DataFrame.

    The member is streamed straight out of the archive, so nothing is
    written to (or overwritten in) the working directory.

    Args:
        zip_path: Path to the .zip archive.
        file_name: Name of the CSV member inside the archive.

    Returns:
        The parsed DataFrame, or None when the archive is missing, the
        member is not in the archive, or the data cannot be read. A message
        describing the failure is printed in each of those cases.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            with zip_ref.open(file_name) as member:
                return pd.read_csv(member)
    except FileNotFoundError:
        print(f"Error: The zip file '{zip_path}' was not found.")
        return None
    except KeyError:
        # ZipFile signals a missing member with KeyError, not FileNotFoundError.
        print(f"Error: '{file_name}' was not found inside '{zip_path}'.")
        return None
    except Exception as e:
        # Best-effort loader: report anything else (corrupt archive, parse
        # error, ...) and let the caller decide what to do with None.
        print(f"An error occurred while loading the data: {e}")
        return None

# Load the dataset once; every later cell works off this DataFrame.
df = load_csv_from_zip(ZIP_PATH, CSV_FILE_NAME)
if df is None:
    print("Could not load data. Please check the file path and name.")
else:
    # Quick sanity check: dimensions, the leading column names, first rows.
    leading_columns = list(df.columns)[:10]
    more_marker = "..." if df.shape[1] > 10 else ""
    print("Shape:", df.shape)
    print("Columns (first 10):", leading_columns, more_marker)
    display(df.head())

In [ ]:
# --- 3. Exploratory Data Analysis (EDA) ---
# Nothing to analyse if the loading cell failed.
if df is None:
    raise ValueError("DataFrame is empty, cannot proceed with analysis.")

# Target selection, in order of preference:
#   1. the configured TARGET_FALLBACK column,
#   2. a column named "class" (any casing),
#   3. the last column of the frame (None when there are no columns).
cols_lower = {name.lower(): name for name in df.columns}
last_column = df.columns[-1] if len(df.columns) else None
if TARGET_FALLBACK and TARGET_FALLBACK in df.columns:
    target_col = TARGET_FALLBACK
else:
    target_col = cols_lower.get("class", last_column)

print("Target column selected:", target_col)

# Summary statistics for every column, numeric or not.
display(df.describe(include="all"))

# The 20 columns with the most missing entries.
missing_counts = df.isna().sum()
print("Missing values (top 20):")
print(missing_counts.sort_values(ascending=False).head(20))

In [ ]:
# --- 4. Data Splitting and Model Training ---
if target_col is None:
    raise ValueError("Could not determine a target column. Please set target_col manually.")

# Split the frame into features and target; copies leave df itself untouched.
# NOTE(review): if the features still contain a column the target is derived
# from (e.g. a numeric AQI next to AQI_Bucket), the model sees the label --
# confirm against the dataset and drop such columns if so.
X = df.drop(columns=[target_col]).copy()
y = df[target_col].copy()

# Heuristic task detection: a numeric target with more than 15 distinct
# values is treated as regression, everything else as classification.
is_numeric_target = pd.api.types.is_numeric_dtype(y)
n_unique = y.nunique(dropna=True)
task = "classification"
if is_numeric_target and n_unique > 15:
    task = "regression"
print(f"Detected task: {task} (unique target values: {n_unique})")

# Rows without a label cannot be used for training or evaluation.
valid_indices = y.dropna().index
X = X.loc[valid_indices]
y = y.loc[valid_indices]

# Stratified splitting raises when a class has a single sample, so only
# stratify when every class can appear on both sides of the split.
can_stratify = (
    task == "classification"
    and n_unique > 1
    and y.value_counts().min() >= 2
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if can_stratify else None,
)

num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Missing feature values are imputed before scaling/encoding so the forest
# never receives NaNs (median for numeric, most frequent value for
# categorical columns).
numeric_prep = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    # with_mean=False keeps the scaler usable on sparse output.
    ("scale", StandardScaler(with_mean=False)),
])
categorical_prep = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    # Categories unseen during fit are encoded as all-zero rows.
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_prep, num_cols),
        ("cat", categorical_prep, cat_cols),
    ]
)

if task == "classification":
    model = RandomForestClassifier(n_estimators=200, random_state=42)
else:
    model = RandomForestRegressor(n_estimators=200, random_state=42)

# Preprocessing and model are fitted together so the test split only ever
# goes through transformers learned on the training split.
pipe = Pipeline(steps=[("prep", preprocess), ("model", model)])
pipe.fit(X_train, y_train)

In [ ]:
# --- 5. Model Evaluation ---
# Predict once on the held-out split, then report task-specific metrics.
pred = pipe.predict(X_test)
if task == "classification":
    print("Accuracy:", accuracy_score(y_test, pred))
    print("F1 (macro):", f1_score(y_test, pred, average="macro"))
    print("\nClassification report:\n", classification_report(y_test, pred))
else:
    # The `squared=False` argument of mean_squared_error is deprecated and
    # absent from newer scikit-learn releases; taking the square root
    # explicitly works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    r2 = r2_score(y_test, pred)
    print("RMSE:", rmse)
    print("R^2:", r2)

In [ ]:
# --- 6. (Optional) Permutation Importance ---
# Optional diagnostic: failures here are reported but never stop the notebook.
try:
    from sklearn.inspection import permutation_importance
    r = permutation_importance(pipe, X_test, y_test, n_repeats=5, random_state=42)
    # The whole pipeline is scored, so columns are permuted on the raw input
    # and there is exactly one importance per column of X_test -- not one per
    # one-hot-encoded feature.
    feature_names = list(X_test.columns)
    importances = pd.Series(r.importances_mean, index=feature_names)
    display(importances.sort_values(ascending=False).head(20))
except Exception as e:
    print("Permutation importance skipped. Error:", e)

In [ ]:
# --- 7. Save a Raw CSV Copy ---
# Write the DataFrame back out as loaded (no cleaning is applied), without
# the index column, so the extracted data is available as a plain CSV.
clean_csv_path = "/mnt/data/city_day_raw.csv"
if df is not None:
    df.to_csv(clean_csv_path, index=False)
    print(f"Saved raw CSV to: {clean_csv_path}")