1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score


ModuleNotFoundError: No module named 'pandas'

2. Load & Merge Datasets (Task 1)

In [None]:
# Map each monitoring-station name to its CSV file.
paths = {
    "Dongsi": "data/PRSA_Data_Dongsi_20130301-20170228.csv",
    "Huairou": "data/PRSA_Data_Huairou_20130301-20170228.csv",
    "Changping": "data/PRSA_Data_Changping_20130301-20170228.csv",
    "Guanyuan": "data/PRSA_Data_Guanyuan_20130301-20170228.csv",
}

# Read every file once and tag its rows with the station name taken from the
# dictionary key (this sets/overwrites the "station" column).
dataframes = [
    pd.read_csv(csv_path).assign(station=station_name)
    for station_name, csv_path in paths.items()
]

# Stack the per-station frames into one table with a fresh 0..n-1 index.
df = pd.concat(dataframes, ignore_index=True)
df.head()


3. Basic Data Understanding (Task 2a)

In [None]:
# Compute the three overview summaries first, then report them.
frame_shape = df.shape                 # (rows, columns)
missing_per_column = df.isna().sum()   # count of NaN values in each column
column_dtypes = df.dtypes              # dtype pandas inferred for each column

print("Shape:", frame_shape)
print("Missing values:\n", missing_per_column)
print("Data types:\n", column_dtypes)


4. Data Preprocessing (Task 2b)

In [None]:
# Preprocessing: remove incomplete rows, index by timestamp, drop helper columns.
#
# Written without `inplace=True` and with guards so the cell can be re-run on an
# already-processed `df` without raising KeyError on the year/month/day/hour
# columns (which are removed at the end of the first run).

# Drop rows with missing critical values
df = df.dropna(
    subset=["PM2.5", "PM10", "SO2", "NO2", "CO", "O3", "TEMP", "PRES", "DEWP", "RAIN", "WSPM"]
)

# Create datetime index (skipped when a previous run already created it)
if df.index.name != "datetime":
    df = df.assign(
        datetime=pd.to_datetime(df[["year", "month", "day", "hour"]])
    ).set_index("datetime")

# Drop unused columns; errors="ignore" keeps a re-run from failing once they are gone
df = df.drop(columns=["No", "year", "month", "day", "hour", "wd"], errors="ignore")


5. Visualisation (Task 2c)

In [None]:
# Distribution plot of the target variable
fig, ax = plt.subplots()
sns.histplot(df["PM2.5"], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of PM2.5")
plt.show()

# Correlation heatmap
# Only numeric columns can be correlated. The frame still carries the string
# "station" column, and on pandas >= 2.0 a bare df.corr() raises ValueError
# when it meets non-numeric data, so restrict to numeric dtypes explicitly.
numeric_corr = df.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


6. Model Building (Task 3)

In [None]:
# Feature matrix and target variable
features = ["PM10", "SO2", "NO2", "CO", "O3", "TEMP", "PRES", "DEWP", "RAIN", "WSPM"]
X = df[features]
y = df["PM2.5"]

# Train/Test split
# Split BEFORE scaling so that the scaler's mean/std are learned from training
# rows only; fitting it on the full data leaks test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling: fit on the training rows, then apply the same transform to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling, kept so that any later
# cell that still refers to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


7. Save Cleaned Data (For GUI)

In [None]:
# Turn the index back into an ordinary column so it is written to the CSV,
# then save without adding a separate row-number column.
cleaned_for_export = df.reset_index()
cleaned_for_export.to_csv("data/cleaned_data.csv", index=False)