In [None]:
import pandas as pd
import numpy as np

# Load your dataset
data = pd.read_csv('data.csv')

# Check for missing values
print(data.isnull().sum())

# Option 1: Drop rows with missing values
data.dropna(inplace=True)

# Create a datetime column
data['datetime'] = pd.to_datetime(data[['year', 'month', 'day', 'hour']])

# Set as index if needed
data.set_index('datetime', inplace=True)

# Extract day of the week and weekend indicator
data['day_of_week'] = data.index.dayofweek
data['is_weekend'] = data['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

# One-hot encode 'cbwd'
data = pd.get_dummies(data, columns=['cbwd'])

# Define the threshold
threshold = 50

# Create the binary target variable
data['pm2.5_binary'] = data['pm2.5'].apply(lambda x: 1 if x > threshold else 0)

# Create lag features for PM2.5
for lag in range(1, 25):  # Lags from 1 to 24 hours
    data[f'pm2.5_lag_{lag}'] = data['pm2.5'].shift(lag)

# Drop rows with NaN values resulting from lagging
data.dropna(inplace=True)

No          0
year        0
month       0
day         0
hour        0
pm2.5    2067
DEWP        0
TEMP        0
PRES        0
cbwd        0
Iws         0
Is          0
Ir          0
dtype: int64


In [None]:
# Sort data by datetime
data.sort_index(inplace=True)

# Define the split point (e.g., last 20% of data for testing)
split_fraction = 0.8
split_point = int(len(data) * split_fraction)

# Split the data
train_data = data.iloc[:split_point]
test_data = data.iloc[split_point:]

# Separate features and target
X_train = train_data.drop(['pm2.5', 'pm2.5_binary'], axis=1)
y_train = train_data['pm2.5_binary']

X_test = test_data.drop(['pm2.5', 'pm2.5_binary'], axis=1)
y_test = test_data['pm2.5_binary']


In [None]:
from sklearn.preprocessing import StandardScaler

# Select features to scale
features_to_scale = ['DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir'] + [f'pm2.5_lag_{lag}' for lag in range(1, 25)]

scaler = StandardScaler()

# Fit the scaler on the training data
X_train_scaled = X_train.copy()
X_train_scaled[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])

# Apply the scaler to the test data
X_test_scaled = X_test.copy()
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['Precision', 'Recall', 'AUC'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Check class distribution
from collections import Counter

counter = Counter(y_train)
print(f'Class distribution in training set: {counter}')

# Calculate class weights
neg, pos = np.bincount(y_train)
total = neg + pos
class_weight = {0: (1 / neg) * (total / 2.0),
                1: (1 / pos) * (total / 2.0)}

print(f'Calculated class weights: {class_weight}')


Class distribution in training set: Counter({1: 20640, 0: 12746})
Calculated class weights: {0: 1.3096657774988232, 1: 0.8087693798449612}


In [None]:
# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,  # Further split training data for validation
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping],
    class_weight=class_weight  # Adjust if imbalance exists
)


  sample_weight[i] = class_weight.get(int(y[i]), 1.0)


Epoch 1/100
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - AUC: 0.5563 - Precision: 0.6687 - Recall: 0.5567 - loss: 33.9475 - val_AUC: 0.5137 - val_Precision: 0.6088 - val_Recall: 1.0000 - val_loss: 8.0088
Epoch 2/100
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - AUC: 0.6952 - Precision: 0.7639 - Recall: 0.6688 - loss: 5.1675 - val_AUC: 0.5000 - val_Precision: 0.6077 - val_Recall: 1.0000 - val_loss: 15.3131
Epoch 3/100
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - AUC: 0.7211 - Precision: 0.7892 - Recall: 0.7082 - loss: 5.1764 - val_AUC: 0.5000 - val_Precision: 0.6077 - val_Recall: 1.0000 - val_loss: 17.5398
Epoch 4/100
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - AUC: 0.7341 - Precision: 0.7949 - Recall: 0.7094 - loss: 5.5144 - val_AUC: 0.5577 - val_Precision: 0.9839 - val_Recall: 0.0902 - val_loss: 15.9202
Epoch 5/100
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [None]:
# Predict probabilities
y_pred_prob = model.predict(X_test_scaled)

# Convert probabilities to binary predictions
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1:.4f}')

# AUC-ROC
auc = roc_auc_score(y_test, y_pred_prob)
print(f'AUC-ROC: {auc:.4f}')

# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)


[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
F1 Score: 0.8404
AUC-ROC: 0.9249
              precision    recall  f1-score   support

           0       0.96      0.42      0.58      3199
           1       0.73      0.99      0.84      5148

    accuracy                           0.77      8347
   macro avg       0.84      0.70      0.71      8347
weighted avg       0.82      0.77      0.74      8347

Confusion Matrix:
[[1329 1870]
 [  62 5086]]
