In [None]:
import pandas as pd
import numpy as np

# Load the raw dataset from the working directory.
data = pd.read_csv('data.csv')

# Fill gaps by linear interpolation, restricted to the numeric columns.
# Calling DataFrame.interpolate on a frame that still contains non-numeric columns
# (presumably the categorical 'cbwd', which is one-hot encoded later) is deprecated
# in pandas and is slated to raise in a future release.
# limit_direction='forward' only fills after the first valid observation, so any
# NaNs at the very start of a column are left in place.
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].interpolate(
    method='linear', limit_direction='forward'
)

# Build one timestamp per row from the calendar columns. 'year' is required here to
# form a valid date; it is removed from the feature set a few lines below.
data['datetime'] = pd.to_datetime(data[['year', 'month', 'day', 'hour']])

# Index the frame by timestamp so later steps can order and slice by time.
data = data.set_index('datetime')

# Remove 'year' from the features now that the timestamp has been built.
# NOTE(review): any other identifier-like column in data.csv (e.g. a running row
# number) would still be kept as a model feature - confirm that is intended.
data = data.drop(columns=['year'])

# Calendar features: day of week (pandas convention: Monday=0 ... Sunday=6) and a
# 0/1 weekend flag, computed column-wise instead of with a per-row lambda.
data['day_of_week'] = data.index.dayofweek
data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)

# One-hot encode the categorical 'cbwd' column.
data = pd.get_dummies(data, columns=['cbwd'])

# Binary target: 1 when pm2.5 is strictly above the threshold, else 0.
# A pm2.5 value that is still NaN compares as False and is labelled 0, exactly as
# the previous per-row lambda did.
threshold = 50
data['pm2.5_binary'] = (data['pm2.5'] > threshold).astype(int)

# shift() lags by row position, not by timestamp, so the rows must be in
# chronological order before any lag is taken.
# NOTE(review): this also assumes consecutive rows are exactly one hour apart -
# confirm the data has no missing or duplicated timestamps.
data = data.sort_index()

# Features to lag (the raw 'pm2.5' series is lagged as well) and the lag horizons.
lag_features = ['DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir']
lags = range(1, 25)  # Lags from 1 to 24 rows (hours)

# One DataFrame per source feature, holding its 24 shifted copies as columns named
# '<feature>_lag_<lag>'. Building whole frames and concatenating once avoids
# inserting columns one at a time.
lagged_features = [
    pd.DataFrame({f'{feature}_lag_{lag}': data[feature].shift(lag) for lag in lags})
    for feature in lag_features + ['pm2.5']
]

# Combine all lag blocks side by side, then attach them to the original columns.
lagged_features_df = pd.concat(lagged_features, axis=1)
data = pd.concat([data, lagged_features_df], axis=1)

# Drop every row that contains a NaN in any column; this removes the first rows,
# whose lagged values do not exist.
data = data.dropna()


  data.interpolate(method='linear', limit_direction='forward', inplace=True)


In [None]:
# Put the rows in index (time) order before cutting the frame in two.
data = data.sort_index()

# Ordered hold-out: the leading 80% of rows are for training, the remainder for testing.
split_fraction = 0.8
split_point = int(split_fraction * len(data))

train_data, test_data = data.iloc[:split_point], data.iloc[split_point:]

# Features are every column except the raw pm2.5 reading and the label derived from it.
X_train = train_data.drop(columns=['pm2.5', 'pm2.5_binary'])
X_test = test_data.drop(columns=['pm2.5', 'pm2.5_binary'])

# The label is the binary pm2.5 indicator.
y_train = train_data['pm2.5_binary']
y_test = test_data['pm2.5_binary']


In [None]:
from sklearn.preprocessing import StandardScaler

# Columns with a numeric dtype are the ones that get standardised.
numeric_features = list(X_train.select_dtypes(include=[np.number]).columns)

# Learn the scaling statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])

# Work on copies so the unscaled frames stay available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-set statistics to both splits.
X_train_scaled[numeric_features] = scaler.transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])


In [None]:
# Check class distribution in the training set
from collections import Counter

counter = Counter(y_train)
print(f'Class distribution in training set: {counter}')

# Count the samples of class 0 and class 1. minlength=2 guarantees two slots even
# when class 1 never occurs, so the check below can report the problem clearly
# instead of failing on tuple unpacking.
class_counts = np.bincount(y_train, minlength=2)
if len(class_counts) != 2 or (class_counts == 0).any():
    raise ValueError(
        'y_train must contain both classes 0 and 1 to compute class weights; '
        f'got counts {class_counts.tolist()}'
    )

# Plain Python ints keep the resulting weights as built-in floats, so the printed
# dict looks the same regardless of the installed numpy version.
neg, pos = (int(count) for count in class_counts)
total = neg + pos

# Balanced weighting: each class contributes total / 2 to the weighted sample count.
class_weight = {0: (1 / neg) * (total / 2.0),
                1: (1 / pos) * (total / 2.0)}

print(f'Calculated class weights: {class_weight}')


Class distribution in training set: Counter({1: 21740, 0: 13280})
Calculated class weights: {0: 1.3185240963855422, 1: 0.8054277828886844}


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable from one run of the notebook to the next.
tf.keras.utils.set_random_seed(42)

# Fully connected binary classifier: three ReLU hidden layers and a sigmoid output.
# The input width is declared with an explicit Input object; passing input_shape to
# the first Dense layer of a Sequential model makes Keras 3 emit a UserWarning.
model = Sequential([
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Single probability for the positive class
])

# Binary cross-entropy loss with Adam; precision, recall and AUC are tracked per epoch.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['Precision', 'Recall', 'AUC'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Halt training once the validation loss has gone 5 epochs without improving, and
# roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Fit on the scaled training features; 20% of the training data is held out for
# validation, and the class weights computed earlier are applied to the loss.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
    class_weight=class_weight,
    callbacks=[early_stopping],
)


  sample_weight[i] = class_weight.get(int(y[i]), 1.0)


Epoch 1/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - AUC: 0.9142 - Precision: 0.8968 - Recall: 0.8539 - loss: 0.3510 - val_AUC: 0.9635 - val_Precision: 0.9445 - val_Recall: 0.8759 - val_loss: 0.2541
Epoch 2/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - AUC: 0.9778 - Precision: 0.9542 - Recall: 0.9151 - loss: 0.1908 - val_AUC: 0.9691 - val_Precision: 0.9513 - val_Recall: 0.8957 - val_loss: 0.2422
Epoch 3/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - AUC: 0.9832 - Precision: 0.9620 - Recall: 0.9296 - loss: 0.1637 - val_AUC: 0.9724 - val_Precision: 0.9584 - val_Recall: 0.8796 - val_loss: 0.2416
Epoch 4/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - AUC: 0.9854 - Precision: 0.9661 - Recall: 0.9345 - loss: 0.1530 - val_AUC: 0.9763 - val_Precision: 0.9561 - val_Recall: 0.9089 - val_loss: 0.2029
Epoch 5/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [None]:
# Score the held-out set; the model returns one probability per row.
y_pred_prob = model.predict(X_test_scaled)

# Hard labels: 1 where the probability is strictly above 0.5, otherwise 0,
# collapsed to a one-dimensional array.
y_pred = np.where(y_pred_prob > 0.5, 1, 0).ravel()

# Evaluate the model
from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix

# F1 is computed from the hard 0/1 predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1 Score: {f1:.4f}')

# ROC AUC is computed from the predicted probabilities, not the hard labels.
auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f'AUC-ROC: {auc:.4f}')

# Per-class precision / recall / F1 table.
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")


[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
F1 Score: 0.9481
AUC-ROC: 0.9844
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      3348
           1       0.96      0.94      0.95      5408

    accuracy                           0.94      8756
   macro avg       0.93      0.94      0.93      8756
weighted avg       0.94      0.94      0.94      8756

Confusion Matrix:
[[3115  233]
 [ 324 5084]]
