In [None]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Models
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Load the dataset
df = pd.read_csv('data.csv')

In [20]:
# Shift the 'pm2.5' column to get next hour's PM2.5
df['pm2.5_next'] = df['pm2.5'].shift(-1)

# Create the binary target variable
df['target'] = np.where(df['pm2.5_next'] > 50, 1, 0)

# Drop the last row with NaN in 'pm2.5_next' after shift
df = df[:-1]

In [21]:
# Check missing values
print(df.isnull().sum())

# Since 'pm2.5' might have missing values, we can fill them using interpolation or drop them
df['pm2.5'].interpolate(method='linear', inplace=True)
df['pm2.5_next'].interpolate(method='linear', inplace=True)

# For any remaining missing values, we can drop them
df.dropna(inplace=True)

No               0
year             0
month            0
day              0
hour             0
pm2.5         2067
DEWP             0
TEMP             0
PRES             0
cbwd             0
Iws              0
Is               0
Ir               0
pm2.5_next    2066
target           0
dtype: int64


In [22]:
# Encode 'cbwd' using Label Encoding
le = LabelEncoder()
df['cbwd_encoded'] = le.fit_transform(df['cbwd'])

# Drop the original 'cbwd' column
df.drop('cbwd', axis=1, inplace=True)

In [23]:
# Drop unnecessary columns
df.drop(['No', 'pm2.5_next', 'pm2.5'], axis=1, inplace=True)

# Features and target
X = df.drop('target', axis=1)
y = df['target']

In [24]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [25]:
# Initialize the scaler
scaler = StandardScaler()

# Fit only on training data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
# Initialize the model
log_reg = LogisticRegression()

# Train the model
log_reg.fit(X_train_scaled, y_train)

# Predict on test data
y_pred_log = log_reg.predict(X_test_scaled)

In [27]:
# Initialize the model
rf_clf = RandomForestClassifier(random_state=42)

# Train the model
rf_clf.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_clf.predict(X_test)

In [None]:
# Initialize the model
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model
xgb_clf.fit(X_train, y_train)

# Predict on test data
y_pred_xgb = xgb_clf.predict(X_test)

In [28]:
# Initialize the model
knn_clf = KNeighborsClassifier()

# Train the model
knn_clf.fit(X_train_scaled, y_train)

# Predict on test data
y_pred_knn = knn_clf.predict(X_test_scaled)

In [29]:
def evaluate_model(y_test, y_pred, y_prob):
    f1 = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob)
    print(f'F1 Score: {f1:.4f}')
    print(f'AUC ROC Score: {auc_score:.4f}')

In [30]:
# Get predicted probabilities
y_prob_log = log_reg.predict_proba(X_test_scaled)[:, 1]

print("Logistic Regression Performance:")
evaluate_model(y_test, y_pred_log, y_prob_log)

Logistic Regression Performance:
F1 Score: 0.8101
AUC ROC Score: 0.8009


In [31]:
# Get predicted probabilities
y_prob_rf = rf_clf.predict_proba(X_test)[:, 1]

print("\nRandom Forest Performance:")
evaluate_model(y_test, y_pred_rf, y_prob_rf)


Random Forest Performance:
F1 Score: 0.9056
AUC ROC Score: 0.9540


In [None]:
# Get predicted probabilities
y_prob_xgb = xgb_clf.predict_proba(X_test)[:, 1]

print("\nXGBoost Performance:")
evaluate_model(y_test, y_pred_xgb, y_prob_xgb)

In [32]:
# Get predicted probabilities
y_prob_knn = knn_clf.predict_proba(X_test_scaled)[:, 1]

print("\nKNN Performance:")
evaluate_model(y_test, y_pred_knn, y_prob_knn)


KNN Performance:
F1 Score: 0.8777
AUC ROC Score: 0.9184
