In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import StandardScaler

# Load Dataset
df = pd.read_csv('/content/air_pollution (1).csv')

# Display basic info
display(df.head())

def preprocess_data(df):
    # Handling missing values
    df = df.dropna()

    # Convert categorical variables to numeric if needed
    df = pd.get_dummies(df, drop_first=True)
    return df

df = preprocess_data(df)

# --- CLUSTERING ---
# Normalize the dataset
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Apply K-Means Clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(df_scaled)

df['Cluster'] = kmeans.labels_
print("K-Means Clustering Done")

# --- ASSOCIATION RULE MINING ---
# Convert dataset to binary format (for Apriori algorithm)
df_bin = df.gt(df.mean()).astype(int)
frequent_itemsets = apriori(df_bin, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)
print("Association Rules Generated:")
display(rules.head())

# --- VALIDATION ---
# Holdout Method
X = df.drop(columns=['Cluster'])
y = df['Cluster']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Holdout: Train size = {X_train.shape}, Test size = {X_test.shape}")

# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(kmeans, X, y, cv=kf, scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {np.mean(cv_scores)}")

Unnamed: 0,date,pollution_today,dew,temp,press,wnd_spd,snow,rain,pollution_yesterday
0,2010-01-02,145.958333,-8.5,-5.125,1024.75,24.86,0.708333,0.0,10.041667
1,2010-01-03,78.833333,-10.125,-8.541667,1022.791667,70.937917,14.166667,0.0,145.958333
2,2010-01-04,31.333333,-20.875,-11.5,1029.291667,111.160833,0.0,0.0,78.833333
3,2010-01-05,42.458333,-24.583333,-14.458333,1033.625,56.92,0.0,0.0,31.333333
4,2010-01-06,56.416667,-23.708333,-12.541667,1033.75,18.511667,0.0,0.0,42.458333


K-Means Clustering Done
Association Rules Generated:




Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(pollution_today),(dew),0.390685,0.50137,0.219726,0.562412,1.121751,1.0,0.023848,1.139498,0.178129,0.326813,0.12242,0.500332
1,(dew),(pollution_today),0.50137,0.390685,0.219726,0.438251,1.121751,1.0,0.023848,1.084676,0.21767,0.326813,0.078065,0.500332
2,(pollution_today),(temp),0.390685,0.533699,0.208767,0.534362,1.001243,1.0,0.000259,1.001424,0.002037,0.29173,0.001422,0.462766
3,(temp),(pollution_today),0.533699,0.390685,0.208767,0.39117,1.001243,1.0,0.000259,1.000797,0.002662,0.29173,0.000797,0.462766
4,(pollution_today),(pollution_yesterday),0.390685,0.390685,0.240548,0.615708,1.575971,1.0,0.087913,1.585553,0.599806,0.444782,0.369305,0.615708


Holdout: Train size = (1460, 1832), Test size = (365, 1832)
Cross-Validation Scores: [0.15890411 0.1260274  0.13972603 0.30958904 0.47671233]
Mean CV Score: 0.24219178082191783
