In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [2]:
# Read the air-quality records from the CSV located in the working directory.
df = pd.read_csv(filepath_or_buffer='Air_Quality.csv')

In [3]:
# 'Time Period' is not used anywhere in this analysis, so remove it up front.
df = df.drop(columns=['Time Period'])

Exploratory Data Analysis (EDA)

In [4]:
# Show the dtype pandas inferred for each remaining column.
df.dtypes

Unique ID           int64
Indicator ID        int64
Name               object
Measure            object
Measure Info       object
Geo Type Name      object
Geo Join ID         int64
Geo Place Name     object
Start_Date         object
Data Value        float64
Message           float64
dtype: object

In [5]:
# Collect every distinct indicator name found in the 'Name' column and list them.
unique_categories = df['Name'].unique()
print('Unique categories in the "Name" column are:', unique_categories, sep='\n')

Unique categories in the "Name" column are:
['Nitrogen dioxide (NO2)' 'Fine particles (PM 2.5)' 'Ozone (O3)'
 'Asthma emergency department visits due to PM2.5'
 'Annual vehicle miles traveled' 'Asthma hospitalizations due to Ozone'
 'Respiratory hospitalizations due to PM2.5 (age 20+)'
 'Boiler Emissions- Total SO2 Emissions'
 'Cardiovascular hospitalizations due to PM2.5 (age 40+)'
 'Boiler Emissions- Total PM2.5 Emissions'
 'Boiler Emissions- Total NOx Emissions'
 'Annual vehicle miles travelled (cars)'
 'Annual vehicle miles travelled (trucks)'
 'Cardiac and respiratory deaths due to Ozone'
 'Asthma emergency departments visits due to Ozone'
 'Outdoor Air Toxics - Formaldehyde' 'Outdoor Air Toxics - Benzene'
 'Deaths due to PM2.5']


In [6]:
# Per-indicator summary: smallest and largest 'Data Value', plus the unit taken
# from the first 'Measure Info' entry encountered in each 'Name' group.
result = df.groupby('Name').agg(**{
    'Min Value': ('Data Value', 'min'),
    'Max Value': ('Data Value', 'max'),
    'Unit': ('Measure Info', 'first'),
})

# Render the summary as a text table; reset_index() turns 'Name' back into a
# regular column so it appears as the first table column.
summary_headers = ['Name', *result.columns]
table = tabulate(
    result.reset_index(),
    headers=summary_headers,
    tablefmt='pretty',
    showindex=False,
)

print(table)

+--------------------------------------------------------+-----------+-----------+----------------------+
|                          Name                          | Min Value | Max Value |         Unit         |
+--------------------------------------------------------+-----------+-----------+----------------------+
|             Annual vehicle miles traveled              |    4.9    |   86.0    |       per km2        |
|         Annual vehicle miles travelled (cars)          |    4.7    |   80.8    |       per km2        |
|        Annual vehicle miles travelled (trucks)         |    0.2    |    5.0    |       per km2        |
|    Asthma emergency department visits due to PM2.5     |    4.9    |   424.7   | per 100,000 children |
|    Asthma emergency departments visits due to Ozone    |    7.0    |   292.5   |  per 100,000 adults  |
|          Asthma hospitalizations due to Ozone          |    0.6    |   57.8    |  per 100,000 adults  |
|         Boiler Emissions- Total NOx Emission

In [7]:
# Encode the text in 'Geo Type Name' as integer codes and store them in a new
# 'Geo Type Name encoded' column; the original text column is left in place.
label_encoder = LabelEncoder()
geo_type_codes = label_encoder.fit_transform(df['Geo Type Name'])
df['Geo Type Name encoded'] = geo_type_codes

In [8]:
# Min-max scale 'Data Value' separately within each 'Measure Info' group, so that
# readings sharing a unit are rescaled together.
scaler = MinMaxScaler()


def _scale_group(values):
    """Refit the shared scaler on one group's values and return them rescaled as a 1-D array."""
    column = values.values.reshape(-1, 1)  # the scaler expects a 2-D (n_samples, 1) array
    return scaler.fit_transform(column).flatten()


df['Normalized_Data_Value'] = df.groupby('Measure Info')['Data Value'].transform(_scale_group)

In [9]:
# Cut-offs on the normalized 'Normalized_Data_Value' scale that separate the risk tiers.
low_threshold = 0.3
medium_threshold = 0.6

# pd.cut uses right-closed bins by default, so the tiers are:
#   value <= 0.3        -> 'Low Risk'
#   0.3 < value <= 0.6  -> 'Medium Risk'
#   value > 0.6         -> 'High Risk'
# Rows whose normalized value is NaN receive a NaN label.
df['Message'] = pd.cut(
    df['Normalized_Data_Value'],
    bins=[-np.inf, low_threshold, medium_threshold, np.inf],
    labels=['Low Risk', 'Medium Risk', 'High Risk'],
)

# Export the labelled frame to its own file in the working directory.
# A relative path keeps the notebook runnable on any machine, and writing to a
# separate file leaves the raw input 'Air_Quality.csv' untouched, so a
# Restart & Run All always starts from the same data.
df.to_csv('Air_Quality_labeled.csv', index=False)


In [10]:
# Replace the indicator names in 'Name' with integer codes (the text is overwritten).
label_encoder = LabelEncoder()
name_codes = label_encoder.fit_transform(df['Name'])
df['Name'] = name_codes

In [11]:
# Use 'Unique ID' as the row index; the column is moved out of the frame's columns.
df = df.set_index('Unique ID')

In [12]:
# Parse 'Start_Date' into datetimes; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
start_dates = pd.to_datetime(df['Start_Date'], errors='coerce')
df['Start_Date'] = start_dates

In [13]:
# Encode the unit text in 'Measure Info' as integer codes in a new
# 'Measure Info Encoded' column; the original text column is kept.
label_encoder = LabelEncoder()
measure_info_column = df['Measure Info']
measure_info_codes = label_encoder.fit_transform(measure_info_column)
df['Measure Info Encoded'] = measure_info_codes

In [14]:
# For each 'Measure Info' group, report the range of its normalized values and
# then print the values themselves (pandas truncates long frames when printing).
grouped_data = df.groupby('Measure Info')
separator = "\n" + "=" * 50 + "\n"

for unit_label, unit_rows in grouped_data:
    normalized = unit_rows['Normalized_Data_Value']
    lowest, highest = normalized.min(), normalized.max()

    print(f"Measure Info: {unit_label}")
    print(f"Min Normalized Data Value: {lowest:.2f}")
    print(f"Max Normalized Data Value: {highest:.2f}")
    print("Normalized Data Values:")
    # to_frame() keeps the single-column DataFrame layout in the printed output.
    print(normalized.to_frame())
    print(separator)


Measure Info: mcg/m3
Min Normalized Data Value: 0.00
Max Normalized Data Value: 1.00
Normalized Data Values:
           Normalized_Data_Value
Unique ID                       
173129                  0.368343
669692                  0.122781
212069                  0.275148
547517                  0.206361
173125                  0.449704
...                          ...
669829                  0.059172
179657                  0.304734
168400                  0.690089
213760                  0.625000
211409                  0.477811

[5499 rows x 1 columns]


Measure Info: number
Min Normalized Data Value: 0.00
Max Normalized Data Value: 1.00
Normalized Data Values:
           Normalized_Data_Value
Unique ID                       
179789                  0.004215
130443                  0.007727
179793                  0.005971
179807                  0.145767
179792                  0.003864
...                          ...
130414                  0.001054
179783                  0.009

In [15]:
# Features (X): the three label-encoded categorical columns plus the normalized reading.
# Target (Y): the risk label stored in 'Message'.
# NOTE(review): 'Message' is produced by binning 'Normalized_Data_Value', which is
# also one of the features, so a model can recover the label from that single
# column. Confirm this leakage is intended before trusting the accuracy scores.
feature_columns = ['Name', 'Measure Info Encoded', 'Geo Type Name encoded', 'Normalized_Data_Value']
X = df.loc[:, feature_columns]
Y = df.loc[:, 'Message']

Classification Models: KNN, Decision Tree, and Random Forest

In [16]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Candidate neighbourhood sizes: k = 1 .. 24.
k_values = list(range(1, 25))
param_grid = {'n_neighbors': k_values}

# NOTE(review): the integer label codes in X enter KNN's distance computation on
# their raw scale alongside the normalized value. Confirm that is intended.
knn_classifier = KNeighborsClassifier()

# 5-fold cross-validated search over k, scored by accuracy. refit=True (the
# default, spelled out here) retrains the best configuration on the whole
# training set once the search finishes.
grid_search = GridSearchCV(knn_classifier, param_grid, cv=5, scoring='accuracy', refit=True)
grid_search.fit(X_train, Y_train)

# Display the best k value and corresponding accuracy
best_k = grid_search.best_params_['n_neighbors']
print("Best k value:", best_k)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Reuse the model GridSearchCV already refitted with the best k on the full
# training set, rather than building and training a second identical classifier.
best_knn_classifier = grid_search.best_estimator_

# Make predictions on the held-out test set with the best k value
Y_pred_knn_best = best_knn_classifier.predict(X_test)

# Evaluate the performance of the KNN classifier with the best k value
accuracy_knn_best = accuracy_score(Y_test, Y_pred_knn_best)
classification_report_str_knn_best = classification_report(Y_test, Y_pred_knn_best)

# Display the results for KNN with the best k value
print(f'\nKNN Accuracy with Best k: {accuracy_knn_best:.2f}')
print('KNN Classification Report with Best k:')
print(classification_report_str_knn_best)

Best k value: 1
Best Cross-Validation Accuracy: 0.9972250769895847

KNN Accuracy with Best k: 1.00
KNN Classification Report with Best k:
              precision    recall  f1-score   support

   High Risk       0.99      0.99      0.99       193
    Low Risk       1.00      1.00      1.00      1552
 Medium Risk       1.00      1.00      1.00      1499

    accuracy                           1.00      3244
   macro avg       1.00      1.00      1.00      3244
weighted avg       1.00      1.00      1.00      3244



In [17]:
# Decision Tree with a fixed seed so repeated runs build the same tree.
decision_tree_classifier = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated search over tree depth and the minimum number of samples
# needed to split a node, scored by accuracy. refit=True (the default, spelled out
# here) retrains the best configuration on the whole training set afterwards.
param_grid_dt = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}
grid_search_dt = GridSearchCV(decision_tree_classifier, param_grid_dt, cv=5, scoring='accuracy', refit=True)
grid_search_dt.fit(X_train, Y_train)

# Display the best parameters and corresponding accuracy for Decision Tree
print("Best Decision Tree Parameters:", grid_search_dt.best_params_)
print("Best Decision Tree Cross-Validation Accuracy:", grid_search_dt.best_score_)

# best_estimator_ has already been refitted on the entire training set by
# GridSearchCV, so it is used directly without calling fit a second time.
best_dt_classifier = grid_search_dt.best_estimator_

# Make predictions on the test set for Decision Tree
Y_pred_dt = best_dt_classifier.predict(X_test)

# Evaluate the performance of the Decision Tree classifier
accuracy_dt = accuracy_score(Y_test, Y_pred_dt)
classification_report_str_dt = classification_report(Y_test, Y_pred_dt)

# Display the results for Decision Tree
print(f'\nDecision Tree Accuracy: {accuracy_dt:.2f}')
print('Decision Tree Classification Report:')
print(classification_report_str_dt)

Best Decision Tree Parameters: {'max_depth': None, 'min_samples_split': 2}
Best Decision Tree Cross-Validation Accuracy: 1.0

Decision Tree Accuracy: 1.00
Decision Tree Classification Report:
              precision    recall  f1-score   support

   High Risk       1.00      1.00      1.00       193
    Low Risk       1.00      1.00      1.00      1552
 Medium Risk       1.00      1.00      1.00      1499

    accuracy                           1.00      3244
   macro avg       1.00      1.00      1.00      3244
weighted avg       1.00      1.00      1.00      3244



In [18]:
# Random Forest of 100 trees with a fixed random_state.
random_forest_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Five-fold cross-validation on the training split, using the estimator's default scorer.
cv_scores_rf = cross_val_score(estimator=random_forest_classifier, X=X_train, y=Y_train, cv=5)

# Report the per-fold scores and their average.
print("\nRandom Forest Cross-Validation Scores:", cv_scores_rf)
print("Mean CV Score for Random Forest:", np.mean(cv_scores_rf))

# Fit on the full training split, then predict the held-out test rows.
random_forest_classifier.fit(X_train, Y_train)
Y_pred_rf = random_forest_classifier.predict(X_test)

# Score the test-set predictions.
classification_report_str_rf = classification_report(Y_test, Y_pred_rf)
accuracy_rf = accuracy_score(Y_test, Y_pred_rf)

# Show overall accuracy followed by the per-class report.
print('\nRandom Forest Accuracy: {:.2f}'.format(accuracy_rf))
print('Random Forest Classification Report:', classification_report_str_rf, sep='\n')


Random Forest Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Score for Random Forest: 1.0

Random Forest Accuracy: 1.00
Random Forest Classification Report:
              precision    recall  f1-score   support

   High Risk       1.00      1.00      1.00       193
    Low Risk       1.00      1.00      1.00      1552
 Medium Risk       1.00      1.00      1.00      1499

    accuracy                           1.00      3244
   macro avg       1.00      1.00      1.00      3244
weighted avg       1.00      1.00      1.00      3244

