In [3]:
import pandas as pd
import numpy as np


In [4]:
# Load the air-quality CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../notebooks/data/Air_quality_data.csv")

In [5]:
# (rows, columns) of the raw frame.
data.shape

(29531, 16)

In [6]:
# Column dtypes and non-null counts before any cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [7]:
# Number of missing values in each column.
data.isnull().sum()

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64

In [8]:
# Mean-impute every numeric column that has gaps, rounding each mean to 2 dp.
#
# The result is assigned back to the frame rather than calling
# `data[col].fillna(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment), which pandas 2.x warns about and which leaves `data`
# unchanged once Copy-on-Write is enabled.
#
# NOTE(review): AQI is imputed here too, and a later cell derives AQI_Bucket
# from AQI, so every row that lacked an AQI ends up with the same label --
# confirm that is intended rather than dropping those rows.
columns_to_impute = [
    "PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO",
    "SO2", "O3", "Benzene", "Toluene", "Xylene", "AQI",
]
for column in columns_to_impute:
    column_mean = data[column].mean().round(2)
    data[column] = data[column].fillna(column_mean)

In [9]:
# Count rows that exactly repeat an earlier row.
data.duplicated().sum()

0

In [10]:
# Parse the Date strings (year-month-day) into datetime values.
data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")

In [11]:
# Disabled Year/Month/Day feature extraction. It is wrapped in a string literal,
# so nothing here runs; the cell only echoes the string back as its output.
'''data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day'''

'data["Year"] = data["Date"].dt.year\ndata["Month"] = data["Date"].dt.month\ndata["Day"] = data["Date"].dt.day'

In [12]:
# Re-check dtypes and non-null counts after imputation and date parsing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   City        29531 non-null  object        
 1   Date        29531 non-null  datetime64[ns]
 2   PM2.5       29531 non-null  float64       
 3   PM10        29531 non-null  float64       
 4   NO          29531 non-null  float64       
 5   NO2         29531 non-null  float64       
 6   NOx         29531 non-null  float64       
 7   NH3         29531 non-null  float64       
 8   CO          29531 non-null  float64       
 9   SO2         29531 non-null  float64       
 10  O3          29531 non-null  float64       
 11  Benzene     29531 non-null  float64       
 12  Toluene     29531 non-null  float64       
 13  Xylene      29531 non-null  float64       
 14  AQI         29531 non-null  float64       
 15  AQI_Bucket  24850 non-null  object        
dtypes: datetime64[ns](1), 

In [13]:
# The parsed Date column is not used as a feature; remove it from the frame.
data = data.drop(columns=["Date"])

In [14]:
# Label each row with its AQI band:
#   [0, 50] Good, (50, 100] Satisfactory, (100, 200] Moderate,
#   (200, 300] Poor, (300, 400] Very Poor, (400, 500] Severe.
#
# Contiguous bins are used instead of integer-edged ranges (<=50 / >=51, ...)
# so that fractional AQI values -- e.g. a column mean rounded to two decimals --
# cannot fall into a gap between bands. Whole-number AQI values get exactly the
# same label as before.
aqi_edges = [0, 50, 100, 200, 300, 400, 500]
aqi_labels = ["Good", "Satisfactory", "Moderate", "Poor", "Very Poor", "Severe"]
aqi_band = pd.cut(
    data["AQI"],
    bins=aqi_edges,
    labels=aqi_labels,
    include_lowest=True,  # keep AQI == 0 inside the first band
).astype(object)

# Rows whose AQI is missing or outside 0-500 get no band from pd.cut; leave
# whatever AQI_Bucket they already have untouched.
has_band = aqi_band.notna()
data.loc[has_band, "AQI_Bucket"] = aqi_band[has_band]

In [15]:
# Features are every column except the label; the label is kept as a
# one-column DataFrame.
# NOTE(review): X still contains AQI, and AQI_Bucket was assigned from AQI
# thresholds in the preceding cell, so the label is recoverable from that one
# feature -- confirm this is intended.
X = data.drop(columns=["AQI_Bucket"])
Y = data.loc[:, ["AQI_Bucket"]]

In [16]:
# Split feature names by dtype: object columns go through the categorical
# pipeline, everything else through the numerical one.
numerical_data = X.select_dtypes(exclude=["object"]).columns
categorical_data = X.select_dtypes(include=["object"]).columns

In [17]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [18]:
# Numeric features: standardise only.
num_pipeline = Pipeline(steps=[("Scaler", StandardScaler())])

# Object-typed features: integer-encode the categories, then standardise the codes.
# NOTE(review): ordinal codes impose an ordering on the categories -- confirm
# that is acceptable for the downstream model.
cat_steps = [
    ("OrdinalEncoder", OrdinalEncoder()),
    ("Scaler", StandardScaler()),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Route each column group through its pipeline; output columns come out in the
# order listed here (numerical first, then categorical).
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical pipeline", num_pipeline, numerical_data),
        ("Categorical Pipeline", cat_pipeline, categorical_data),
    ]
)

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=45,
)

In [20]:
# Fit the preprocessor on the training split only, then wrap the transformed
# array in a DataFrame with the column names restored (numerical, then categorical).
train_matrix = preprocessor.fit_transform(X_train)
x_train = pd.DataFrame(train_matrix, columns=[*numerical_data, *categorical_data])

In [21]:
# Apply the already-fitted preprocessor to the test split (no refitting) and
# restore the column names (numerical, then categorical).
test_matrix = preprocessor.transform(X_test)
x_test = pd.DataFrame(test_matrix, columns=[*numerical_data, *categorical_data])

In [28]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [23]:
# Support-vector classifier with an RBF kernel; no other hyperparameters are set here.
model = SVC(kernel="rbf")

In [24]:
# y_train is a one-column frame; flatten it to a 1-D label array so that
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
model.fit(x_train, y_train.to_numpy().ravel())

  y = column_or_1d(y, warn=True)


In [25]:
# Predicted label for every test row (displayed only; the result is not stored).
model.predict(x_test)

array(['Satisfactory', 'Moderate', 'Satisfactory', ..., 'Moderate',
       'Moderate', 'Good'], dtype=object)

In [29]:
# Fraction of held-out rows whose predicted bucket matches the true bucket.
y_pred = model.predict(x_test)
accuracy_score(y_test, y_pred)

0.9375846501128668