# Data Pre-processing

In [76]:
# import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [77]:
# set input directory
input_dir = 'input'

In [78]:
# set dataset file name
input_data_file = 'data.xlsx'

In [79]:
# display list of files in input directory
# (uses input_dir so the listing follows the configured folder)
os.listdir(input_dir)

['data.xlsx', 'labeled_data.csv']

In [80]:
# to get the input data path
data_path = os.path.join(os.curdir,input_dir,input_data_file)
data_path

'.\\input\\data.xlsx'

In [None]:
# to read data from excel file
raw_data = pd.read_excel(data_path)

In [None]:
raw_data.head()

In [None]:
raw_data.info()

In [None]:
raw_data.columns

In [None]:
raw_data.isna().sum()

In [None]:
# drop rows which contains nan
raw_data.dropna(axis = 0, inplace=True)

In [None]:
raw_data.isna().sum()

In [None]:
def numeric(row):
    """Convert one raw cell value to a float.

    The characters 'x', '#' and '*' are removed from the value's string
    form before it is parsed.

    Args:
        row: A single cell value (number, string, NaN or None).

    Returns:
        The parsed float, or None when the value is NaN or None.

    Raises:
        ValueError: If the text left after stripping is not a valid float.
    """
    if row is None:
        # str(None) is 'None', which float() cannot parse; treat as missing.
        return None
    try:
        if np.isnan(row):
            return None
    except TypeError:
        # np.isnan rejects non-numeric input such as str; parse it below.
        pass
    text = str(row)
    for flag in ('x', '#', '*'):
        text = text.replace(flag, '')
    return float(text)

In [None]:
raw_data['O3'] = raw_data['O3'].apply(numeric)
#print(raw_data['NO'].describe())

In [None]:
# Run numeric() over each remaining measurement column, in the same order
# the columns were originally processed.
for column in ['PM2.5', 'TEMP', 'CH4', 'CO', 'NMHC', 'NO',
               'NO2', 'NOx', 'PM10', 'RH', 'SO2']:
    raw_data[column] = raw_data[column].apply(numeric)

In [None]:
len(raw_data)

In [None]:
raw_data = raw_data[0:5000]

In [None]:
raw_data.info()

In [None]:
raw_data.describe()

In [None]:
temp_data = raw_data[['TEMP','RH','CH4','NMHC','NO', 'NOx']]

In [None]:
# Build `data` from raw_data without the six columns listed here;
# `columns=` already selects the column axis.
excluded_columns = ['TEMP', 'RH', 'CH4', 'NMHC', 'NO', 'NOx']
data = raw_data.drop(columns=excluded_columns)
data.head()

In [None]:
# Store the row-wise maximum over every column currently in `data` as 'AQI'.
# NOTE(review): this is the max of the raw column values as-is; confirm the
# remaining columns are all numeric and on comparable scales.
data['AQI']= data.max(axis=1)
data.head()

In [None]:
data['AQI'].describe()

In [None]:
aqi = data['AQI']
#aqi

In [None]:
# Replace each AQI value with its band number (0-4). Values that are not
# greater than 0 match no band and are left unchanged.
# The writes go through data.loc so they always land in `data` itself;
# assigning into a Series pulled out of the frame is chained assignment,
# which pandas may apply to a temporary copy instead.
raw_aqi = data['AQI'].copy()  # snapshot: masks are built from original values
aqi_bands = [
    (0, 50, 0),
    (50, 100, 1),
    (100, 150, 2),
    (150, 200, 3),
    (200, np.inf, 4),
]
for lower, upper, band in aqi_bands:
    data.loc[(raw_aqi > lower) & (raw_aqi <= upper), 'AQI'] = band

# Re-bind `aqi` so later cells that inspect it see the banded values.
aqi = data['AQI']

In [None]:
aqi.unique()

In [None]:
data['label'] = data['AQI'].astype('int8')

In [None]:
data.drop(columns=['AQI'], axis= 1, inplace=True)

In [None]:
data.tail()

In [None]:
temp_data.head()

In [None]:
temp_data.info()

In [None]:
data.index

In [None]:
data.info()

In [None]:
data = temp_data.join(data)

In [None]:
data.head()

In [None]:
# save the labeled data in the configured input directory
data.to_csv(os.path.join(input_dir, 'labeled_data.csv'), index=False)

# Data Analysis

In [None]:
data.head()

In [None]:
sns.pairplot(data)

In [None]:
sns.set(style="white", color_codes=True)
sns.jointplot(x='PM2.5',y='RH',data=data)     

In [None]:
sns.set(style="white", color_codes=True)
sns.jointplot(x='CH4',y='RH',data=data)

In [None]:
sns.set(style="white", color_codes=True)
sns.jointplot(x='CH4',y='NOx',data=data)

In [None]:
sns.set(style="white", color_codes=True)
sns.jointplot(x='CO',y='SO2',data=data)

In [None]:
sns.set(style="white", color_codes=True)
sns.jointplot(x='NO2',y='O3',data=data)

In [None]:
sns.set(style="white", color_codes=True)
sns.jointplot(x='TEMP',y='RH',data=data)

In [None]:
sns.set(style="white", color_codes=True)
sns.jointplot(x='NO2',y='CO',data=data)

# Train Test Split

Now it's time to split our data into a training set and a testing set!

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the features from the 'label' target and hold out 20% of the rows.
# A fixed random_state keeps the split, and the files saved from it, the same
# on every run.
X = data.drop('label', axis=1)
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
# Write the four split parts to Excel files.
# The folder is created first because to_excel does not create missing
# directories.
output_dir = 'Training_Testing'
os.makedirs(output_dir, exist_ok=True)
X_train.to_excel(os.path.join(output_dir, 'X_train.xlsx'))
X_test.to_excel(os.path.join(output_dir, 'X_test.xlsx'))
y_train.to_excel(os.path.join(output_dir, 'y_train.xlsx'))
y_test.to_excel(os.path.join(output_dir, 'y_test.xlsx'))

# Training the Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 600-tree random forest; the fixed seed makes the fitted model repeatable.
rfc = RandomForestClassifier(n_estimators=600, random_state=42)

In [None]:
rfc.fit(X_train,y_train)

# Predictions

In [None]:
predictions = rfc.predict(X_test)

In [None]:
predictions

**Now create a classification report from the results**

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print(confusion_matrix(y_test,predictions))

In [None]:
# Pull the five pollutant columns out of the held-out feature set.
CO, SO2, NO2, O3, PM10 = (
    X_test[name] for name in ('CO', 'SO2', 'NO2', 'O3', 'PM10')
)

In [None]:
# Mean of each pollutant series.
CO_mean, SO2_mean, NO2_mean, O3_mean, PM10_mean = (
    series.mean() for series in (CO, SO2, NO2, O3, PM10)
)

In [None]:
def checkCo(q):
    """Print the air-quality category for a CO reading and return its label.

    The bands are contiguous with inclusive upper bounds (1, 2, 10, 17, 34),
    so a reading such as 1.05 or 2.05 lands in the next band instead of
    falling through to "Severe". A reading of exactly 0 is "Good".

    Args:
        q: CO reading (units are not stated in this file).

    Returns:
        The category label that was printed.
    """
    bands = (
        (1, "Good"),
        (2.0, "Satisfactory"),
        (10, "Moderately Polluted"),
        (17, "Poor"),
        (34, "Very Poor"),
    )
    # Negative or NaN readings fail the q >= 0 test and keep the final
    # "Severe" label, as they did before.
    v = "Severe"
    if q >= 0:
        for upper, label in bands:
            if q <= upper:
                v = label
                break

    print('CO-->  %2.3f    %s' %(q, v))
    return v
        

In [None]:
def checkSO2(q):
    """Print the air-quality category for an SO2 reading and return its label.

    The bands are contiguous with inclusive upper bounds (40, 80, 380, 800,
    1600), so a reading such as 40.5 lands in the next band instead of
    falling through to "Severe". A reading of exactly 0 is "Good".

    Args:
        q: SO2 reading (units are not stated in this file).

    Returns:
        The category label that was printed.
    """
    bands = (
        (40, "Good"),
        (80, "Satisfactory"),
        (380, "Moderately Polluted"),
        (800, "Poor"),
        (1600, "Very Poor"),
    )
    # Negative or NaN readings fail the q >= 0 test and keep the final
    # "Severe" label, as they did before.
    v = "Severe"
    if q >= 0:
        for upper, label in bands:
            if q <= upper:
                v = label
                break

    print('SO2-->  %2.3f    %s' %(q, v))
    return v

In [None]:
def checkNO2(q):
    """Print the air-quality category for an NO2 reading and return its label.

    The bands are contiguous with inclusive upper bounds (40, 80, 180, 280,
    400), so a reading such as 40.5 lands in the next band instead of
    falling through to "Severe". A reading of exactly 0 is "Good".

    Args:
        q: NO2 reading (units are not stated in this file).

    Returns:
        The category label that was printed.
    """
    bands = (
        (40, "Good"),
        (80, "Satisfactory"),
        (180, "Moderately Polluted"),
        (280, "Poor"),
        (400, "Very Poor"),
    )
    # Negative or NaN readings fail the q >= 0 test and keep the final
    # "Severe" label, as they did before.
    v = "Severe"
    if q >= 0:
        for upper, label in bands:
            if q <= upper:
                v = label
                break

    print('NO2-->  %2.3f    %s' %(q, v))
    return v

In [None]:
def checkO3(q):
    """Print the air-quality category for an O3 reading and return its label.

    The bands are contiguous with inclusive upper bounds (50, 100, 168, 208,
    748), so a reading such as 50.5 lands in the next band instead of
    falling through to "Severe". A reading of exactly 0 is "Good".

    Args:
        q: O3 reading (units are not stated in this file).

    Returns:
        The category label that was printed.
    """
    bands = (
        (50, "Good"),
        (100, "Satisfactory"),
        (168, "Moderately Polluted"),
        (208, "Poor"),
        (748, "Very Poor"),
    )
    # Negative or NaN readings fail the q >= 0 test and keep the final
    # "Severe" label, as they did before.
    v = "Severe"
    if q >= 0:
        for upper, label in bands:
            if q <= upper:
                v = label
                break

    print('O3-->  %2.3f    %s' %(q, v))
    return v

In [None]:
def checkPM(q):
    """Print the air-quality category for a PM reading and return its label.

    The bands are contiguous with inclusive upper bounds (50, 100, 250, 350,
    430), so a reading such as 50.5 lands in the next band instead of
    falling through to "Severe". A reading of exactly 0 is "Good".

    Args:
        q: Particulate-matter reading (units are not stated in this file).

    Returns:
        The category label that was printed.
    """
    bands = (
        (50, "Good"),
        (100, "Satisfactory"),
        (250, "Moderately Polluted"),
        (350, "Poor"),
        (430, "Very Poor"),
    )
    # Negative or NaN readings fail the q >= 0 test and keep the final
    # "Severe" label, as they did before.
    v = "Severe"
    if q >= 0:
        for upper, label in bands:
            if q <= upper:
                v = label
                break

    print('PM-->  %2.3f    %s' %(q, v))
    return v

# Air Quality Analysis

In [None]:
# Run each pollutant check on its test-set mean; every call prints one
# report line. `l` collects whatever the check functions return.
l = [checkCo(CO_mean), checkSO2(SO2_mean), checkNO2(NO2_mean), checkO3(O3_mean), checkPM(PM10_mean)]

# Real Time Prediction

In [None]:
x_test_dt = [[56,27,2.1,0.14,0.2,17,0.79,23,37,20,199,12]]

In [None]:
# Label the sample with the training feature names so the model receives the
# same column names it was fitted with; values are matched to columns by
# position.
df = pd.DataFrame(x_test_dt, columns=X_train.columns)

In [None]:
pred = rfc.predict(df)

In [None]:
print (pred)