In [8]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
from sklearn.linear_model import LogisticRegression


In [9]:
data=pd.read_csv('data.csv',encoding="ISO-8859-1")


In [10]:
data.fillna(0, inplace=True)


In [11]:
data.head()

Unnamed: 0,Temperature,Humidity,pm2_5,CO
0,22.4,19.995,0.0,0.92
1,24.93,20.73,0.0,0.97
2,23.44,17.387,0.0,17.4
3,22.5,18.725,0.0,1.7
4,22.0,20.622,0.0,22.1


In [12]:
data.shape

(4999, 4)

## Data understanding

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  4999 non-null   float64
 1   Humidity     4999 non-null   float64
 2   pm2_5        4999 non-null   float64
 3   CO           4999 non-null   float64
dtypes: float64(4)
memory usage: 156.3 KB


In [14]:
data.isnull().sum()

Temperature    0
Humidity       0
pm2_5          0
CO             0
dtype: int64

In [15]:
data.describe()

Unnamed: 0,Temperature,Humidity,pm2_5,CO
count,4999.0,4999.0,4999.0,4999.0
mean,32.171189,0.818006,6.569348,0.372903
std,13.715349,5.73115,14.711733,4.086251
min,0.0,0.0,0.0,0.0
25%,19.267,0.0,0.02,0.0
50%,31.681,0.0,0.68,0.0
75%,43.572,0.0,4.52,0.0
max,66.08,55.004,175.81,132.07


In [16]:
data.nunique()

Temperature    1508
Humidity         45
pm2_5          1368
CO              113
dtype: int64

In [17]:
data.columns

Index(['Temperature', 'Humidity', 'pm2_5', 'CO'], dtype='object')

## Calculating Air Quality Index

In [18]:
#Function to calculate co individual pollutant index(coi)
def calculate_coi(co):
    coi=0
    if co<= 4.4:
        coi = co * 50 / 4.4
    elif co <= 9.4:
        coi = 50 + ((co - 4.4) * 50 / 5)
    elif co <= 12.4:
        coi = 100 + ((co - 9.4) * 100 / 3)
    elif co <= 15.4:
        coi = 200 + ((co - 12.4) * 100 / 3)
    else:
        coi = 300 + ((co - 15.4) * 100 / 4.6)
    
    return coi
data['coi']=data['CO'].apply(calculate_coi)
df= data[['CO','coi']]
df.head()

Unnamed: 0,CO,coi
0,0.92,10.454545
1,0.97,11.022727
2,17.4,343.478261
3,1.7,19.318182
4,22.1,445.652174


In [19]:
#Function to calculate pm2.5 individual pollutant index(pm25i)
def calculate_pm25i(pm25_concentration):
    if pm25_concentration <= 12:
        pm25i = pm25_concentration * 50 / 12
    elif pm25_concentration <= 35.4:
        pm25i = 50 + ((pm25_concentration - 12) * 50 / 23.4)
    elif pm25_concentration <= 55.4:
        pm25i = 100 + ((pm25_concentration - 35.4) * 100 / 20)
    elif pm25_concentration <= 150.4:
        pm25i = 200 + ((pm25_concentration - 55.4) * 100 / 95)
    elif pm25_concentration <= 250.4:
        pm25i = 300 + ((pm25_concentration - 150.4) * 100 / 100)
    elif pm25_concentration <= 350.4:
        pm25i = 400 + ((pm25_concentration - 250.4) * 100 / 100)
    elif pm25_concentration <= 500.4:
        pm25i = 500 + ((pm25_concentration - 350.4) * 100 / 150)
    else:
        pm25i = None  
        
    return pm25i

data['pm25i']=data['pm2_5'].apply(calculate_pm25i)
df= data[['pm2_5','pm25i']]
df.head()



Unnamed: 0,pm2_5,pm25i
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [21]:
def calculate_aqi(pm25i, coi, temperature, humidity):
    aqi=0
    if(pm25i>coi and pm25i>temperature and pm25i>humidity):
        aqi=pm25i
    if(coi>pm25i and coi>temperature and coi>humidity):
        aqi=coi
        
    return aqi
data['AQI']=data.apply(lambda x:calculate_aqi(x['pm25i'],x['coi'],x['Temperature'],x['Humidity']),axis=1)
df= data[['pm25i','coi','Temperature','Humidity','AQI']]
df.head()

Unnamed: 0,pm25i,coi,Temperature,Humidity,AQI
0,0.0,10.454545,22.4,19.995,0.0
1,0.0,11.022727,24.93,20.73,0.0
2,0.0,343.478261,23.44,17.387,343.478261
3,0.0,19.318182,22.5,18.725,0.0
4,0.0,445.652174,22.0,20.622,445.652174


In [22]:
def AQI_Range(x):
    if x<=50:
        return "Good"
    elif x>50 and x<=100:
        return "Moderate"
    elif x>100 and x<=200:
        return "Poor"
    elif x>200 and x<=300:
        return "Unhealthy"
    elif x>300 and x<=400:
        return "Very Unhealthy"
    elif x>400:
        return "Hazardous"
    
df['AQI_Range']=data['AQI'].apply(AQI_Range)
df.head()

Unnamed: 0,pm25i,coi,Temperature,Humidity,AQI,AQI_Range
0,0.0,10.454545,22.4,19.995,0.0,Good
1,0.0,11.022727,24.93,20.73,0.0,Good
2,0.0,343.478261,23.44,17.387,343.478261,Very Unhealthy
3,0.0,19.318182,22.5,18.725,0.0,Good
4,0.0,445.652174,22.0,20.622,445.652174,Hazardous


In [23]:
df['AQI_Range'].value_counts()

Good              4105
Moderate           606
Poor               150
Unhealthy           96
Hazardous           28
Very Unhealthy      14
Name: AQI_Range, dtype: int64

## Splitting dataset into dependent and independent column

In [24]:
X=df[['pm25i','coi','Temperature','Humidity']]
Y=df['AQI']
X.head()

Unnamed: 0,pm25i,coi,Temperature,Humidity
0,0.0,10.454545,22.4,19.995
1,0.0,11.022727,24.93,20.73
2,0.0,343.478261,23.44,17.387
3,0.0,19.318182,22.5,18.725
4,0.0,445.652174,22.0,20.622


In [25]:
Y.head()

0      0.000000
1      0.000000
2    343.478261
3      0.000000
4    445.652174
Name: AQI, dtype: float64

In [27]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=70)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(3999, 4) (1000, 4) (3999,) (1000,)


# RandomForestRegressor

In [28]:
RF=RandomForestRegressor().fit(X_train,Y_train)


In [29]:
#predicting train
Train_preds1=RF.predict(X_train)
#predicting test
Test_preds1=RF.predict(X_test)



In [30]:
RMSE_train=(np.sqrt(metrics.mean_squared_error(Y_train, Train_preds1))) 
RMSE_test=(np.sqrt(metrics.mean_squared_error(Y_test, Test_preds1))) 
print("RMSE TrainingData=", str(RMSE_train)) 
print("RMSE TestData=", str(RMSE_test)) 
print('-'*50) 
print('RSquared value on train:',RF.score(X_train, Y_train)) 
print('RSquared value on test:',RF.score (X_test, Y_test))


RMSE TrainingData= 3.9138053103233266
RMSE TestData= 3.738538726000864
--------------------------------------------------
RSquared value on train: 0.998526191665603
RSquared value on test: 0.9957138711222199


## Logistic Regression

In [81]:
X2=df[['pm25i','coi','Temperature','Humidity']]
Y2=df['AQI_Range']

In [82]:
X_train2,X_test2,Y_train2,Y_test2=train_test_split(X2,Y2,test_size=0.33,random_state=70)

In [83]:
log_reg =linear_model.LogisticRegression().fit(X_train2,Y_train2)

In [84]:
#predict on train
Train_preds2 =log_reg.predict(X_train2)

#accuracy on train
print("Model accuracy on train is: ",accuracy_score(Y_train2, Train_preds2))



Model accuracy on train is:  0.9441624365482234


In [85]:
#predict on test
Test_preds2 =log_reg.predict(X_test2)

#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test2,Test_preds2))
print('-'*50)

#kappa score.
print("kappaScore is: ",metrics.cohen_kappa_score(Y_test2,Test_preds2))

Model accuracy on test is:  0.9472727272727273
--------------------------------------------------
kappaScore is:  0.8290137110635454


### Testing with random values

In [86]:
log_reg.predict([[723.3,456.4,77.8,92.4]])

array(['Unhealthy'], dtype=object)

In [87]:
log_reg.predict([[200.3,46.4,77.8,92.4]])

array(['Good'], dtype=object)