In [1]:
import time
import datetime
import pandas as pd
import numpy as np
import datetime
import warnings
warnings.filterwarnings("ignore")
from matplotlib import pyplot
# Loading datasets
data = pd.read_csv("./Data/CO.csv", na_values = "NaN",  parse_dates=['Date'], date_parser = pd.to_datetime )

In [2]:
# Replacing -1 values (which is missing values) to NaN
data.replace(-1, 'NaN', inplace=True)
data = data[(data != "NaN")]

In [3]:
#Dataset Data Types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 708 entries, 0 to 707
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Date                               708 non-null    datetime64[ns]
 1   Source                             708 non-null    object        
 2   Site ID                            708 non-null    int64         
 3   POC                                708 non-null    int64         
 4   Daily Max 8-hour CO Concentration  708 non-null    float64       
 5   UNITS                              708 non-null    object        
 6   DAILY_AQI_VALUE                    708 non-null    int64         
 7   Site Name                          708 non-null    object        
 8   DAILY_OBS_COUNT                    708 non-null    int64         
 9   PERCENT_COMPLETE                   708 non-null    float64       
 10  AQS_PARAMETER_CODE                 708

In [4]:
#Statistical data Description
data.describe()

Unnamed: 0,Site ID,POC,Daily Max 8-hour CO Concentration,DAILY_AQI_VALUE,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,CBSA_CODE,STATE_CODE,COUNTY_CODE,SITE_LATITUDE,SITE_LONGITUDE
count,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0
mean,175736500.0,1.0,0.382627,4.293785,23.614407,98.396893,42101.0,16980.0,17.512712,60.737288,41.87804,-87.626098
std,5289935.0,0.0,0.222422,2.689203,2.270197,9.442559,0.0,0.0,0.500192,29.011121,0.25556,0.168901
min,170314200.0,1.0,0.0,0.0,3.0,13.0,42101.0,16980.0,17.0,31.0,41.629073,-87.799227
25%,170314200.0,1.0,0.2,2.0,24.0,100.0,42101.0,16980.0,17.0,31.0,41.629073,-87.799227
50%,180890000.0,1.0,0.3,3.0,24.0,100.0,42101.0,16980.0,18.0,89.0,41.629073,-87.461554
75%,180890000.0,1.0,0.5,6.0,24.0,100.0,42101.0,16980.0,18.0,89.0,42.139996,-87.461554
max,180890000.0,1.0,1.7,19.0,24.0,100.0,42101.0,16980.0,18.0,89.0,42.139996,-87.461554


In [5]:
#Describing Percentage for each column Total Missing Values in the Datasets
data.isnull().sum() * 100 / len(data)

Date                                 0.0
Source                               0.0
Site ID                              0.0
POC                                  0.0
Daily Max 8-hour CO Concentration    0.0
UNITS                                0.0
DAILY_AQI_VALUE                      0.0
Site Name                            0.0
DAILY_OBS_COUNT                      0.0
PERCENT_COMPLETE                     0.0
AQS_PARAMETER_CODE                   0.0
AQS_PARAMETER_DESC                   0.0
CBSA_CODE                            0.0
CBSA_NAME                            0.0
STATE_CODE                           0.0
STATE                                0.0
COUNTY_CODE                          0.0
COUNTY                               0.0
SITE_LATITUDE                        0.0
SITE_LONGITUDE                       0.0
dtype: float64

In [6]:
# Dataset shape with NaN Values
data.shape

(708, 20)

In [7]:
# NaN Values Drop and Clean Dataset 
data = data.dropna()

In [8]:
# After removing NaN Values the Dateset shape
data.shape

(708, 20)

In [9]:
#Extraction in the new column Year, Month, Day, Time from Date
data['Year'] = data.Date.dt.year.astype('int64')
data['Month'] = data.Date.dt.month.astype('int64')
data['Week'] = data.Date.dt.week.astype('int64')
data['Day'] = data.Date.dt.day.astype('int64')
# create a list of our conditions
conditions = [
    (data['DAILY_AQI_VALUE'] <= 50),
    (data['DAILY_AQI_VALUE'] > 50) & (data['DAILY_AQI_VALUE'] <= 100),
    (data['DAILY_AQI_VALUE'] > 100) & (data['DAILY_AQI_VALUE'] <= 150),
    (data['DAILY_AQI_VALUE'] > 151)
    ]

# create a list of the values we want to assign for each condition
values = ['1', '2', '3', '4']

# create a new column and use np.select to assign values to it using our lists as arguments
data['Level'] = np.select(conditions, values)

In [10]:
#Dataset
data

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 8-hour CO Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,...,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE,Year,Month,Week,Day,Level
0,2018-01-01,AQS,170314201,1,0.5,ppm,6,NORTHBROOK WATER PLANT,19,79.0,...,Illinois,31,Cook,42.139996,-87.799227,2018,1,1,1,1
1,2018-01-02,AQS,170314201,1,0.4,ppm,5,NORTHBROOK WATER PLANT,24,100.0,...,Illinois,31,Cook,42.139996,-87.799227,2018,1,1,2,1
2,2018-01-03,AQS,170314201,1,0.6,ppm,7,NORTHBROOK WATER PLANT,24,100.0,...,Illinois,31,Cook,42.139996,-87.799227,2018,1,1,3,1
3,2018-01-04,AQS,170314201,1,0.6,ppm,7,NORTHBROOK WATER PLANT,24,100.0,...,Illinois,31,Cook,42.139996,-87.799227,2018,1,1,4,1
4,2018-01-05,AQS,170314201,1,0.7,ppm,8,NORTHBROOK WATER PLANT,24,100.0,...,Illinois,31,Cook,42.139996,-87.799227,2018,1,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,2018-12-27,AQS,180890015,1,0.3,ppm,3,East Chicago- Post Office,24,100.0,...,Indiana,89,Lake,41.629073,-87.461554,2018,12,52,27,1
704,2018-12-28,AQS,180890015,1,0.2,ppm,2,East Chicago- Post Office,24,100.0,...,Indiana,89,Lake,41.629073,-87.461554,2018,12,52,28,1
705,2018-12-29,AQS,180890015,1,0.3,ppm,3,East Chicago- Post Office,24,100.0,...,Indiana,89,Lake,41.629073,-87.461554,2018,12,52,29,1
706,2018-12-30,AQS,180890015,1,0.2,ppm,2,East Chicago- Post Office,24,100.0,...,Indiana,89,Lake,41.629073,-87.461554,2018,12,52,30,1


In [11]:
# Spliting dataset for training data and for testing data
from sklearn.model_selection import train_test_split
# Classifier Algorithm libraries loading

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Features
f = data[['SITE_LATITUDE','SITE_LONGITUDE','COUNTY_CODE','Month','Week','Day']]
# Lables
l = data['Level']
# Spliting 30% data for test and 70% data fro Training
x_train, x_test, y_train, y_test = train_test_split(f, l, test_size=0.3,random_state=42)
# Evaluate Algorithms on default parameters
models = []
models.append(('(Multi-layer Perceptron)', MLPClassifier()))
models.append(('(K-nearest Neighbors)', KNeighborsClassifier()))
models.append(('(Naive Bayes)', GaussianNB()))
models.append(('(Logistic Regression)', LogisticRegression()))

for name, model in models:
    print("Model: ",name)
    # Execution time is calculating for each model accuracy
    start = time.time()
    # model fit for traning and prediction
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Accuracy Score
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    stop = time.time()
    print("Execution Time: ",stop-start)
    print("\n")
    print('F1:', metrics.f1_score(y_test, y_pred,average='micro'))
    print('Precision:', metrics.precision_score(y_test, y_pred,average='micro'))
    print('Recall:', metrics.recall_score(y_test, y_pred,average='micro'))
    print("\n")
    print("\n#######################################\n")

Model:  (Multi-layer Perceptron)
Accuracy: 1.0
Execution Time:  0.1939377784729004


F1: 1.0
Precision: 1.0
Recall: 1.0



#######################################

Model:  (K-nearest Neighbors)
Accuracy: 1.0
Execution Time:  0.04198622703552246


F1: 1.0
Precision: 1.0
Recall: 1.0



#######################################

Model:  (Naive Bayes)
Accuracy: 1.0
Execution Time:  0.01499485969543457


F1: 1.0
Precision: 1.0
Recall: 1.0



#######################################

Model:  (Logistic Regression)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: '1'