In [7]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Dataset Loading and Initial Analysis

In [8]:
path = "."
filename_read = os.path.join(path, "AirQuality.csv")
dataset = pd.read_csv(filename_read, sep = ";", decimal = ",")
dataset.replace(to_replace = -200, value = np.nan, inplace = True)

print(dataset.head())
print(dataset.info())
print(dataset.shape)
print(dataset.isnull().sum(axis = 0))

         Date      Time  CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  \
0  10/03/2004  18.00.00     2.6       1360.0     150.0      11.9   
1  10/03/2004  19.00.00     2.0       1292.0     112.0       9.4   
2  10/03/2004  20.00.00     2.2       1402.0      88.0       9.0   
3  10/03/2004  21.00.00     2.2       1376.0      80.0       9.2   
4  10/03/2004  22.00.00     1.6       1272.0      51.0       6.5   

   PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)  \
0         1046.0    166.0        1056.0    113.0        1692.0       1268.0   
1          955.0    103.0        1174.0     92.0        1559.0        972.0   
2          939.0    131.0        1140.0    114.0        1555.0       1074.0   
3          948.0    172.0        1092.0    122.0        1584.0       1203.0   
4          836.0    131.0        1205.0    116.0        1490.0       1110.0   

      T    RH      AH  Unnamed: 15  Unnamed: 16  
0  13.6  48.9  0.7578          NaN          NaN  
1  13.3  47.7  0

# NaN values cleaning

In [9]:
dataset.drop(['NMHC(GT)','Unnamed: 15', 'Unnamed: 16'], axis = 1,inplace=True, errors = 'ignore')
# Test performance change by dropping NA values or filling them with average of their column
dataset.dropna(inplace=True)

# fill_mean = lambda col : col.fillna(col.mean())
# dataset.apply(fill_mean, axis = 0)
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6941 entries, 0 to 9356
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           6941 non-null   object 
 1   Time           6941 non-null   object 
 2   CO(GT)         6941 non-null   float64
 3   PT08.S1(CO)    6941 non-null   float64
 4   C6H6(GT)       6941 non-null   float64
 5   PT08.S2(NMHC)  6941 non-null   float64
 6   NOx(GT)        6941 non-null   float64
 7   PT08.S3(NOx)   6941 non-null   float64
 8   NO2(GT)        6941 non-null   float64
 9   PT08.S4(NO2)   6941 non-null   float64
 10  PT08.S5(O3)    6941 non-null   float64
 11  T              6941 non-null   float64
 12  RH             6941 non-null   float64
 13  AH             6941 non-null   float64
dtypes: float64(12), object(2)
memory usage: 813.4+ KB
None


# Date split into Day, Month, Year

In [10]:
dataset['DateTime'] = pd.to_datetime(dataset['Date'].astype(str) + '-' + dataset['Time'].astype(str), format='%d/%m/%Y-%H.%M.%S')
dataset["Day"] = dataset["DateTime"].dt.day
dataset["Month"] = dataset["DateTime"].dt.month_name().astype('category').cat.codes
dataset["Year"] = dataset["DateTime"].dt.year
dataset["Weekday"] = dataset["DateTime"].dt.day_name().astype('category').cat.codes
dataset['Time'] = dataset["DateTime"].dt.hour
dataset.drop('Date', axis = 1,inplace=True, errors = 'ignore')

print(dataset.info())
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
Index: 6941 entries, 0 to 9356
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Time           6941 non-null   int32         
 1   CO(GT)         6941 non-null   float64       
 2   PT08.S1(CO)    6941 non-null   float64       
 3   C6H6(GT)       6941 non-null   float64       
 4   PT08.S2(NMHC)  6941 non-null   float64       
 5   NOx(GT)        6941 non-null   float64       
 6   PT08.S3(NOx)   6941 non-null   float64       
 7   NO2(GT)        6941 non-null   float64       
 8   PT08.S4(NO2)   6941 non-null   float64       
 9   PT08.S5(O3)    6941 non-null   float64       
 10  T              6941 non-null   float64       
 11  RH             6941 non-null   float64       
 12  AH             6941 non-null   float64       
 13  DateTime       6941 non-null   datetime64[ns]
 14  Day            6941 non-null   int32         
 15  Month          6941 non-nu

In [11]:
corr = dataset.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Time,CO(GT),PT08.S1(CO),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DateTime,Day,Month,Year,Weekday
Time,1.0,0.334863,0.287104,0.312121,0.349429,0.212155,-0.27514,0.351247,0.197884,0.213664,0.1875,-0.266642,-0.012305,-0.01414,0.012507,0.010539,-0.010742,0.004125
CO(GT),0.334863,1.0,0.877014,0.930008,0.91431,0.786456,-0.701038,0.67384,0.630834,0.85348,0.018334,0.064753,0.059346,0.020334,0.008363,0.110445,-0.062863,0.049215
PT08.S1(CO),0.287104,0.877014,1.0,0.87743,0.886068,0.707705,-0.762895,0.628263,0.67591,0.897166,0.028277,0.169234,0.149752,-0.014803,0.017307,0.098372,0.008029,0.042228
C6H6(GT),0.312121,0.930008,0.87743,1.0,0.982705,0.718344,-0.725722,0.603241,0.761805,0.861154,0.189003,-0.021592,0.187072,-0.081959,0.006162,0.179121,-0.156601,0.058746
PT08.S2(NMHC),0.349429,0.91431,0.886068,0.982705,1.0,0.705359,-0.78163,0.63331,0.774288,0.876777,0.228333,-0.046084,0.20559,-0.103637,0.000271,0.178384,-0.171024,0.05372
NOx(GT),0.212155,0.786456,0.707705,0.718344,0.705359,1.0,-0.662166,0.757029,0.233793,0.78855,-0.275998,0.232255,-0.144186,0.425542,-0.007137,0.139436,0.217982,0.037727
PT08.S3(NOx),-0.27514,-0.701038,-0.762895,-0.725722,-0.78163,-0.662166,1.0,-0.641377,-0.511223,-0.793364,-0.099495,-0.116479,-0.223381,-0.269086,0.005305,-0.101997,-0.173039,-0.028237
NO2(GT),0.351247,0.67384,0.628263,0.603241,0.63331,0.757029,-0.641377,1.0,0.142612,0.702524,-0.214325,-0.075333,-0.349646,0.365618,-0.000331,-0.005744,0.384341,0.038163
PT08.S4(NO2),0.197884,0.630834,0.67591,0.761805,0.774288,0.233793,-0.511223,0.142612,1.0,0.574242,0.566586,-0.00916,0.64639,-0.545047,0.002206,0.178562,-0.518358,0.048381
PT08.S5(O3),0.213664,0.85348,0.897166,0.861154,0.876777,0.78855,-0.793364,0.702524,0.574242,1.0,-0.046146,0.164821,0.075807,0.103162,-0.01279,0.121807,0.032893,0.064462


In [None]:
import seaborn as sns
# dataset_percent = dataset.sample(frac=0.2)
g = sns.PairGrid(dataset, hue="PT08.S3(NOx)")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
y = dataset['NOx(GT)']
dataset.drop('DateTime', axis = 1,inplace=True, errors = 'ignore')
dataset_final = dataset.drop('PT08.S3(NOx)',axis=1)
print(dataset_final.info())
# Time series dataframe cannot be loaded into sklearn -> Useful for analysis
dataset.drop('DateTime', axis = 1,inplace=True, errors = 'ignore')
X = dataset_final.values

In [None]:
from sklearn.model_selection import train_test_split
print(X.shape)
print(y.shape)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
print(ytrain[:10])
print(ytest[:10])

print(Xtrain[:10])
print(Xtest[:10])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn import metrics

# model = svm.SVR()
model = LinearRegression(fit_intercept=True)
print(model)
model.fit(Xtrain, ytrain)
#print(model.coef_)
ypred = model.predict(Xtest)

print(ytest[:10])
print(ytest.shape)
print(ypred[:10])
print(ypred.shape)

print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(ytest, ypred)))

In [None]:
def chart_regression(pred, y, sort=True):
    t = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()
    
chart_regression(ypred[:200].flatten(),ytest[:200].to_numpy(),sort=True) 

In [None]:
path = "."

#read in the data as csv
filename_read = os.path.join(path, "AirQuality.csv")
dataset = pd.read_csv(filename_read)

print(dataset.head())
print(dataset.info())


# dataset.drop(dataset.columns[0],axis=1, inplace=True)
# dataset.drop(['artist_name', 'track_id'], axis=1, inplace=True)
# dataset = dataset.sample(frac=0.1)
#dataset['popularity'] = np.where(dataset['popularity'] < 50, 0, 1)
# labels = dataset['popularity']


# le = LabelEncoder()
# # for label in ['artist_name', 'track_name','track_id', 'genre']:
# for label in ['track_name', 'genre']:
#     dataset[label] = le.fit_transform(dataset[label])
# # dataset[['artist_name', 'track_name','track_id', 'genre']] = dataset[['artist_name', 'track_name','track_id', 'genre']].astype(float)

# #print(labels)
# print(dataset.head())
# print(dataset.info())

# dataset_final = dataset.drop('popularity',axis=1)

# X = dataset_final.values
# #print(X[:10])
# y = labels
# print(y[:10])

# print(dataset_final.columns)
# print(X.shape)
# print(y.shape)
# print(type(X))  
# print(type(y))  



In [None]:
corr = dataset.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
print(dataset.iloc[:,:4])

In [None]:
import seaborn as sns
# dataset_percent = dataset.sample(frac=0.2)
g = sns.PairGrid(dataset.iloc[:,:4], hue="popularity")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
from sklearn.model_selection import train_test_split
print(X.shape)
print(y.shape)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=7)
print(ytrain[:10])
print(ytest[:10])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn import metrics

model = svm.SVR()
# model = LinearRegression(fit_intercept=True)
print(model)
model.fit(Xtrain, ytrain)
print(model.coef_)
ypred = model.predict(Xtest)

print(ytest[:10])
print(ytest.shape)
print(ypred[:10])
print(ypred.shape)

print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(ytest, ypred)))

In [None]:
def chart_regression(pred, y, sort=True):
    t = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()
    
chart_regression(ypred[:100].flatten(),ytest[:100].to_numpy(),sort=True)  

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(1)
knn.fit(Xtrain, ytrain)
predict = knn.predict(Xtest)
accuracy = accuracy_score(ytest, predict)
print(accuracy)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

gaussian = GaussianNB()
gaussian.fit(Xtrain, ytrain)
predict = gaussian.predict(Xtest)
accuracy = accuracy_score(ytest, predict)
print(accuracy)