# **Airquality** dataset

Notebook for loading and preprocessing of data from: https://archive.ics.uci.edu/ml/machine-learning-databases/00501/PRSA2017_Data_20130301-20170228.zip

In [15]:
# Loading libraries

import os
import re
import pandas as pd
import numpy as np
import requests
import io
import zipfile

1) Download data

In [None]:
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00501/PRSA2017_Data_20130301-20170228.zip'

# A timeout makes the cell fail fast instead of hanging on an unresponsive server.
r = requests.get(URL, timeout=60)
# Surface HTTP errors (404, 500, ...) here rather than as a confusing BadZipFile below.
r.raise_for_status()

# Open the downloaded bytes as a zip archive held in memory (nothing is written to disk here).
z = zipfile.ZipFile(io.BytesIO(r.content))

2) Unzip the file

In [12]:
def descomprimir_zip(archivo_zip, carpeta_destino):
    """Extract every member of a zip archive into a destination folder.

    Parameters
    ----------
    archivo_zip : str or path-like
        Path of the zip file to read.
    carpeta_destino : str or path-like
        Folder that receives the extracted files. It is created (including
        missing parent folders) when it does not exist yet.

    Returns
    -------
    None
        A confirmation message is printed once extraction has finished.
    """
    # exist_ok=True creates the folder only when needed, without a separate
    # "does it exist?" check that could race with another process.
    os.makedirs(carpeta_destino, exist_ok=True)

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(archivo_zip, 'r') as zip_ref:
        zip_ref.extractall(carpeta_destino)
        print(f"Todos los archivos han sido descomprimidos en {carpeta_destino}")


# NOTE(review): both paths are absolute and point inside one specific Windows user
# profile; they must be adjusted before this cell can run on another machine.
archivo_zip = r'C:\Users\mar27\OneDrive\Documentos\CURSOS\DATA_ANALYTICS_UPGRADEHUB\1_MODULO\05_Preprocesamiento-de-Datos\content\airquality_dataset\PRSA2017_Data_20130301-20170228.zip'  # Change this to the path of your zip file
carpeta_destino = r'C:\Users\mar27\OneDrive\Documentos\CURSOS\DATA_ANALYTICS_UPGRADEHUB\1_MODULO\05_Preprocesamiento-de-Datos\content\airquality_dataset'  # Change this to the path of the destination folder

# Extract the whole archive into the destination folder.
descomprimir_zip(archivo_zip, carpeta_destino)


Todos los archivos han sido descomprimidos en C:\Users\mar27\OneDrive\Documentos\CURSOS\DATA_ANALYTICS_UPGRADEHUB\1_MODULO\05_Preprocesamiento-de-Datos\content\airquality_dataset


3) Create an empty dataframe

In [16]:
# Initialise `df` as an empty DataFrame (no rows, no columns).
df = pd.DataFrame()

4) Iterate over the CSV files in the extracted `PRSA_Data_20130301-20170228` folder and read them with pandas

In [17]:
# Folder that holds the extracted CSV files (relative to the working directory).
data_dir = './PRSA_Data_20130301-20170228'

# Read each CSV once, in sorted order: os.listdir() returns names in arbitrary
# order, so sorting keeps the row order reproducible across machines.
frames = [
    pd.read_csv(os.path.join(data_dir, file))
    for file in sorted(os.listdir(data_dir))
    if file.endswith('.csv')
]

# Concatenate a single time instead of growing `df` inside the loop, which
# re-copies all previously loaded rows for every file. Building `df` from
# `frames` alone also makes the cell safe to re-run (no duplicated rows).
# Each file keeps its own row index, so index labels repeat across files.
df = pd.concat(frames) if frames else pd.DataFrame()

In [18]:
# Display the combined frame; pandas truncates the rendered rows for large frames.
df

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
1,2,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin
2,3,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin
3,4,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin
4,5,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,35060,2017,2,28,19,11.0,32.0,3.0,24.0,400.0,72.0,12.5,1013.5,-16.2,0.0,NW,2.4,Wanshouxigong
35060,35061,2017,2,28,20,13.0,32.0,3.0,41.0,500.0,50.0,11.6,1013.6,-15.1,0.0,WNW,0.9,Wanshouxigong
35061,35062,2017,2,28,21,14.0,28.0,4.0,38.0,500.0,54.0,10.8,1014.2,-13.3,0.0,NW,1.1,Wanshouxigong
35062,35063,2017,2,28,22,12.0,23.0,4.0,30.0,400.0,59.0,10.5,1014.4,-12.9,0.0,NNW,1.2,Wanshouxigong


5) Check duplicates

In [19]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

0

6) Check nulls

In [20]:
# Percentage of missing values per column: null count / total rows * 100.
df.isnull().sum().div(len(df)).mul(100)

No         0.000000
year       0.000000
month      0.000000
day        0.000000
hour       0.000000
PM2.5      2.076916
PM10       1.532674
SO2        2.143937
NO2        2.879497
CO         4.919813
O3         3.155421
TEMP       0.094589
PRES       0.093401
DEWP       0.095777
RAIN       0.092688
wd         0.433018
WSPM       0.075576
station    0.000000
dtype: float64

7) Data types

In [21]:
# dtype of each column; the imputation step below branches on these.
df.dtypes

No           int64
year         int64
month        int64
day          int64
hour         int64
PM2.5      float64
PM10       float64
SO2        float64
NO2        float64
CO         float64
O3         float64
TEMP       float64
PRES       float64
DEWP       float64
RAIN       float64
wd          object
WSPM       float64
station     object
dtype: object

8) Impute missing values

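We fill the null values with the function below, which handles float, object and integer columns: float columns get the column mean, object (text) columns the most frequent value, and integer columns the median. This simple imputation is acceptable here because nulls make up a very small percentage of the data (under 5% in every column), so it should have little effect on the model.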

If the percentage were higher (for example, 20%), we could use KNN imputation instead.

In [22]:
def impute_missing_values(df):
    """Fill missing values column by column and return the same DataFrame.

    The fill value depends on the column dtype:

    * ``float64``                      -> column mean
    * object, string or categorical    -> most frequent value (first mode)
    * anything else                    -> column median

    Columns without missing values are left untouched. A text/categorical
    column that contains no non-null value has no mode, so it is left as-is
    instead of raising ``IndexError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with missing values filled.
    """
    for col in df.columns:
        column = df[col]

        # Nothing to fill: skip the statistics entirely.
        if not column.isnull().any():
            continue

        dtype = column.dtype
        if dtype == 'float64':
            fill_value = column.mean()
        elif (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            # median() is not defined for text/categorical data; use the mode.
            modes = column.mode()
            if modes.empty:
                # Every value is null, so there is no value to impute with.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()

        df[col] = column.fillna(fill_value)
    return df

# Apply the imputation to the combined frame and keep the result in `df`.
df = impute_missing_values(df)

In [23]:
# Re-count nulls per column to confirm that the imputation left none behind.
print(df.isnull().sum())

No         0
year       0
month      0
day        0
hour       0
PM2.5      0
PM10       0
SO2        0
NO2        0
CO         0
O3         0
TEMP       0
PRES       0
DEWP       0
RAIN       0
wd         0
WSPM       0
station    0
dtype: int64
