# Load the packages

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean
from datetime import datetime
import cufflinks as cf
import seaborn as sns

# Load all csv files

In [26]:
# Station lookup tables, one per data set directory.
station_list_india, station_list_madrid = (
    pd.read_csv("../air-quality-india/stations_india.csv"),
    pd.read_csv("../air-quality-madrid/stations.csv"),
)

# Readings from the air-quality-india directory (city_* and station_* files).
city_day, city_hour = (
    pd.read_csv("../air-quality-india/city_day.csv"),
    pd.read_csv("../air-quality-india/city_hour.csv"),
)
station_day, station_hour = (
    pd.read_csv("../air-quality-india/station_day.csv"),
    pd.read_csv("../air-quality-india/station_hour.csv"),
)

# Readings from the air-quality-madrid directory, one file per year 2015-2018.
madrid_2015, madrid_2016, madrid_2017, madrid_2018 = (
    pd.read_csv("../air-quality-madrid/madrid_2015.csv"),
    pd.read_csv("../air-quality-madrid/madrid_2016.csv"),
    pd.read_csv("../air-quality-madrid/madrid_2017.csv"),
    pd.read_csv("../air-quality-madrid/madrid_2018.csv"),
)

# Remove rows containing NULL values from the data sets

In [None]:
# Build cleaned copies that keep only rows with no missing value in any column
# (axis=0 / how='any' are the dropna defaults, spelled out here for the reader).
# NOTE(review): in the visible notebook, india_data and madrid_data are only
# assigned in the later "Merge the datasets" cell, so this cell raises NameError
# when the notebook is run top-to-bottom on a fresh kernel.
filtered_india_data = india_data.dropna(axis=0, how='any')
filtered_madrid_data = madrid_data.dropna(axis=0, how='any')

# Rename the columns and remove the index columns

In [79]:
# Rename the Madrid station table's 'id'/'name' columns to 'StationId'/'StationName'.
station_list_madrid.rename(columns = {'id':'StationId', 'name':'StationName'}, inplace = True)
# Store the Madrid station ids with object dtype.
# NOTE(review): presumably so they share a key dtype with the India station ids — confirm.
station_list_madrid['StationId'] = station_list_madrid['StationId'].astype('object')
# Drop the first column of the India station table, in place.
# NOTE(review): positional and in-place, so every re-run of this cell drops one more column.
station_list_india.drop(station_list_india.columns[0], axis=1, inplace=True)
# Give the hourly tables the same timestamp column name ('Date') that the rest of the notebook uses.
station_hour.rename(columns = {'Datetime':'Date'}, inplace = True)
city_hour.rename(columns = {'Datetime':'Date'}, inplace = True)
# Drop from india_data the column whose name is station_list_india's current first column
# (i.e. the first column remaining after the drop performed earlier in this cell).
# NOTE(review): looks like a copy-paste of the station_list_india line — confirm which
# column is meant to be removed from india_data.
# NOTE(review): in the visible notebook, india_data is only assigned in the later
# "Merge the datasets" cell, so this line fails on a fresh top-to-bottom run.
india_data.drop(station_list_india.columns[0], axis=1, inplace=True)
# Rename the Madrid reading columns ('date', 'station', 'BEN', 'NO_2', 'O_3', 'SO_2').
# NOTE(review): presumably to match the India column names before the frames are concatenated — confirm.
filtered_madrid_data.rename(columns = {'date':'Date', 'station': 'StationId', 'BEN': 'Benzene', 'NO_2':'NO2', 'O_3': 'O3', 'SO_2': 'SO2'}, inplace = True)
# Suffix the India station metadata columns with '_Station'.
# NOTE(review): presumably to avoid name clashes with same-named columns in the readings when merging — confirm.
station_list_india.rename(columns = {'City':'City_Station', 'State': 'State_Station', 'Status': 'Status_Station', 'Region':'Region_Station'}, inplace = True)

# Change the datatypes

In [None]:
# Parse the 'Date' column of each frame into pandas datetime values.
filtered_india_data['Date'] = pd.to_datetime(filtered_india_data['Date'])
india_data['Date'] = pd.to_datetime(india_data['Date'])
filtered_madrid_data['Date'] = pd.to_datetime(filtered_madrid_data['Date'])

# Add a constant 'Region' column to the Madrid readings.
filtered_madrid_data['Region'] = 'Spain'

# Merge the datasets

In [80]:
# Stack the per-source tables into one frame each.
#
# Every concat uses ignore_index=True so the result gets a fresh 0..n-1 index
# (each input frame carries its own index, so stacking without it repeats
# labels), and sort=False so columns that are not shared by all inputs keep
# their order of appearance instead of depending on the pandas version's
# default. This matches the other concat calls in this notebook.
#
# NOTE(review): in the visible notebook, madrid_data and india_data are used by
# earlier cells (the dropna, rename and dtype cells) before they are assigned
# here; the cells need reordering for a clean Restart & Run All.

# Station lookup tables from both data sets.
station_list = pd.concat(
    [station_list_madrid, station_list_india],
    ignore_index=True,
    sort=False,
)

# Madrid readings, one input frame per year.
madrid_data = pd.concat(
    [madrid_2015, madrid_2016, madrid_2017, madrid_2018],
    ignore_index=True,
    sort=False,
)

# India readings from the station_* and city_* files.
india_data = pd.concat(
    [station_day, station_hour, city_hour, city_day],
    ignore_index=True,
    sort=False,
)

In [75]:
# Stack the cleaned India and Madrid readings into a single frame with a fresh
# 0..n-1 index; sort=False leaves non-shared columns in order of appearance.
aq_data = pd.concat(
    objs=[filtered_india_data, filtered_madrid_data],
    sort=False,
    ignore_index=True,
)

StationId                         object
Date                      datetime64[ns]
PM2.5                            float64
PM10                             float64
NO                               float64
NO2                              float64
NOx                              float64
NH3                              float64
CO                               float64
SO2                              float64
O3                               float64
Benzene                          float64
Toluene                          float64
Xylene                           float64
AQI                              float64
AQI_Bucket                        object
StationName                       object
City                              object
State                             object
Status                            object
Region                            object
Month                             object
Year                             float64
Season                            object
Weekday_or_weeke

In [81]:
# Display the dtype of every column in the combined station lookup table.
station_list.dtypes

StationId       object
StationName     object
address         object
lon            float64
lat            float64
elevation      float64
City            object
State           object
Status          object
Region          object
dtype: object

# Generate the final dataset and save it to a file

In [85]:
# Attach station metadata to every reading by joining on 'StationId'.
# how="inner" is the merge default, written out here: readings whose StationId
# has no match in station_list are not carried into final_data.
final_data = aq_data.merge(station_list, how="inner", on="StationId")

# Persist the merged frame one directory up, without writing the row index.
final_data.to_csv('../final_dataset.csv', index=False)

In [86]:
# Preview the first five rows of the merged dataset.
final_data.head(5)

Unnamed: 0,StationId,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,...,CH4,StationName_y,address,lon,lat,elevation,City_Station,State_Station,Status_Station,Region_Station
0,AP001,2017-11-25 09:00:00,104.0,148.5,1.93,23.0,13.75,9.8,0.1,15.3,...,,"Secretariat, Amaravati - APPCB",,,,,Amaravati,Andhra Pradesh,Active,5. Western
1,AP001,2017-11-25 10:00:00,94.5,142.0,1.33,16.25,9.75,9.65,0.1,17.0,...,,"Secretariat, Amaravati - APPCB",,,,,Amaravati,Andhra Pradesh,Active,5. Western
2,AP001,2017-11-25 11:00:00,82.75,126.5,1.47,14.83,9.07,9.7,0.1,15.4,...,,"Secretariat, Amaravati - APPCB",,,,,Amaravati,Andhra Pradesh,Active,5. Western
3,AP001,2017-11-25 14:00:00,68.5,117.0,1.35,13.6,8.35,7.4,0.1,21.8,...,,"Secretariat, Amaravati - APPCB",,,,,Amaravati,Andhra Pradesh,Active,5. Western
4,AP001,2017-11-25 15:00:00,69.25,112.25,1.52,11.8,7.55,9.25,0.1,21.38,...,,"Secretariat, Amaravati - APPCB",,,,,Amaravati,Andhra Pradesh,Active,5. Western
