In [1]:
#import libraries 
import pandas as pd
import numpy as np

### 01 - Getting Data
- in this research we will use three different datasets: two (for Nitrogen Dioxide and Particles Matter levels) of them for London and one for Paris air quality. So, lets import them

In [2]:
# import London Nitrogen Dioxide (NO2) dataset 
londonno2 = pd.read_csv('/Users/Lenovo/Desktop/NO2_full_network_20190101_20191231.csv') 

In [3]:
londonno2.head()

Unnamed: 0,date_UTC,no2_ugm3,pod_id_location,pod_id,location_name,ratification_status
0,2019-02-15T17:00:00Z,61.99,1245,1245,Dollis Road,P
1,2019-02-15T18:00:00Z,67.33,1245,1245,Dollis Road,P
2,2019-02-15T19:00:00Z,72.61,1245,1245,Dollis Road,P
3,2019-02-15T20:00:00Z,67.24,1245,1245,Dollis Road,P
4,2019-02-15T21:00:00Z,59.78,1245,1245,Dollis Road,P


In [4]:
# get London Particles Matter 2.5 (PM2.5) data 
londonpm25 = pd.read_csv('/Users/Lenovo/Desktop/PM2.5_full_network_20190101_20191231.csv') 

In [5]:
londonpm25.head()

Unnamed: 0,date_UTC,pm2_5_ugm3,pod_id_location,pod_id,location_name,ratification_status
0,2019-02-15T13:00:00Z,20.17,1245,1245,Dollis Road,P
1,2019-02-15T14:00:00Z,18.47,1245,1245,Dollis Road,P
2,2019-02-15T15:00:00Z,19.84,1245,1245,Dollis Road,P
3,2019-02-15T16:00:00Z,36.68,1245,1245,Dollis Road,P
4,2019-02-15T17:00:00Z,20.18,1245,1245,Dollis Road,P


In [6]:
# Get Paris air dataset 
parisair = pd.read_csv('/Users/Lenovo/Desktop/2019_A1.csv') 

In [7]:
parisair.head()

Unnamed: 0.1,Unnamed: 0,date,A1:CO,A1:PM10,A1:PM25,A1:NO2,A1:NO,A1:NOX
0,0,,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis
1,1,,A1,A1,A1,A1,A1,A1
2,2,,Monoxyde de carbone,PM 10 particules,"PM 2,5 particules",dioxyde d azote,monoxyde d azote,oxydes d azote
3,3,,CO,PM10,PM25,NO2,NO,NOX
4,4,,mg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3


In [8]:
# Get Paris regions 
paris_regions = pd.read_csv('/Users/Lenovo/Desktop/ecoles_creches_idf.csv') 

### 02 - Data Cleaning/Wrangling/EDA


#### 02.1 - London PM 2.5 dataset

In [9]:
# we need only PM2.5 info from this dataset, so lets drop the other columns
londonpm25.drop(['date_UTC', 'pod_id_location','pod_id','location_name','ratification_status'], axis = 1, inplace = True) 

In [10]:
londonpm25.describe()

Unnamed: 0,pm2_5_ugm3
count,666605.0
mean,-67.912651
std,272.879386
min,-999.0
25%,5.03
50%,7.28
75%,12.56
max,469.59


In [11]:
# replace -999 with median  
median = float(londonpm25['pm2_5_ugm3'].median())
londonpm25["pm2_5_ugm3"] = np.where(londonpm25["pm2_5_ugm3"] == -999, median, londonpm25['pm2_5_ugm3'])


In [12]:
londonpm25

Unnamed: 0,pm2_5_ugm3
0,20.17
1,18.47
2,19.84
3,36.68
4,20.18
...,...
666600,7.28
666601,7.28
666602,7.28
666603,7.28


In [13]:
# check for duplicates 
londonpm25.duplicated(subset=None, keep='first')

0         False
1         False
2         False
3         False
4         False
          ...  
666600     True
666601     True
666602     True
666603     True
666604     True
Length: 666605, dtype: bool

In [14]:
# remove duplicates 
londonpm = londonpm25.drop_duplicates()

In [15]:
# check for duplicates again 
londonpm.duplicated(subset=None, keep='first')

0         False
1         False
2         False
3         False
4         False
          ...  
654908    False
655123    False
655124    False
655125    False
655128    False
Length: 9490, dtype: bool

In [16]:
londonpm

Unnamed: 0,pm2_5_ugm3
0,20.17
1,18.47
2,19.84
3,36.68
4,20.18
...,...
654908,92.01
655123,86.73
655124,88.57
655125,88.41


#### 02.2 - London NO2 dataset

In [17]:
londonno2

Unnamed: 0,date_UTC,no2_ugm3,pod_id_location,pod_id,location_name,ratification_status
0,2019-02-15T17:00:00Z,61.99,1245,1245,Dollis Road,P
1,2019-02-15T18:00:00Z,67.33,1245,1245,Dollis Road,P
2,2019-02-15T19:00:00Z,72.61,1245,1245,Dollis Road,P
3,2019-02-15T20:00:00Z,67.24,1245,1245,Dollis Road,P
4,2019-02-15T21:00:00Z,59.78,1245,1245,Dollis Road,P
...,...,...,...,...,...,...
733825,2019-03-14T22:00:00Z,85.06,992450,99245,Finchley Road,P
733826,2019-03-14T23:00:00Z,64.31,992450,99245,Finchley Road,P
733827,2019-03-15T00:00:00Z,56.04,992450,99245,Finchley Road,P
733828,2019-03-15T01:00:00Z,47.78,992450,99245,Finchley Road,P


In [18]:
# we dont need IDs and ratification_status columns, so lets drop them 
londonno2.drop(['pod_id_location','pod_id', 'ratification_status'], axis = 1, inplace = True) 

In [19]:
londonno2.head()

Unnamed: 0,date_UTC,no2_ugm3,location_name
0,2019-02-15T17:00:00Z,61.99,Dollis Road
1,2019-02-15T18:00:00Z,67.33,Dollis Road
2,2019-02-15T19:00:00Z,72.61,Dollis Road
3,2019-02-15T20:00:00Z,67.24,Dollis Road
4,2019-02-15T21:00:00Z,59.78,Dollis Road


In [20]:
londonno2

Unnamed: 0,date_UTC,no2_ugm3,location_name
0,2019-02-15T17:00:00Z,61.99,Dollis Road
1,2019-02-15T18:00:00Z,67.33,Dollis Road
2,2019-02-15T19:00:00Z,72.61,Dollis Road
3,2019-02-15T20:00:00Z,67.24,Dollis Road
4,2019-02-15T21:00:00Z,59.78,Dollis Road
...,...,...,...
733825,2019-03-14T22:00:00Z,85.06,Finchley Road
733826,2019-03-14T23:00:00Z,64.31,Finchley Road
733827,2019-03-15T00:00:00Z,56.04,Finchley Road
733828,2019-03-15T01:00:00Z,47.78,Finchley Road


In [21]:
# check for duplicates 
londonno2.duplicated(subset=None, keep='first')

0         False
1         False
2         False
3         False
4         False
          ...  
733825    False
733826    False
733827    False
733828    False
733829    False
Length: 733830, dtype: bool

In [22]:
# replace -999 with median  
median = float(londonno2['no2_ugm3'].median())
londonno2["no2_ugm3"] = np.where(londonno2["no2_ugm3"] == -999, median, londonno2['no2_ugm3'])

In [23]:
londonno2

Unnamed: 0,date_UTC,no2_ugm3,location_name
0,2019-02-15T17:00:00Z,61.99,Dollis Road
1,2019-02-15T18:00:00Z,67.33,Dollis Road
2,2019-02-15T19:00:00Z,72.61,Dollis Road
3,2019-02-15T20:00:00Z,67.24,Dollis Road
4,2019-02-15T21:00:00Z,59.78,Dollis Road
...,...,...,...
733825,2019-03-14T22:00:00Z,85.06,Finchley Road
733826,2019-03-14T23:00:00Z,64.31,Finchley Road
733827,2019-03-15T00:00:00Z,56.04,Finchley Road
733828,2019-03-15T01:00:00Z,47.78,Finchley Road


In [24]:
# lets merge two dataframes 
londonair = pd.concat([londonno2, londonpm], axis=1, join="inner")

In [25]:
londonair

Unnamed: 0,date_UTC,no2_ugm3,location_name,pm2_5_ugm3
0,2019-02-15T17:00:00Z,61.99,Dollis Road,20.17
1,2019-02-15T18:00:00Z,67.33,Dollis Road,18.47
2,2019-02-15T19:00:00Z,72.61,Dollis Road,19.84
3,2019-02-15T20:00:00Z,67.24,Dollis Road,36.68
4,2019-02-15T21:00:00Z,59.78,Dollis Road,20.18
...,...,...,...,...
654908,2019-01-11T16:00:00Z,66.72,National Physical Laboratory,92.01
655123,2019-01-20T15:00:00Z,39.52,National Physical Laboratory,86.73
655124,2019-01-20T16:00:00Z,36.07,National Physical Laboratory,88.57
655125,2019-01-20T17:00:00Z,41.86,National Physical Laboratory,88.41


In [26]:
# we have location names, but not city name itsels. So, lets ad a city name (London) to df as well. 
londonair['city'] = 'London'

In [27]:
londonair

Unnamed: 0,date_UTC,no2_ugm3,location_name,pm2_5_ugm3,city
0,2019-02-15T17:00:00Z,61.99,Dollis Road,20.17,London
1,2019-02-15T18:00:00Z,67.33,Dollis Road,18.47,London
2,2019-02-15T19:00:00Z,72.61,Dollis Road,19.84,London
3,2019-02-15T20:00:00Z,67.24,Dollis Road,36.68,London
4,2019-02-15T21:00:00Z,59.78,Dollis Road,20.18,London
...,...,...,...,...,...
654908,2019-01-11T16:00:00Z,66.72,National Physical Laboratory,92.01,London
655123,2019-01-20T15:00:00Z,39.52,National Physical Laboratory,86.73,London
655124,2019-01-20T16:00:00Z,36.07,National Physical Laboratory,88.57,London
655125,2019-01-20T17:00:00Z,41.86,National Physical Laboratory,88.41,London


In [28]:
# for readibility, lets re-order column positions: to have 'city', 'NO2' and 'PM2.5' next to each other
london = londonair[['date_UTC', 'city','no2_ugm3', 'pm2_5_ugm3','location_name']]

In [29]:
# lets rename column modify column names for better readibility 
london1=london.rename(columns={"date_UTC": "date", "no2_ugm3": "no2", "pm2_5_ugm3":"pm2_5"})

In [30]:
# reset index 
london1.reset_index(drop=True, inplace=True)

In [31]:
london1.head()

Unnamed: 0,date,city,no2,pm2_5,location_name
0,2019-02-15T17:00:00Z,London,61.99,20.17,Dollis Road
1,2019-02-15T18:00:00Z,London,67.33,18.47,Dollis Road
2,2019-02-15T19:00:00Z,London,72.61,19.84,Dollis Road
3,2019-02-15T20:00:00Z,London,67.24,36.68,Dollis Road
4,2019-02-15T21:00:00Z,London,59.78,20.18,Dollis Road


In [32]:
# reset index per date 
london_final = london1.sort_values(by=['date'],ignore_index=True)

In [33]:
london_final

Unnamed: 0,date,city,no2,pm2_5,location_name
0,2019-01-01T00:00:00Z,London,30.62,16.92,Savernake Road
1,2019-01-01T00:00:00Z,London,30.62,88.66,Triangle Adventure Playground
2,2019-01-01T01:00:00Z,London,30.62,77.81,Hunslett Street
3,2019-01-01T01:00:00Z,London,30.62,41.67,Pattison Road
4,2019-01-01T02:00:00Z,London,43.31,123.38,Tower Bridge Road
...,...,...,...,...,...
9485,2019-12-31T20:00:00Z,London,27.66,64.73,West Smithsfield
9486,2019-12-31T21:00:00Z,London,14.89,33.61,Dollis Road
9487,2019-12-31T21:00:00Z,London,34.28,22.79,Savernake Road
9488,2019-12-31T22:00:00Z,London,12.69,33.77,Dollis Road


In [34]:
# check for dataframe 
london_final.info(memory_usage='deep') 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9490 entries, 0 to 9489
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           9490 non-null   object 
 1   city           9490 non-null   object 
 2   no2            9490 non-null   float64
 3   pm2_5          9490 non-null   float64
 4   location_name  9490 non-null   object 
dtypes: float64(2), object(3)
memory usage: 2.1 MB


In [35]:
# check for missing values
london_final.isna().sum() 

date             0
city             0
no2              0
pm2_5            0
location_name    0
dtype: int64

In [36]:
london_final.to_csv('/Users/Lenovo/Desktop/london_regions.csv')

#### 02.3 - Paris

In [37]:
paris_regions.head()

Unnamed: 0,ID,nom,departement,ville,CP,type,NO2_2012,NO2_2013,NO2_2014,NO2_2015,...,PM10_2015,PM10_2016,PM10_2017,PM25_2012,PM25_2013,PM25_2014,PM25_2015,PM25_2016,PM25_2017,geometry
0,0750026W,L'Enseignement par petits groupes (Ecole secon...,75,Paris 15,75015,secondaire,41.316912,42.916257,40.610102,37.182711,...,23.790138,23.077844,21.782889,17.152857,18.63708,14.895546,15.277787,14.806532,13.549347,"c(2.2908106734441, 48.8421565600018)"
1,0750106H,Ecole technologique privée CCIP Centre des For...,75,Paris 20,75020,secondaire,63.683878,60.890328,53.322156,45.804898,...,26.063983,23.803116,23.020608,22.692531,23.629341,18.628494,16.97874,15.434943,14.429643,"c(2.40580700033425, 48.8756657433978)"
2,0750132L,Ecole primaire privée Notre-Dame-Saint Roch,75,Paris 01,75001,primaire,41.507336,41.59842,42.855147,39.286859,...,22.995246,22.400441,20.867075,16.614189,18.142261,14.487745,14.562495,14.134427,12.561472,"c(2.33270615799049, 48.8661396641722)"
3,0750136R,Ecole primaire privée Saint-Sauveur,75,Paris 02,75002,primaire,45.197342,44.280198,45.187931,40.855468,...,23.650961,23.405547,22.650735,17.713481,19.089383,15.433904,15.067306,14.756645,15.11874,"c(2.34734689771536, 48.8668886093452)"
4,0750138T,Ecole primaire privée Sainte Geneviève du Marais,75,Paris 03,75003,primaire,40.109546,39.257252,39.688121,37.249166,...,22.808231,22.049934,20.717201,16.332941,17.645253,14.270936,14.391215,13.911639,12.654443,"c(2.36480092620121, 48.8569700986785)"


In [38]:
pr = paris_regions[['nom']]

In [39]:
pr

Unnamed: 0,nom
0,L'Enseignement par petits groupes (Ecole secon...
1,Ecole technologique privée CCIP Centre des For...
2,Ecole primaire privée Notre-Dame-Saint Roch
3,Ecole primaire privée Saint-Sauveur
4,Ecole primaire privée Sainte Geneviève du Marais
...,...
12525,Les p tits Cressonets
12526,Les lucioles
12527,Les coccinelles
12528,Soyer


In [40]:
# lets follow the same steps for paris air dataset.t
parisair.head()

Unnamed: 0.1,Unnamed: 0,date,A1:CO,A1:PM10,A1:PM25,A1:NO2,A1:NO,A1:NOX
0,0,,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis
1,1,,A1,A1,A1,A1,A1,A1
2,2,,Monoxyde de carbone,PM 10 particules,"PM 2,5 particules",dioxyde d azote,monoxyde d azote,oxydes d azote
3,3,,CO,PM10,PM25,NO2,NO,NOX
4,4,,mg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3


In [41]:
# lets merge two dataframes 
parisair = pd.concat([parisair, pr], axis=1, join="inner")

In [42]:
parisair

Unnamed: 0.1,Unnamed: 0,date,A1:CO,A1:PM10,A1:PM25,A1:NO2,A1:NO,A1:NOX,nom
0,0,,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,L'Enseignement par petits groupes (Ecole secon...
1,1,,A1,A1,A1,A1,A1,A1,Ecole technologique privée CCIP Centre des For...
2,2,,Monoxyde de carbone,PM 10 particules,"PM 2,5 particules",dioxyde d azote,monoxyde d azote,oxydes d azote,Ecole primaire privée Notre-Dame-Saint Roch
3,3,,CO,PM10,PM25,NO2,NO,NOX,Ecole primaire privée Saint-Sauveur
4,4,,mg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,Ecole primaire privée Sainte Geneviève du Marais
...,...,...,...,...,...,...,...,...,...
8760,8760,2019-12-31 20:00:00Z,0.915,85.4,57.2,81.3,255.8,474.1,Ecole primaire Henri Barbusse B
8761,8761,2019-12-31 21:00:00Z,1.233,98.8,65.6,102.3,336.4,618.7,Ecole maternelle Chantefleur
8762,8762,2019-12-31 22:00:00Z,1.588,117.9,85.5,112.8,395,718.9,Ecole élémentaire Anne Frank
8763,8763,2019-12-31 23:00:00Z,1.503,113.1,86.8,94.7,322.6,589.3,Ecole primaire Gustave Loiseau


In [43]:
# remove column 'Unnamed'
parisair = parisair.loc[:, ~parisair.columns.str.contains('^Unnamed')]

In [44]:
parisair

Unnamed: 0,date,A1:CO,A1:PM10,A1:PM25,A1:NO2,A1:NO,A1:NOX,nom
0,,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,L'Enseignement par petits groupes (Ecole secon...
1,,A1,A1,A1,A1,A1,A1,Ecole technologique privée CCIP Centre des For...
2,,Monoxyde de carbone,PM 10 particules,"PM 2,5 particules",dioxyde d azote,monoxyde d azote,oxydes d azote,Ecole primaire privée Notre-Dame-Saint Roch
3,,CO,PM10,PM25,NO2,NO,NOX,Ecole primaire privée Saint-Sauveur
4,,mg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,Ecole primaire privée Sainte Geneviève du Marais
...,...,...,...,...,...,...,...,...
8760,2019-12-31 20:00:00Z,0.915,85.4,57.2,81.3,255.8,474.1,Ecole primaire Henri Barbusse B
8761,2019-12-31 21:00:00Z,1.233,98.8,65.6,102.3,336.4,618.7,Ecole maternelle Chantefleur
8762,2019-12-31 22:00:00Z,1.588,117.9,85.5,112.8,395,718.9,Ecole élémentaire Anne Frank
8763,2019-12-31 23:00:00Z,1.503,113.1,86.8,94.7,322.6,589.3,Ecole primaire Gustave Loiseau


In [45]:
# lower and replace
parisair.columns=[e.lower().replace(' ', '_') for e in parisair.columns]   
parisair.columns

Index(['date', 'a1:co', 'a1:pm10', 'a1:pm25', 'a1:no2', 'a1:no', 'a1:nox',
       'nom'],
      dtype='object')

In [46]:
# drop columns
parisair.drop(['a1:co', 'a1:pm10', 'a1:no', 'a1:nox'], axis = 1, inplace = True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [47]:
# add city column 
parisair['city'] = 'Paris'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parisair['city'] = 'Paris'


In [48]:
parisair

Unnamed: 0,date,a1:pm25,a1:no2,nom,city
0,,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,L'Enseignement par petits groupes (Ecole secon...,Paris
1,,A1,A1,Ecole technologique privée CCIP Centre des For...,Paris
2,,"PM 2,5 particules",dioxyde d azote,Ecole primaire privée Notre-Dame-Saint Roch,Paris
3,,PM25,NO2,Ecole primaire privée Saint-Sauveur,Paris
4,,microg/m3,microg/m3,Ecole primaire privée Sainte Geneviève du Marais,Paris
...,...,...,...,...,...
8760,2019-12-31 20:00:00Z,57.2,81.3,Ecole primaire Henri Barbusse B,Paris
8761,2019-12-31 21:00:00Z,65.6,102.3,Ecole maternelle Chantefleur,Paris
8762,2019-12-31 22:00:00Z,85.5,112.8,Ecole élémentaire Anne Frank,Paris
8763,2019-12-31 23:00:00Z,86.8,94.7,Ecole primaire Gustave Loiseau,Paris


In [49]:
# rename columns 
paris_updated = parisair.rename(columns={"a1:pm25":"pm2_5", "a1:no2":"no2", 'nom':'location_name'})

In [50]:
paris_updated

Unnamed: 0,date,pm2_5,no2,location_name,city
0,,Autoroute A1 - Saint-Denis,Autoroute A1 - Saint-Denis,L'Enseignement par petits groupes (Ecole secon...,Paris
1,,A1,A1,Ecole technologique privée CCIP Centre des For...,Paris
2,,"PM 2,5 particules",dioxyde d azote,Ecole primaire privée Notre-Dame-Saint Roch,Paris
3,,PM25,NO2,Ecole primaire privée Saint-Sauveur,Paris
4,,microg/m3,microg/m3,Ecole primaire privée Sainte Geneviève du Marais,Paris
...,...,...,...,...,...
8760,2019-12-31 20:00:00Z,57.2,81.3,Ecole primaire Henri Barbusse B,Paris
8761,2019-12-31 21:00:00Z,65.6,102.3,Ecole maternelle Chantefleur,Paris
8762,2019-12-31 22:00:00Z,85.5,112.8,Ecole élémentaire Anne Frank,Paris
8763,2019-12-31 23:00:00Z,86.8,94.7,Ecole primaire Gustave Loiseau,Paris


In [51]:
# drop first 5 rows
paris1 = paris_updated.drop([0, 1, 2, 3, 4])

In [52]:
# reset index 
paris1.reset_index(drop=True, inplace=True)

In [53]:
# re-order column positions
paris = paris1[['date', 'city','no2', 'pm2_5', 'location_name']]

In [54]:
paris

Unnamed: 0,date,city,no2,pm2_5,location_name
0,2019-01-01 01:00:00Z,Paris,79.4,25.1,Ecole primaire privée Massillon
1,2019-01-01 02:00:00Z,Paris,75.1,14.7,Ecole primaire privée Saint Jean Gabriel
2,2019-01-01 03:00:00Z,Paris,110.5,21.9,Ecole élémentaire privée Les Francs Bourgeois
3,2019-01-01 04:00:00Z,Paris,111.4,24.5,Ecole primaire privée Sinaï
4,2019-01-01 05:00:00Z,Paris,114.2,20.1,Ecole primaire privée Sainte Catherine
...,...,...,...,...,...
8755,2019-12-31 20:00:00Z,Paris,81.3,57.2,Ecole primaire Henri Barbusse B
8756,2019-12-31 21:00:00Z,Paris,102.3,65.6,Ecole maternelle Chantefleur
8757,2019-12-31 22:00:00Z,Paris,112.8,85.5,Ecole élémentaire Anne Frank
8758,2019-12-31 23:00:00Z,Paris,94.7,86.8,Ecole primaire Gustave Loiseau


In [55]:
paris.info(memory_usage='deep') 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           8760 non-null   object
 1   city           8760 non-null   object
 2   no2            8576 non-null   object
 3   pm2_5          8582 non-null   object
 4   location_name  8580 non-null   object
dtypes: object(5)
memory usage: 3.0 MB


In [56]:
# check for missing values
paris.isnull().sum()

date               0
city               0
no2              184
pm2_5            178
location_name    180
dtype: int64

In [59]:
# check for nulls 
paris_clean = paris.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [63]:
# check for nulls again 
paris_clean.isnull().sum()

date             0
city             0
no2              0
pm2_5            0
location_name    0
dtype: int64

In [66]:
# reset index per date 
paris_final = paris_clean.sort_values(by=['date'],ignore_index=True)

In [67]:
paris_final

Unnamed: 0,date,city,no2,pm2_5,location_name
0,2019-01-01 01:00:00Z,Paris,79.4,25.1,Ecole primaire privée Massillon
1,2019-01-01 02:00:00Z,Paris,75.1,14.7,Ecole primaire privée Saint Jean Gabriel
2,2019-01-01 03:00:00Z,Paris,110.5,21.9,Ecole élémentaire privée Les Francs Bourgeois
3,2019-01-01 04:00:00Z,Paris,111.4,24.5,Ecole primaire privée Sinaï
4,2019-01-01 05:00:00Z,Paris,114.2,20.1,Ecole primaire privée Sainte Catherine
...,...,...,...,...,...
8343,2019-12-31 20:00:00Z,Paris,81.3,57.2,Ecole primaire Henri Barbusse B
8344,2019-12-31 21:00:00Z,Paris,102.3,65.6,Ecole maternelle Chantefleur
8345,2019-12-31 22:00:00Z,Paris,112.8,85.5,Ecole élémentaire Anne Frank
8346,2019-12-31 23:00:00Z,Paris,94.7,86.8,Ecole primaire Gustave Loiseau


In [68]:
# lets merge two dataframes 
london_paris=pd.concat([london_final,paris_final],axis=0)

In [69]:
# reset index after merging
london_paris.reset_index(drop=True, inplace=True)

In [70]:
london_paris

Unnamed: 0,date,city,no2,pm2_5,location_name
0,2019-01-01T00:00:00Z,London,30.62,16.92,Savernake Road
1,2019-01-01T00:00:00Z,London,30.62,88.66,Triangle Adventure Playground
2,2019-01-01T01:00:00Z,London,30.62,77.81,Hunslett Street
3,2019-01-01T01:00:00Z,London,30.62,41.67,Pattison Road
4,2019-01-01T02:00:00Z,London,43.31,123.38,Tower Bridge Road
...,...,...,...,...,...
17833,2019-12-31 20:00:00Z,Paris,81.3,57.2,Ecole primaire Henri Barbusse B
17834,2019-12-31 21:00:00Z,Paris,102.3,65.6,Ecole maternelle Chantefleur
17835,2019-12-31 22:00:00Z,Paris,112.8,85.5,Ecole élémentaire Anne Frank
17836,2019-12-31 23:00:00Z,Paris,94.7,86.8,Ecole primaire Gustave Loiseau


In [72]:
london_paris.to_csv('/Users/Lenovo/Desktop/london_paris_final.csv')