# EDA - Exploratory Data Analysis

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

## Loading CSV with Pandas plus 1st look into the data

In [55]:
#use pandas to load and convert the CSV
df = pd.read_csv("london-air-quality.csv")


In [56]:
#printing the 1st and last 5 lines of the dataframe
print(df.head(-10))

           date  pm25  pm10   o3  no2  so2  co
0      2023/1/3    48    13   28   18    1   3
1      2023/1/4    36    18   27    9    1   1
2      2023/1/5    37    16   23   16    1   3
3      2023/1/6    30     8   25   19    1   3
4      2023/1/7    13    13   30   12    1   3
...         ...   ...   ...  ...  ...  ...  ..
3273  2014/3/27    68    34   19   48    4   2
3274  2014/3/28    86    54   15   45    4   3
3275  2014/3/29   137    52   31   38    4   3
3276  2014/3/30   113    34   27   49    4   3
3277  2014/3/31    82    45   17   50    6   4

[3278 rows x 7 columns]

Sorted
           date  pm25  pm10   o3  no2  so2  co
0      2023/1/3    48    13   28   18    1   3
1      2023/1/4    36    18   27    9    1   1
2      2023/1/5    37    16   23   16    1   3
3      2023/1/6    30     8   25   19    1   3
4      2023/1/7    13    13   30   12    1   3
...         ...   ...   ...  ...  ...  ...  ..
3273  2014/3/27    68    34   19   48    4   2
3274  2014/3/28    86    54

We can start by noticing that there 7 columns: 1 for the data, and 6 for variables values. Furthermore, the dataframe seems to be sorted by data, but the final rows seem out of order,



In [31]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3288 entries, 0 to 3287
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3288 non-null   object
 1    pm25   3288 non-null   object
 2    pm10   3288 non-null   object
 3    o3     3288 non-null   object
 4    no2    3288 non-null   object
 5    so2    3288 non-null   object
 6    co     3288 non-null   object
dtypes: object(7)
memory usage: 179.9+ KB
None


We see that all columns are dtype string when they should be: datetime (for the date) and floats or integers for all other variables for us to be able to study and learn from them.

## Dtype Convertion

Above, we saw that all columns are of the dtype object which means they are strings. Before we do anything else we need to convert them all to numbers we can read (floats or integers).

In [48]:
#1st convert data column to datetime object
df["date"] = pd.to_datetime(df["date"], format = "%Y/%m/%d" )

#2nd convert all other columns, that we know should be floats
for col in df.columns[1:]:
    #notice that column variable names come with a whitespace before the actual name, we will remove it
    new_name = col.strip()
    df.rename(columns= {f"{col}": f"{new_name}"}, inplace=True)
    #conver string to numeric values
    df[f"{new_name}"] = pd.to_numeric(df[f"{new_name}"], errors="coerce")

In [50]:
#check if dtypes are correct now
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3288 entries, 0 to 3287
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3288 non-null   datetime64[ns]
 1   pm25    3278 non-null   float64       
 2   pm10    3279 non-null   float64       
 3   o3      3243 non-null   float64       
 4   no2     3246 non-null   float64       
 5   so2     2762 non-null   float64       
 6   co      3149 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 179.9 KB
None
        date  pm25  pm10    o3   no2  so2   co
0 2023-01-03  48.0  13.0  28.0  18.0  1.0  3.0
1 2023-01-04  36.0  18.0  27.0   9.0  1.0  1.0
2 2023-01-05  37.0  16.0  23.0  16.0  1.0  3.0
3 2023-01-06  30.0   8.0  25.0  19.0  1.0  3.0
4 2023-01-07  13.0  13.0  30.0  12.0  1.0  3.0


In [51]:
df.describe()

Unnamed: 0,pm25,pm10,o3,no2,so2,co
count,3278.0,3279.0,3243.0,3246.0,2762.0,3149.0
mean,59.790726,24.659652,24.66605,34.465496,3.677408,4.822483
std,21.620866,9.919508,9.541935,14.698088,2.483671,2.663329
min,13.0,5.0,1.0,3.0,1.0,1.0
25%,46.0,18.0,19.0,23.0,2.0,3.0
50%,57.0,23.0,25.0,34.0,3.0,4.0
75%,69.0,29.0,30.0,45.0,5.0,6.0
max,177.0,89.0,91.0,92.0,27.0,19.0


In [60]:
#be sure the df is sorted by date
df.sort_values(by=["date"])
#be sure there are no days missing
assert df["date"].isnull().sum() == 0

True


## Data exploration

We are now ready to explore each column and see patterns, distributions, errors, and quirks in the data.