# National emissions reported to the Convention on Long-range Transboundary Air Pollution (LRTAP Convention), 2023

Data from 1990 to 2021

Source : https://www.eea.europa.eu/en/datahub/datahubitem-view/5be6cebc-ed2b-4496-be59-93736fc4ad78?activeAccordion=

By : Laurent Nopoly

Setup : Download CSV file from Source link, add it in project/data/raw

Documentation : (from source file NEC_NFR19_2023_2023.xlsx)

National Emission Ceilings (NEC) Directive Inventory - NFR19 sector classification - feature catalogue

## Libraries import

In [3]:
import pandas as pd
import plotly

## Feature catalogue

In [5]:
excel_file = '../references/LRTAP-Table_definition.xlsx'
df_catalogue = pd.read_excel(excel_file, sheet_name='Sheet1', header=2)
df_catalogue

Unnamed: 0,Name,Datatype,Definition
0,Emissions,float (8),Emission value.
1,Country_code,varchar(4),International Country Code. Note: ISO 3166-1-A...
2,Country,varchar(53),Country name.
3,Pollutant_name,varchar(20),"Short name of pollutant. Note: NOx (as NO2), N..."
4,Format_name,varchar(100),Name of guideline. Note: NFR19 sector classifi...
5,Sector_code,varchar(15),Sector code. Note: NFR19 sector classification
6,Parent_sector_code,varchar(15),Parent sector code. Note: NFR19 sector classif...
7,Sector_name,varchar(100),Sector name. Note: NFR19 sector classification
8,Year,varchar(4),Annual data. Note: 1990 - 2021
9,Unit,varchar(40),Emission unit.


## Dataset exploration

In [7]:
df = pd.read_csv("../data_store/raw/CLRTAP_NVFR19_V23_1_GF_csv.csv",sep='\t')

  df = pd.read_csv("../data_store/raw/CLRTAP_NVFR19_V23_1_GF_csv.csv",sep='\t')


In [17]:
print("Dataset Headers:{}\n".format(list(df.columns)))
print("Dataset shape:{}\n".format(df.shape))
df.dtypes

Dataset Headers:['Country_Code', 'Country', 'Pollutant_name', 'Format_name', 'Sector_code', 'Year', 'Emissions', 'Unit', 'Notation', 'VersionId', 'Parent_sector_code', 'Sector_name']

Dataset shape:(3933696, 12)


Country_Code           object
Country                object
Pollutant_name         object
Format_name            object
Sector_code            object
Year                    int64
Emissions             float64
Unit                   object
Notation              float64
VersionId             float64
Parent_sector_code     object
Sector_name            object
dtype: object

In [8]:
df.head(10)

Unnamed: 0,Country_Code,Country,Pollutant_name,Format_name,Sector_code,Year,Emissions,Unit,Notation,VersionId,Parent_sector_code,Sector_name
0,AT,Austria,As,NFR 2014-1,1A1a,1990,,Mg,,1.0,NATIONAL TOTAL,Public electricity and heat production
1,AT,Austria,As,NFR 2014-1,1A1b,1990,,Mg,,1.0,NATIONAL TOTAL,Petroleum refining
2,AT,Austria,As,NFR 2014-1,1A1c,1990,,Mg,,1.0,NATIONAL TOTAL,Manufacture of solid fuels and other energy in...
3,AT,Austria,As,NFR 2014-1,1A2a,1990,,Mg,,1.0,NATIONAL TOTAL,Stationary combustion in manufacturing industr...
4,AT,Austria,As,NFR 2014-1,1A2b,1990,,Mg,,1.0,NATIONAL TOTAL,Stationary combustion in manufacturing industr...
5,AT,Austria,As,NFR 2014-1,1A2c,1990,,Mg,,1.0,NATIONAL TOTAL,Stationary combustion in manufacturing industr...
6,AT,Austria,As,NFR 2014-1,1A2d,1990,,Mg,,1.0,NATIONAL TOTAL,Stationary combustion in manufacturing industr...
7,AT,Austria,As,NFR 2014-1,1A2e,1990,,Mg,,1.0,NATIONAL TOTAL,Stationary combustion in manufacturing industr...
8,AT,Austria,As,NFR 2014-1,1A2f,1990,,Mg,,1.0,NATIONAL TOTAL,Stationary combustion in manufacturing industr...
9,AT,Austria,As,NFR 2014-1,1A2gvii,1990,,Mg,,1.0,NATIONAL TOTAL,Mobile Combustion in manufacturing industries ...


## Values

### Target/Features

In [27]:
df_features = df.copy()
print(df_features.shape)
print(df_features.columns)
df_features.drop(['Unit'], )
print(df_features.shape)
print(df_features.columns)

(3933696, 12)
Index(['Country_Code', 'Country', 'Pollutant_name', 'Format_name',
       'Sector_code', 'Year', 'Emissions', 'Unit', 'Notation', 'VersionId',
       'Parent_sector_code', 'Sector_name'],
      dtype='object')


KeyError: "['Unit'] not found in axis"

### Unique values

In [20]:
df_dict = {}
print("Nbr unique values")
for col in df.columns:
        attributes = list(df[col].unique())
        df_dict[f'{col} : {len(attributes)}'] = attributes

# Trouver la longueur maximale parmi toutes les listes
length_max = max(len(v) for v in df_dict.values())

# Égaliser les longueurs des listes
for cle, tab in df_dict.items():
    while len(tab) < length_max:
        tab.append(None)

df_unique = pd.DataFrame(df_dict)
df_unique

Nbr unique values


Unnamed: 0,Country_Code : 34,Country : 35,Pollutant_name : 26,Format_name : 2,Sector_code : 141,Year : 32,Emissions : 965145,Unit : 4,Notation : 1,VersionId : 2,Parent_sector_code : 2,Sector_name : 162
0,AT,Austria,As,NFR 2014-1,1A1a,1990.0,,Mg,,1.0,NATIONAL TOTAL,Public electricity and heat production
1,BE,Belgium,BC,NEC NFR-1 sector classification,1A1b,1991.0,2.014674,Gg,,,,Petroleum refining
2,DK,Denmark,benzo(a),,1A1c,1992.0,0.011000,g,,,,Manufacture of solid fuels and other energy in...
3,FI,Finland,benzo(b),,1A2a,1993.0,0.142027,kg,,,,Stationary combustion in manufacturing industr...
4,FR,France,benzo(k),,1A2b,1994.0,0.299309,,,,,Stationary combustion in manufacturing industr...
...,...,...,...,...,...,...,...,...,...,...,...,...
965140,,,,,,,4.762444,,,,,
965141,,,,,,,111.584970,,,,,
965142,,,,,,,80.941009,,,,,
965143,,,,,,,0.000968,,,,,


### Missing values

In [16]:
missing_values_count = df.isnull().sum()
missing_values_count

Country_Code                0
Country                     0
Pollutant_name              0
Format_name                 0
Sector_code                 0
Year                        0
Emissions             1514137
Unit                        0
Notation              3933696
VersionId              117312
Parent_sector_code     446784
Sector_name            117312
dtype: int64

Faire un pourcentage de missing values