# **Execute the data integration and data processing steps to produce the integrated EF and recommended EF tables**

**1. Setting up the NEIVA databases in the Colab environment.**

In [None]:
!pip install mysql-connector-python # Install the necessary package to connect Python with MySQL databases.
!pip install pubchempy
!pip install pymysql
!apt-get -y install mysql-server    # Install the MySQL server on the Colab environment.
!service mysql start                # With MySQL install, this starts the server.

# Setting the password. Here 'root' is used as password.

!mysql -e "ALTER USER 'root'@'localhost' IDENTIFIED WITH 'mysql_native_password' BY 'root';FLUSH PRIVILEGES;"

In [2]:
# Remove any existing local copy of the NEIVA GitHub repository.
!rm -rf NEIVA

In [None]:
# Download (clone) the NEIVA GitHub repository.
!git clone https://github.com/NEIVA-BB-emissions-Inventory/NEIVA.git

In [4]:
# List the contents of the repository's data directory.
!ls NEIVA/data

backend_db.sql	CSV_files  legacy_db.sql  neiva_output_db.sql  primary_db.sql  raw_db.sql


In [None]:
!mysql -u root -proot -e "CREATE DATABASE IF NOT EXISTS backend_db"
!mysql -u root -proot backend_db < NEIVA/data/backend_db.sql
!mysql -u root -proot -e "CREATE DATABASE IF NOT EXISTS legacy_db"
!mysql -u root -proot legacy_db < NEIVA/data/legacy_db.sql
!mysql -u root -proot -e "CREATE DATABASE IF NOT EXISTS neiva_output_db"
!mysql -u root -proot neiva_output_db < NEIVA/data/neiva_output_db.sql
!mysql -u root -proot -e "CREATE DATABASE IF NOT EXISTS primary_db"
!mysql -u root -proot primary_db < NEIVA/data/primary_db.sql
!mysql -u root -proot -e "CREATE DATABASE IF NOT EXISTS raw_db"
!mysql -u root -proot raw_db < NEIVA/data/raw_db.sql

**2. Importing the neivapy package and using its functions to execute the data integration and data processing steps**

In [8]:
import NEIVA.neivapy as nv

In [9]:
from sqlalchemy import text
import pandas as pd
import warnings
# Ignore warnings of category UserWarning from this point on.
warnings.simplefilter("ignore", UserWarning)

**2.1 Executing the data integration step using the 'integrate_tables' function.**

In [10]:
# The pdb database tables are integrated into a single table.
int_df=nv.integrate_tables()

0           pdb_akagi11_boreal_forest
1               pdb_akagi11_chaparral
2        pdb_akagi11_charcoal_burning
3         pdb_akagi11_charcoal_making
4               pdb_akagi11_cookstove
5            pdb_akagi11_crop_residue
6            pdb_akagi11_dung_burning
7         pdb_akagi11_garbage_burning
8            pdb_akagi11_open_cooking
9     pdb_akagi11_pasture_maintenance
10                   pdb_akagi11_peat
11                pdb_akagi11_savanna
12       pdb_akagi11_temperate_forest
13        pdb_akagi11_tropical_forest
14                    pdb_bf_hayden22
15                       pdb_coffey17
16                    pdb_cr_holder17
17                     pdb_cr_lasko18
18                       pdb_cr_liu16
19                      pdb_fleming18
20                  pdb_gb_yokelson13
21                        pdb_goetz18
22                        pdb_hatch15
23                        pdb_hatch17
24                   pdb_jayarathne14
25                   pdb_jayarathne18
26          

In [11]:
# Display the integrated EF dataset produced by the previous step.
int_df

Unnamed: 0,mm,formula,compound,pollutant_category,EF_engelmann_spruce_koss18,EF_manzanita_uncontaminated_koss18,EF_subalpine_fir_koss18,EF_loblolly_pine_koss18,EF_ceanothus_koss18,EF_jeffrey_pine_koss18,...,EF_rice_travis23,EF_soybean_travis23,EF_winter_wheat_travis23,EF_slash_travis23,EF_pile_travis23,EF_shrubland_travis23,EF_grassland_travis23,EF_blackwater_travis23,EF_rondonia_hodgson18,EF_tocantins_cerrado_hodgson18
0,44.0,CO2,carbon dioxide,inorganic gas,1644.614200,1698.458200,1659.7898,1717.0291,1718.8532,1511.849800,...,1378.000000,1376.000000,1461.000,1655.000000,1684.000000,1586.000,1506.000000,1719.000000,1447.0,1711.00
1,28.0,CO,carbon monoxide,inorganic gas,69.418100,40.619400,72.8033,87.4088,64.3124,134.815400,...,85.000000,86.000000,38.000,125.000000,108.000000,65.000,113.000000,69.000000,237.0,74.00
2,17.0,NH3,ammonia,inorganic gas,1.570760,0.431127,1.4925,0.3994,1.0744,1.370167,...,,,,,,,,,,
3,26.0,C2H2,acetylene,NMOC_g,0.286264,0.121084,0.5796,0.2121,0.2239,0.500273,...,0.318000,0.272000,,0.162000,0.211000,0.188,0.971000,0.272000,,
4,27.0,HCN,HCN,NMOC_g,0.434508,0.062507,0.4739,0.1619,0.2200,0.734078,...,0.477000,0.295000,0.329,0.229000,0.399000,0.126,0.786000,0.239000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1380,169.0,C7H7NO4,Nitromethylcatechol,NMOC_g,,,,,,,...,0.008000,0.010000,,0.005000,0.005000,0.008,0.007000,0.004000,,
1381,93.0,NO2Cl,Nitryl chloride,inorganic gas,,,,,,,...,0.000164,0.000059,,0.000036,0.000275,,0.000103,0.000189,,
1382,106.0,C2H3O2Cl,Chloroacetic acid,inorganic gas,,,,,,,...,0.000182,0.000056,,,0.000394,,0.000267,,,
1383,155.0,C6H5NO4,4-Nitrocatechol,NMOC_g,,,,,,,...,0.056000,0.016000,,0.059000,0.042000,0.085,0.043000,0.127000,,


In [12]:
# Extract the NMOC_g (non-methane organic compounds, gas-phase) data frame
# from the integrated dataset, then pass it through assign_study_column.
# NOTE(review): assign_study_column presumably adds the 'study' column that
# appears in the merge log of the next step — confirm in neivapy.
nmogdf=nv.fetch_nmog(int_df)
nmogdf=nv.assign_study_column(nmogdf)

Non-Methane Organic Compounds Gas-Phase (NMOC_g) Data Frame: [ROW, COLUMN] =[1189 262]


In [13]:
# Similar lumped compounds that share a formula are merged.
# The call returns two objects, bound here to r_iddf and iddf.
r_iddf, iddf =nv.merge_lumped_compound_same_formula(nmogdf)

______________________________________________________________________________________________________________
Formula-C4H8 Merged compounds-
                      compound     study
0  Butenes + other hydrocarbon    koss18
1         Isobutene + 1-Butene  permar21
2                      Butenes  permar21
______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
Formula-C4H7N Merged compounds-
                          compound            study
0  Dihydropyrrole + butane nitrile  permar21,koss18
1                          unknown         hayden22
______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
Formula-C5H10 Merged compounds-
                  compound    

In [14]:
# Feed the merge results (r_iddf, iddf) back into the NMOC_g data frame.
# NOTE(review): the exact insert/replace semantics live in neivapy — confirm there.
nmogdf=nv.insert_rdf_nmogdf(nmogdf,r_iddf,iddf)

Length of NMOC_g dataset: 1115


In [15]:
# Align lumped compounds with their speciated components; the function prints
# a log describing the steps it executes.
# NOTE(review): lc_spec_df is not referenced again in the cells visible here.
lc_spec_df=nv.sync_lumped_compound_and_speciation(nmogdf)

***************************************************************************************
The following steps are executed:
1. Split a lumped compound into individual components, assign id to the componenets
2. Search the ids within the integrated dataset
3. Align the lumped compound and speciatied compounds if found.
***************************************************************************************
Lumped compound- Ethylamine + dimethylamine
Assigned id:  Ethylamine
Assigned id:  dimethylamine
__________________________________________________________________
Lumped compound- 1,3-Butadiene + 1,2-Butadiene
Assigned id:  1,3-Butadiene
Assigned id:  1,2-Butadiene
All individual ids are found in the Integrated Dataset
__________________________________________________________________
Lumped compound- 1-butyne + 2-butyne
Assigned id:  1-butyne
Assigned id:  2-butyne
All individual ids are found in the Integrated Dataset
__________________________________________________________________


In [38]:
# Data sort: each group is passed through its own sort helper, then the
# integrated dataset is rebuilt by stacking igdf, nmogdf and pmdf (in that
# order) under a fresh 0..n-1 index.
# NOTE(review): this cell overwrites nmogdf and int_df while also reading them,
# so re-running it on its own feeds already-processed frames back in. Run the
# notebook top to bottom from a fresh kernel.
nmogdf=nv.sort_nmog_data(nmogdf)
igdf=nv.sort_inorganic_gas_data(int_df)
pmdf=nv.sort_particulate_matter_data(int_df)
int_df = pd.concat([igdf, nmogdf, pmdf], ignore_index=True)

In [19]:
# Get a connection object for the 'backend_db' database via neivapy.
bk_db=nv.connect_db('backend_db')

In [20]:
# Read every row and column of the bkdb_info_efcol table into a DataFrame.
efcoldf=pd.read_sql(text('select * from bkdb_info_efcol'), con=bk_db)

In [21]:
# NOTE(review): presumably expresses NOx emission factors as NO, per the
# function name — confirm in neivapy. The result replaces int_df.
int_df=nv.calc_NOx_as_NO (int_df)

In [22]:
# Returns two objects: a new dataset (int_df_2) and a replacement for efcoldf.
# NOTE(review): presumably averages the laboratory-study EF columns, per the
# function name — confirm in neivapy.
int_df_2, efcoldf = nv.calculate_average_lab_study(int_df, efcoldf)

In [23]:
# Only the first element of the returned sequence is kept.
# NOTE(review): presumably adjusts laboratory data to field conditions, per
# the function name — confirm in neivapy what the other elements contain.
int_df_3=nv.lab_data_adjust_to_field_conditions(int_df_2,efcoldf)[0]

In [24]:
# NOTE(review): presumably adds a data-count column to int_df_3, per the
# function name — confirm in neivapy. The result replaces int_df_3.
int_df_3=nv.assign_data_count_column(int_df_3,efcoldf)

In [25]:
# Build the averaged dataset in three chained steps; each step's result
# replaces avgdf.
# NOTE(review): 'calculalate_fractional_contribution' is the spelling used for
# this neivapy function in this notebook — do not "correct" it here.
avgdf = nv.calculate_average_fire_types(int_df_3, efcoldf)
avgdf= nv.calculalate_fractional_contribution(avgdf)
avgdf = nv.round_avg_cols(avgdf)

In [27]:
# Sum of the AVG_temperate_forest column over all rows whose
# pollutant_category is 'NMOC_g', rounded to three decimals.
round(
    avgdf.loc[avgdf['pollutant_category'] == 'NMOC_g', 'AVG_temperate_forest'].sum(),
    3,
)

42.511

In [30]:
# Display the Recommended EF dataset.
avgdf

Unnamed: 0,mm,formula,compound,pollutant_category,AVG_savanna,AVG_boreal_forest,AVG_tropical_forest,AVG_temperate_forest,AVG_peat,AVG_chaparral,...,STD_open_cooking,STD_cookstove,STD_dung_burning,STD_charcoal_making,STD_charcoal_burning,STD_coal_burning,STD_pasture_maintenance,STD_crop_residue,STD_garbage_burning,id
0,2.00,H2,hydrogen,inorganic gas,1.7000,,3.3550,2.0300,1.2170,,...,,,,,,,,,,InChI=1S/H2/h1H
1,16.04,CH4,methane,methane,2.8276,4.7837,4.6814,4.7444,11.0975,2.5719,...,2.7173,3.3023,3.0047,47.9995,1.8798,10.1607,7.4960,1.1257,2.8516,InChI=1S/CH4/h1H4
2,17.00,NH3,ammonia,inorganic gas,0.6592,1.4706,1.3267,1.0625,6.1517,0.9091,...,0.6417,6.6333,2.0359,16.4131,0.5017,1.3881,0.1952,0.5480,0.4244,InChI=1S/H3N/h1H3
3,19.00,HF,hydrogen fluoride,inorganic gas,,,,,,,...,,,,,,0.8704,,,,InChI=1S/FH/h1H
4,28.00,CO,carbon monoxide,inorganic gas,80.9500,100.4800,110.7614,95.9542,224.8939,66.6000,...,19.2269,17.3262,11.4478,52.4708,43.5126,113.5528,84.3172,13.4615,18.8971,InChI=1S/CO/c1-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,,,SSA 532,PM optical property,0.4925,0.9230,,0.6520,0.9913,0.6875,...,,,,,,,,0.0380,,SSA_532
1280,,,SSA 660,PM optical property,0.4675,0.9080,,0.6335,0.9927,0.6550,...,,,,,,,,0.0326,,SSA_660
1281,,,BrC,PM optical property,,,,,,,...,,5.6243,1.6579,,,0.5848,,,,BrC
1282,,,EF Bscat 405 (m2/kg),PM optical property,,,,,,,...,,23.8336,5.8148,,,8.2024,,,,Bscat_405


In [36]:
# Build a VOC profile from avgdf for 'S07' and the 'temperate forest' fire type.
# NOTE(review): 'S07' presumably selects a chemical mechanism — confirm the
# accepted values in neivapy.
pp=nv.voc_profile(avgdf, 'S07', 'temperate forest')

In [37]:
# Display the VOC profile table.
pp

Unnamed: 0,S07,ef,mm,mole,mole_fraction
0,HCHO,2.0374,29.5,0.0696,0.1149
1,MEOH,1.8268,31.0,0.0595,0.0982
2,ETHE,1.361,28.0,0.0492,0.0812
3,AACD,2.5141,60.0,0.0425,0.0701
4,OLE2,4.015626,95.970414,0.0424,0.07
5,CCHO,2.1332,52.0,0.0416,0.0686
6,CRES,3.7832,126.0,0.0306,0.0505
7,IPRD,1.865162,87.151515,0.022,0.0363
8,MVK,1.6784,87.0,0.0199,0.0328
9,ARO2,2.2642,121.063291,0.0193,0.0318
