# Cross validation of the SNP-Gene-Bacteria data in 2 cities

## Step 1: Data preprocessing

### 1.1 Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


### 1.2 Load the data

Load the full dataset and check the first few rows of the data.

> The data folder is too large to be included in the repository; see the [README](https://github.com/Lucas04-nhr/SNP-Analysis/blob/main/README.md#structure-of-the-result-directory) for the data source.

> At this point the "Bacteria" column still holds numeric IDs; they are replaced with the real species names in section 1.4.

In [2]:
# Read the per-city result tables (BJ and GZ) from the cross_validation folder.
full_data_bj, full_data_gz = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../result/analysis/cross_validation/bac_age_thr20_BJ.csv',
        '../../result/analysis/cross_validation/bac_age_thr20_GZ.csv',
    )
)


In [3]:
# Preview the first five rows of the BJ result table.
full_data_bj.head(n=5)


Unnamed: 0,CHR,SNP,UNADJ,GC,BONF,HOLM,SIDAK_SS,SIDAK_SD,FDR_BH,FDR_BY,Bacteria
0,4,4:8609098,1.052e-70,3e-06,1.1189999999999999e-63,1.1189999999999999e-63,inf,inf,1.1189999999999999e-63,1.874e-62,38
1,20,20:30977589,1.0020000000000001e-69,3e-06,1.065e-62,1.065e-62,inf,inf,5.324e-63,8.920000000000001e-62,38
2,18,18:64718990,1.591e-65,1.2e-05,1.691e-58,1.691e-58,inf,inf,5.638e-59,9.447e-58,38
3,HG1523_PATCH,HG1523_PATCH:43470,1.057e-65,1e-06,1.123e-58,1.123e-58,inf,inf,7.891e-59,1.322e-57,17
4,HSCHR1_4_CTG3,HSCHR1_4_CTG3:45233,2.053e-65,9e-06,2.182e-58,2.182e-58,inf,inf,7.891e-59,1.322e-57,17


In [4]:
# Preview the first five rows of the GZ result table.
full_data_gz.head(n=5)


Unnamed: 0,CHR,SNP,UNADJ,GC,BONF,HOLM,SIDAK_SS,SIDAK_SD,FDR_BH,FDR_BY,Bacteria
0,18,18:107881,1.282e-52,8e-06,7.951e-46,7.951e-46,inf,inf,7.951e-46,1.289e-44,4
1,GL000225.1,GL000225.1:63186,2.544e-48,3e-06,1.5789999999999999e-41,1.5789999999999999e-41,inf,inf,1.5789999999999999e-41,2.56e-40,45
2,GL000225.1,GL000225.1:63182,5.185e-48,9e-06,3.2169999999999997e-41,3.2169999999999997e-41,inf,inf,1.608e-41,2.609e-40,45
3,HG2069_PATCH,HG2069_PATCH:279669,1.065e-45,3.7e-05,6.609e-39,6.609e-39,inf,inf,2.2030000000000002e-39,3.573e-38,45
4,20,20:31067694,2.478e-45,3.8e-05,1.5379999999999998e-38,1.5379999999999998e-38,inf,inf,3.844e-39,6.234e-38,45


### 1.3 Load the metadata of each bacteria

Load the bacteria metadata (the ID-to-species lookup table) for each city and check the first few rows.

In [5]:
# Read the per-city bacteria lookup tables (BJ and GZ) from the top_bacteria folder.
bac_bj, bac_gz = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../result/analysis/top_bacteria/bacteria_BJ.csv',
        '../../result/analysis/top_bacteria/bacteria_GZ.csv',
    )
)


In [6]:
# Preview the first five rows of the BJ bacteria lookup table.
bac_bj.head(n=5)


Unnamed: 0,No.,Species
0,1,Actinomyces oris
1,2,Aeromonas caviae
2,3,Alloprevotella sp015259235
3,4,Anaerococcus nagyae
4,5,Corynebacterium kefirresidentii


In [7]:
# Preview the first five rows of the GZ bacteria lookup table.
bac_gz.head(n=5)


Unnamed: 0,No.,Species
0,1,Acinetobacter baumannii
1,2,Aeromonas caviae
2,3,Anaerococcus nagyae
3,4,Corynebacterium kefirresidentii
4,5,Corynebacterium macginleyi


### 1.4 Replace the "Bacteria" column with the real names

Replace the numeric IDs in the "Bacteria" column with the species names from the metadata tables.

In [8]:
# Replace the numeric bacteria IDs in full_data_bj with the species names from bac_bj.
#
# This cell overwrites full_data_bj, so it is guarded on the column dtype: once
# the IDs have been replaced, 'Bacteria' holds strings, and merging it against
# the integer 'No.' key again would raise. With the guard, re-running the cell
# leaves the already-converted frame untouched.
if pd.api.types.is_numeric_dtype(full_data_bj['Bacteria']):
    full_data_bj = (
        full_data_bj
        # Left join keeps every SNP row; IDs missing from bac_bj get NaN.
        # validate='many_to_one' makes pandas raise if 'No.' is duplicated in
        # bac_bj, which would otherwise silently duplicate SNP rows.
        .merge(
            bac_bj,
            left_on='Bacteria',
            right_on='No.',
            how='left',
            validate='many_to_one',
        )
        # Drop the original Bacteria column and the No. column from bac_bj
        .drop(columns=['Bacteria', 'No.'])
        # Rename the Species column to Bacteria
        .rename(columns={'Species': 'Bacteria'})
    )

# Display the updated dataframe
full_data_bj.head()


Unnamed: 0,CHR,SNP,UNADJ,GC,BONF,HOLM,SIDAK_SS,SIDAK_SD,FDR_BH,FDR_BY,Bacteria
0,4,4:8609098,1.052e-70,3e-06,1.1189999999999999e-63,1.1189999999999999e-63,inf,inf,1.1189999999999999e-63,1.874e-62,Veillonella parvula_A
1,20,20:30977589,1.0020000000000001e-69,3e-06,1.065e-62,1.065e-62,inf,inf,5.324e-63,8.920000000000001e-62,Veillonella parvula_A
2,18,18:64718990,1.591e-65,1.2e-05,1.691e-58,1.691e-58,inf,inf,5.638e-59,9.447e-58,Veillonella parvula_A
3,HG1523_PATCH,HG1523_PATCH:43470,1.057e-65,1e-06,1.123e-58,1.123e-58,inf,inf,7.891e-59,1.322e-57,Porphyromonas pasteri
4,HSCHR1_4_CTG3,HSCHR1_4_CTG3:45233,2.053e-65,9e-06,2.182e-58,2.182e-58,inf,inf,7.891e-59,1.322e-57,Porphyromonas pasteri


In [9]:
# Replace the numeric bacteria IDs in full_data_gz with the species names from bac_gz.
#
# This cell overwrites full_data_gz, so it is guarded on the column dtype: once
# the IDs have been replaced, 'Bacteria' holds strings, and merging it against
# the integer 'No.' key again would raise. With the guard, re-running the cell
# leaves the already-converted frame untouched.
if pd.api.types.is_numeric_dtype(full_data_gz['Bacteria']):
    full_data_gz = (
        full_data_gz
        # Left join keeps every SNP row; IDs missing from bac_gz get NaN.
        # validate='many_to_one' makes pandas raise if 'No.' is duplicated in
        # bac_gz, which would otherwise silently duplicate SNP rows.
        .merge(
            bac_gz,
            left_on='Bacteria',
            right_on='No.',
            how='left',
            validate='many_to_one',
        )
        # Drop the original Bacteria column and the No. column from bac_gz
        .drop(columns=['Bacteria', 'No.'])
        # Rename the Species column to Bacteria
        .rename(columns={'Species': 'Bacteria'})
    )

# Display the updated dataframe
full_data_gz.head()


Unnamed: 0,CHR,SNP,UNADJ,GC,BONF,HOLM,SIDAK_SS,SIDAK_SD,FDR_BH,FDR_BY,Bacteria
0,18,18:107881,1.282e-52,8e-06,7.951e-46,7.951e-46,inf,inf,7.951e-46,1.289e-44,Corynebacterium kefirresidentii
1,GL000225.1,GL000225.1:63186,2.544e-48,3e-06,1.5789999999999999e-41,1.5789999999999999e-41,inf,inf,1.5789999999999999e-41,2.56e-40,Vibrio metschnikovii
2,GL000225.1,GL000225.1:63182,5.185e-48,9e-06,3.2169999999999997e-41,3.2169999999999997e-41,inf,inf,1.608e-41,2.609e-40,Vibrio metschnikovii
3,HG2069_PATCH,HG2069_PATCH:279669,1.065e-45,3.7e-05,6.609e-39,6.609e-39,inf,inf,2.2030000000000002e-39,3.573e-38,Vibrio metschnikovii
4,20,20:31067694,2.478e-45,3.8e-05,1.5379999999999998e-38,1.5379999999999998e-38,inf,inf,3.844e-39,6.234e-38,Vibrio metschnikovii


## Step 2: Data analysis

### 2.1 Check the distribution of the target variable