In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

I went to following link: https://evolgeniusteam.github.io/gmrepodocumentation/usage/downloaddatafromgmrepo/

I download Processed runs (samples_loaded.txt here) that has both uid and sample id which help to map samples to relative abundance profile

I downloaded relative abundances (species_abundance.txt here) that has relative abundance for each taxon for eaxh sample. I made relative abundance profile based on it

I downloaded All runs (samlpe_to_run_info.txt here) that has metadata for each sample. 



### whole data preprocessing and filtering

In [18]:
# Set the display option to show all rows
pd.set_option('display.max_rows', None)
pd.set_option('display.min_rows', None)

In [6]:
relative_abundance = pd.read_csv('species_abundance.txt', delimiter='\t')  # Adjust the delimiter as needed

In [7]:
relative_abundance.shape

(8283371, 4)

In [4]:
print(relative_abundance.columns)

Index(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level',
       'relative_abundance'],
      dtype='object')


In [5]:
relative_abundance_df = relative_abundance.pivot_table(index='loaded_uid', columns='ncbi_taxon_id', values='relative_abundance', fill_value=0)


In [6]:
relative_abundance_df

ncbi_taxon_id,-1,6,7,9,10,11,13,14,16,17,...,2212691,2212731,2282523,2282740,2282741,2282742,2304691,2304692,2339232,2529408
loaded_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.503245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.210800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.496660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.832720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,5.147945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52855,14.686035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52856,2.268080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52857,3.817365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52858,46.556050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Drop the column named '-1'
relative_abundance_df = relative_abundance_df.drop(columns=[-1])

# Remove columns that have less than 2 non-zero values
relative_abundance_df = relative_abundance_df.loc[:, (relative_abundance_df != 0).sum(axis=0) >= 2]



In [8]:
relative_abundance_df

ncbi_taxon_id,6,7,9,10,11,13,14,16,17,18,...,2211641,2212691,2212731,2282523,2282740,2282741,2282742,2304691,2304692,2529408
loaded_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
relative_abundance_df.to_csv('relative_abundance.csv')

### metadata

In [3]:
relative_abundance_df = pd.read_csv('relative_abundance.csv', index_col = False)

In [4]:
metadata = pd.read_csv('sample_to_run_info.txt', delimiter='\t')

  metadata = pd.read_csv('sample_to_run_info.txt', delimiter='\t')


In [5]:
metadata

Unnamed: 0,checking,project_id,our_project_id,sample_name,original_sample_description,curated_sample_description,run_id,sample_id,second_sample_id,experiment_type,...,sex,host_age,diet,longitude,latitude,BMI,Recent.Antibiotics.Use,antibiotics_used,Antibiotics.Dose,Days.Without.Antibiotics.Use
0,1,PRJDB4360,,Japanese001,Healthy Japanese gut microbiota,,DRR048993,,SAMD00042843,Amplicon,...,Female,28.0,,139.680,35.4800,,,,,
1,1,PRJDB4360,,Japanese002,Healthy Japanese gut microbiota,,DRR048994,,SAMD00042844,Amplicon,...,Male,31.0,,139.680,35.4800,,,,,
2,1,PRJDB4360,,Japanese003,Healthy Japanese gut microbiota,,DRR048995,,SAMD00042845,Amplicon,...,Male,60.0,,139.680,35.4800,,,,,
3,1,PRJDB4360,,Japanese004,Healthy Japanese gut microbiota,,DRR048996,,SAMD00042846,Amplicon,...,Male,57.0,,139.680,35.4800,,,,,
4,1,PRJDB4360,,Japanese005,Healthy Japanese gut microbiota,,DRR048997,,SAMD00042847,Amplicon,...,Male,29.0,,139.680,35.4800,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66575,1,PRJNA215106,,F34_CY,Functional metagenomic selections were perform...,,SRR961853,SAMN02317178,SAMN02317178,Metagenomics,...,,0.7,,-122.268,47.6614,,,,,
66576,1,PRJNA215106,,F34_PE,Functional metagenomic selections were perform...,,SRR961854,SAMN02317178,SAMN02317178,Metagenomics,...,,0.7,,-122.268,47.6614,,,,,
66577,1,PRJNA215106,,F34_TE,Functional metagenomic selections were perform...,,SRR961855,SAMN02317178,SAMN02317178,Metagenomics,...,,0.7,,-122.268,47.6614,,,,,
66578,1,PRJNA215106,,F34_TR,Functional metagenomic selections were perform...,,SRR961856,SAMN02317178,SAMN02317178,Metagenomics,...,,0.7,,-122.268,47.6614,,,,,


In [6]:
uid_map = pd.read_csv('samples_loaded.txt', delimiter = '\t')

In [7]:
uid_map

Unnamed: 0,uid,accession_id,data_type,tool_used,results_version,last_updated,QCStatus,QCMessage
0,1,ERR525949,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:56:38,1,
1,2,SRR413772,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:58:51,1,
2,3,ERR526058,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,
3,4,ERR209092,metagenomics,metaphlan ver2.0,0.1,2019-01-01 00:00:00,0,a single taxon unknown account for 100 percent...
4,5,ERR209066,metagenomics,metaphlan ver2.0,0.1,2019-01-01 00:00:00,0,a single taxon unknown account for 100 percent...
...,...,...,...,...,...,...,...,...
52628,52855,SRR5548979,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,
52629,52856,SRR5548988,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,
52630,52857,SRR5548952,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,
52631,52858,SRR5548909,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,


In [8]:
merged_metadata = pd.merge(uid_map, metadata, left_on='accession_id', right_on='run_id', how='inner')

In [9]:
merged_metadata

Unnamed: 0,uid,accession_id,data_type,tool_used,results_version,last_updated,QCStatus,QCMessage,checking,project_id,...,sex,host_age,diet,longitude,latitude,BMI,Recent.Antibiotics.Use,antibiotics_used,Antibiotics.Dose,Days.Without.Antibiotics.Use
0,1,ERR525949,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:56:38,1,,1,PRJEB6456,...,,,,,,,,,,
1,2,SRR413772,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:58:51,1,,1,PRJNA422434,...,Female,45.0,,,,,N,,,2 months
2,3,ERR526058,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB6456,...,,,,,,,,,,
3,6,ERR209254,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB1220,...,,,,12.5683,55.6761,,,,,
4,7,ERR525843,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB6456,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53303,52855,SRR5548979,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
53304,52856,SRR5548988,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
53305,52857,SRR5548952,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
53306,52858,SRR5548909,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,


There are some samples that has more than one phenotype. For now, I'm filter these sample out from metadata but later we can keep them and do following instead: 

You need to transform your target labels into a multi-label format. For example, instead of having a single class label per sample, you represent the labels as a binary vector where each position corresponds to a phenotype, and 1 indicates the presence of that phenotype.

In [17]:
# # Find duplicate `uid`s
# duplicate_uids = merged_metadata[merged_metadata.duplicated('uid', keep=False)]['uid'].unique()

# # Drop rows with those duplicate `uid`s
# merged_metadata_cleaned = merged_metadata[~merged_metadata['uid'].isin(duplicate_uids)]

In [10]:
# Step 1: Find common UIDs between the two DataFrames
common_uids = set(relative_abundance_df['loaded_uid']).intersection(set(merged_metadata['uid']))

print(len(common_uids))

# Step 2: Filter both DataFrames to keep only the rows with common UIDs
filtered_relative_abundance = relative_abundance_df[relative_abundance_df['loaded_uid'].isin(common_uids)]
filtered_metadata = merged_metadata[merged_metadata['uid'].isin(common_uids)]

# Step 3: Sort both DataFrames by the same order of UIDs
# First, sort the common_uids list to define the order
common_uids_sorted = sorted(common_uids)

# Reindex both DataFrames to ensure they follow the same order
filtered_relative_abundance = filtered_relative_abundance.set_index('loaded_uid').loc[common_uids_sorted].reset_index()
filtered_metadata = filtered_metadata.set_index('uid').loc[common_uids_sorted].reset_index()

28189


In [11]:
filtered_relative_abundance

Unnamed: 0,loaded_uid,6,7,9,10,11,13,14,16,17,...,2211641,2212691,2212731,2282523,2282740,2282741,2282742,2304691,2304692,2529408
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28184,52855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28185,52856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28186,52857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28187,52858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
filtered_metadata

Unnamed: 0,uid,accession_id,data_type,tool_used,results_version,last_updated,QCStatus,QCMessage,checking,project_id,...,sex,host_age,diet,longitude,latitude,BMI,Recent.Antibiotics.Use,antibiotics_used,Antibiotics.Dose,Days.Without.Antibiotics.Use
0,1,ERR525949,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:56:38,1,,1,PRJEB6456,...,,,,,,,,,,
1,2,SRR413772,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:58:51,1,,1,PRJNA422434,...,Female,45.0,,,,,N,,,2 months
2,3,ERR526058,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB6456,...,,,,,,,,,,
3,6,ERR209254,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB1220,...,,,,12.5683,55.6761,,,,,
4,7,ERR525843,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB6456,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31583,52855,SRR5548979,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
31584,52856,SRR5548988,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
31585,52857,SRR5548952,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
31586,52858,SRR5548909,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,


In [13]:
filtered_relative_abundance.to_csv('filtered_relative_abundance.csv', index = False)

In [14]:
filtered_metadata.to_csv('filtered_metadata.csv', index=False)

### IBD data filtering 

In [15]:
filtered_metadata = pd.read_csv('filtered_metadata.csv')

In [16]:
filtered_metadata

Unnamed: 0,uid,accession_id,data_type,tool_used,results_version,last_updated,QCStatus,QCMessage,checking,project_id,...,sex,host_age,diet,longitude,latitude,BMI,Recent.Antibiotics.Use,antibiotics_used,Antibiotics.Dose,Days.Without.Antibiotics.Use
0,1,ERR525949,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:56:38,1,,1,PRJEB6456,...,,,,,,,,,,
1,2,SRR413772,metagenomics,metaphlan ver2.0,0.1,2018-06-22 15:58:51,1,,1,PRJNA422434,...,Female,45.0,,,,,N,,,2 months
2,3,ERR526058,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB6456,...,,,,,,,,,,
3,6,ERR209254,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB1220,...,,,,12.5683,55.6761,,,,,
4,7,ERR525843,metagenomics,metaphlan ver2.0,0.1,2018-06-22 16:01:07,1,,1,PRJEB6456,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31583,52855,SRR5548979,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
31584,52856,SRR5548988,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
31585,52857,SRR5548952,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,
31586,52858,SRR5548909,metagenomics,metaphlan ver2.0,0.1,2019-06-21 16:04:46,1,,1,PRJNA50637,...,Female,,,,,,,,,


I found IBD projects on GMrepo following link: https://gmrepo.humangut.info/phenotypes/D043183

In [17]:
IBD_projects = ['PRJEB11419', 'PRJNA268708', 'PRJNA46339', 'PRJNA302437', 'PRJNA392762', 'PRJNA386442', 'PRJDB5442', 'PRJNA373876', 'PRJNA524547' ]

In [18]:
Filtered_IBD_projects_metadata = filtered_metadata[filtered_metadata['project_id'].isin(IBD_projects)]

In [19]:
Filtered_IBD_projects_metadata['project_id'].value_counts()

project_id
PRJEB11419     7618
PRJNA268708     290
PRJNA392762      91
PRJNA386442      48
PRJNA46339       48
PRJNA302437       8
Name: count, dtype: int64

In [20]:
diseases = ['D043183', 'D006262']

In [21]:
Filtered_IBD_projects_disease_metadata = Filtered_IBD_projects_metadata[Filtered_IBD_projects_metadata['disease'].isin(diseases)]

In [22]:
Filtered_IBD_projects_disease_metadata['project_id'].value_counts()

project_id
PRJEB11419     1956
PRJNA268708     290
PRJNA392762      91
PRJNA386442      48
PRJNA46339       48
PRJNA302437       8
Name: count, dtype: int64

In [23]:
filtered_relative_abundance = pd.read_csv('filtered_relative_abundance.csv', index_col = False)

In [24]:
# Step 1: Find common UIDs between the two DataFrames
common_uids_IBD = set(Filtered_IBD_projects_disease_metadata['uid'])

print(len(common_uids_IBD))

# Step 2: Filter both DataFrames to keep only the rows with common UIDs
filtered_relative_abundance_IBD = filtered_relative_abundance[filtered_relative_abundance['loaded_uid'].isin(common_uids_IBD)]


# Step 3: Sort both DataFrames by the same order of UIDs
# First, sort the common_uids list to define the order
common_uids_IBD_sorted = sorted(common_uids_IBD)

# Reindex both DataFrames to ensure they follow the same order
filtered_relative_abundance_IBD = filtered_relative_abundance_IBD.set_index('loaded_uid').loc[common_uids_IBD_sorted].reset_index()
filtered_metadata_IBD = Filtered_IBD_projects_disease_metadata.set_index('uid').loc[common_uids_IBD_sorted].reset_index()

2441


In [25]:
filtered_metadata_IBD.to_csv('metadata_IBD.csv', index = False)

In [26]:
# Remove columns where all values are 0
filtered_relative_abundance_IBD = filtered_relative_abundance_IBD.loc[:, (filtered_relative_abundance_IBD != 0).any(axis=0)]

In [27]:
filtered_relative_abundance_IBD.to_csv('relative_abundance_IBD.csv', index = False)

### IBD train/test

In [4]:
metadata_IBD = pd.read_csv('metadata_IBD.csv')

In [5]:
metadata_IBD['project_id'].value_counts()

project_id
PRJEB11419     1956
PRJNA268708     290
PRJNA392762      91
PRJNA386442      48
PRJNA46339       48
PRJNA302437       8
Name: count, dtype: int64

In [6]:
metadata_IBD = metadata_IBD[metadata_IBD['experiment_type']=='Amplicon']

In [8]:
metadata_IBD = metadata_IBD[(metadata_IBD['host_age'].isnull()) | (metadata_IBD['host_age'] >= 1)]


In [10]:
metadata_IBD = metadata_IBD[(metadata_IBD['BMI'].isnull()) | (metadata_IBD['BMI'] <= 60)]


In [11]:
metadata_IBD = metadata_IBD[(metadata_IBD['BMI'].isnull()) | (metadata_IBD['BMI'] >= 14)]

In [13]:
metadata_IBD['project_id'].value_counts()

project_id
PRJEB11419     1577
PRJNA268708     290
PRJNA392762      91
PRJNA386442      48
PRJNA46339       48
PRJNA302437       8
Name: count, dtype: int64

In [14]:
IBD_train_df = metadata_IBD[metadata_IBD['project_id'] == 'PRJEB11419']

# Create the testing DataFrame where project_id is not 'PRJEB11419'
IBD_test_df = metadata_IBD[metadata_IBD['project_id'] != 'PRJEB11419']

In [15]:
IBD_train_df.shape

(1577, 38)

In [16]:
IBD_train_df.describe()

Unnamed: 0,uid,results_version,QCStatus,QCMessage,checking,our_project_id,curated_sample_description,nr_reads_sequenced,disease_stage,host_age,longitude,latitude,BMI,antibiotics_used,Antibiotics.Dose,Days.Without.Antibiotics.Use
count,1577.0,1577.0,1577.0,0.0,1577.0,0.0,0.0,1577.0,0.0,1577.0,1577.0,1577.0,1577.0,0.0,0.0,1567.0
mean,38610.783767,1.0,1.0,,3.0,,,41110.20038,,42.262524,-63.777869,38.906785,23.482467,,,328.326739
std,4082.656993,0.0,0.0,,0.0,,,40010.274448,,17.648164,57.159991,15.021576,4.486924,,,86.876069
min,34044.0,1.0,1.0,,3.0,,,20889.0,,1.0,-157.8,-45.8,14.04,,,7.0
25%,36576.0,1.0,1.0,,3.0,,,26385.0,,31.0,-117.0,33.5,20.73,,,365.0
50%,36977.0,1.0,1.0,,3.0,,,30964.0,,42.0,-78.9,40.7,23.18,,,365.0
75%,37385.0,1.0,1.0,,3.0,,,38163.0,,57.0,-1.9,50.4,25.52,,,365.0
max,47508.0,1.0,1.0,,3.0,,,580153.0,,82.0,174.9,62.7,53.21,,,365.0


In [17]:
IBD_test_df.shape

(485, 38)

In [18]:
relative_abundance_IBD = pd.read_csv('relative_abundance_IBD.csv')

In [19]:
relative_abundance_IBD.shape

(2441, 5193)

In [20]:
# Step 1: Find common UIDs between the two DataFrames
train_uids = IBD_train_df['uid']
test_uids = IBD_test_df['uid']


# Step 2: Filter both DataFrames to keep only the rows with common UIDs
train_relative_abundance_IBD = relative_abundance_IBD[relative_abundance_IBD['loaded_uid'].isin(train_uids)]
test_relative_abundance_IBD = relative_abundance_IBD[relative_abundance_IBD['loaded_uid'].isin(test_uids)]

# Step 3: Sort both DataFrames by the same order of UIDs
# First, sort the common_uids list to define the order
train_uids_sorted = sorted(train_uids)
test_uids_sorted = sorted(test_uids)

# Reindex both DataFrames to ensure they follow the same order
train_relative_abundance_IBD = train_relative_abundance_IBD.set_index('loaded_uid').loc[train_uids_sorted].reset_index()
train_metadata_IBD = IBD_train_df.set_index('uid').loc[train_uids_sorted].reset_index()

test_relative_abundance_IBD = test_relative_abundance_IBD.set_index('loaded_uid').loc[test_uids_sorted].reset_index()
test_metadata_IBD = IBD_test_df.set_index('uid').loc[test_uids_sorted].reset_index()

In [21]:
train_metadata_IBD.to_csv('train_metadata_IBD.csv', index=False)

In [22]:
train_relative_abundance_IBD.to_csv('train_relative_abundance_IBD.csv', index=False)

In [23]:
test_metadata_IBD.to_csv('test_metadata_IBD.csv', index=False)

In [24]:
test_relative_abundance_IBD.to_csv('test_relative_abundance_IBD.csv', index=False)

### Balanced train dataset

In [2]:
train_metadata_IBD = pd.read_csv('train_metadata_IBD.csv')

In [3]:
import pandas as pd

# Assume 'target_column' contains the two values (e.g., 0 and 1)
value_counts = train_metadata_IBD['disease'].value_counts()

# Identify the minimum class count
min_count = value_counts.min()

# Undersample the larger class
balanced_train_metadata_IBD = train_metadata_IBD.groupby('disease').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [6]:
train_relative_abundance_IBD = pd.read_csv('train_relative_abundance_IBD.csv')

In [7]:
# Step 1: Find common UIDs between the two DataFrames
common_uids = set(balanced_train_metadata_IBD['uid'])

print(len(common_uids))

# Step 2: Filter both DataFrames to keep only the rows with common UIDs
balanced_train_relative_abundance_IBD = train_relative_abundance_IBD[train_relative_abundance_IBD['loaded_uid'].isin(common_uids)]


# Step 3: Sort both DataFrames by the same order of UIDs
# First, sort the common_uids list to define the order
common_uids_sorted = sorted(common_uids)

# Reindex both DataFrames to ensure they follow the same order
balanced_train_relative_abundance_IBD = balanced_train_relative_abundance_IBD.set_index('loaded_uid').loc[common_uids_sorted].reset_index()
balanced_train_metadata_IBD = balanced_train_metadata_IBD.set_index('uid').loc[common_uids_sorted].reset_index()

880


In [8]:
balanced_train_relative_abundance_IBD.to_csv('train_relative_abundance_IBD_balanced.csv', index=False)

In [9]:
balanced_train_metadata_IBD.to_csv('train_metadata_IBD_balanced.csv', index=False)