## General idea: Find out whether genetics influence the microbiome by comparing the samples within monozygotic and dizygotic twin pairs and testing the significance of the differences in beta diversity.

In [2]:
# importing all required packages & notebook extensions at the start of the notebook
import os
import pandas as pd
import qiime2 as q2
from qiime2 import Visualization
import matplotlib.pyplot as plt
%matplotlib inline
from operator import itemgetter
import matplotlib.patches as mpatches
from scipy.stats import shapiro

or_dir = '../data' # original data (demux sequences, metadata); metadata.tsv is read from this directory
data_dir = 'data' # data from polybox (ASV, taxonomy analysis); the feature-table artifact is referenced from this directory


### 1. Separate metadata table for mono- and dizygotic twins and generate a table for each twin individually. (Or maybe for each pair?)

In [3]:
# Load the tab-separated sample metadata and collect every distinct host id
# (order of first appearance in the table).
metadata = pd.read_csv(f'{or_dir}/metadata.tsv', sep='\t')
host_numbers = pd.unique(metadata['host_id'])

host_numbers

array([42.1, 27.2, 28.1, 28.2, 39.2,  8.1,  8.2, 29.1, 40.1, 40.2, 35.1,
       35.2, 47.1, 47.2,  4.1,  4.2, 29.2,  3.1, 30.2, 36.1, 36.2,  6.1,
        6.2, 30.1, 33.1, 33.2, 43.2, 44.1, 44.2, 45.1, 45.2,  5.1, 37.1,
       37.2, 39.1, 46.1,  3.2, 43.1, 42.2, 46.2,  5.2, 27.1, 48.2, 48.1,
       32.1, 32.2, 12.2, 13.2, 14.1, 14.2, 10.1, 10.2, 12.1, 13.1, 15.1,
       15.2, 16.1, 25.1, 25.2, 26.2, 11.1,  2.1,  2.2, 20.1, 20.2, 21.1,
       21.2, 23.1, 23.2, 19.2, 16.2, 17.1, 17.2, 18.1, 18.2, 19.1, 24.2,
       11.2, 24.1, 26.1])

In [4]:
# Build one metadata table per host (individual twin), keyed by host id.
all_hosts = dict()
for host in host_numbers:  # loop through all unique host ids
    new_name = 'df_host_' + str(host)
    # Filter once and take an explicit copy, so that later per-host edits never act on a
    # slice of `metadata` (this is what triggers pandas' SettingWithCopyWarning).
    host_df = metadata[metadata['host_id'] == host].copy()
    all_hosts[host] = host_df
    # Kept only for backward compatibility: the generated names contain a '.'
    # (e.g. 'df_host_42.1'), so they are not valid identifiers -- use all_hosts[host] instead.
    # Written via globals() because modifying the dict returned by locals() is not supported.
    globals()[new_name] = host_df

### 2. Problem: some samples have NaN in `diet_weaning` even though the host had already been weaned at an earlier time point. We need to keep those samples and assign them the status "weaned", and drop all other samples that do not contain any information.

In [53]:
# Show every sample of host 23.1 in chronological order to see where the NaN entries occur.
metadata.loc[metadata['host_id'].eq(23.1)].sort_values('collection_date')

Unnamed: 0,id,Library Layout,Instrument,collection_date,geo_location_name,geo_latitude,geo_longitude,host_id,age_days,weight_kg,...,birth_length_cm,sex,delivery_mode,zygosity,race,ethnicity,delivery_preterm,diet_milk,diet_weaning,age_months
1600,ERR1311612,PAIRED,Illumina MiSeq,2010-06-09 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,36.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,fd,False,1.0
1589,ERR1311616,PAIRED,Illumina MiSeq,2010-07-08 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,65.0,4.763,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,fd,False,2.0
1258,ERR1310030,PAIRED,Illumina MiSeq,2010-11-10 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,190.0,6.804,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,fd,True,6.0
1259,ERR1310031,PAIRED,Illumina MiSeq,2010-12-03 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,213.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,fd,True,7.0
904,ERR1310681,PAIRED,Illumina MiSeq,2011-01-14 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,256.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,,,8.0
905,ERR1310682,PAIRED,Illumina MiSeq,2011-02-17 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,290.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,,,10.0
906,ERR1310683,PAIRED,Illumina MiSeq,2011-03-16 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,316.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,fd,True,10.0
1617,ERR1311611,PAIRED,Illumina MiSeq,2011-05-05 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,366.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,fd,True,12.0
1587,ERR1311614,PAIRED,Illumina MiSeq,2011-07-07 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,428.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,,,14.0
1586,ERR1311613,PAIRED,Illumina MiSeq,2011-08-05 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,23.1,457.0,,...,47.0,male,Vaginal,Dizygotic,Caucasian,Not Hispanic,True,,,15.0


#### The NaN values appear to occur only after weaning, or for hosts that were never weaned at all, but not before weaning. (How do we check whether that is really true?)

#### If so, we can assign True to each NaN value of a host if that host was recorded as weaned at least once, i.e. if all_hosts[host]['diet_weaning'].sum() >= 1:
#### (We discussed this with our tutor and she gave permission to use this assumption)

In [8]:
# For every host that was recorded as weaned at least once, treat its missing
# `diet_weaning` entries as weaned (True). Hosts that were never recorded as weaned
# are left untouched.
for host in host_numbers:
    host_df = all_hosts[host]
    if host_df['diet_weaning'].sum() >= 1:
        # fillna() returns a new Series instead of changing the column in place, so the
        # result has to be stored; assign() builds a new frame with the filled column.
        all_hosts[host] = host_df.assign(
            diet_weaning=host_df['diet_weaning'].fillna(True)
        )
# NOTE(review): only the per-host tables in `all_hosts` are updated here; `metadata`
# itself still contains the NaN values.

#### Check:

In [15]:
# Spot check: one host's samples in chronological order.
# NOTE(review): the rows displayed for host 40.1 have no missing diet_weaning values, so this
# view cannot confirm the NaN fill; host 23.1 (which has NaN rows) would be more telling.
all_hosts[40.1].sort_values('collection_date')

Unnamed: 0,id,Library Layout,Instrument,collection_date,geo_location_name,geo_latitude,geo_longitude,host_id,age_days,weight_kg,...,birth_length_cm,sex,delivery_mode,zygosity,race,ethnicity,delivery_preterm,diet_milk,diet_weaning,age_months
527,ERR1315526,PAIRED,Illumina MiSeq,2011-05-27 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,63.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,False,2.0
564,ERR1315589,PAIRED,Illumina MiSeq,2011-06-28 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,95.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,False,3.0
515,ERR1315536,PAIRED,Illumina MiSeq,2011-07-27 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,124.0,6.974,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,False,4.0
304,ERR1314716,PAIRED,Illumina MiSeq,2011-08-15 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,144.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,5.0
177,ERR1314295,PAIRED,Illumina MiSeq,2011-09-13 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,172.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,6.0
46,ERR1314250,PAIRED,Illumina MiSeq,2011-10-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,200.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,7.0
81,ERR1314054,PAIRED,Illumina MiSeq,2011-11-09 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,229.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,8.0
1575,ERR1313952,PAIRED,Illumina MiSeq,2011-12-08 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,258.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,8.0
1417,ERR1313847,PAIRED,Illumina MiSeq,2012-01-08 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,289.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,9.0
8,ERR1314190,PAIRED,Illumina MiSeq,2012-02-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,324.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,11.0


### 3. Filter feature table according to metadata table

In [None]:
# TODO(review): unfinished command -- the last line ends in a dangling "\" and no filtering
# criterion or output artifact is specified yet. Complete it before running this cell.
! qiime feature-table filter-samples \
--i-table $data_dir/phylogeny_filtered_table.qza \


In [17]:
# Print the CLI help of filter-samples to look up its inputs and parameters.
! qiime feature-table filter-samples --help

Usage: qiime feature-table filter-samples [OPTIONS]

  Filter samples from table based on frequency and/or metadata. Any features
  with a frequency of zero after sample filtering will also be removed. See
  the filtering tutorial on https://docs.qiime2.org for additional details.

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency¹ | RelativeFrequency² |
    PresenceAbsence³ | Composition⁴]
                       The feature table from which samples should be
                       filtered.                                    [required]
Parameters:
  --p-min-frequency INTEGER
                       The minimum total frequency that a sample must have to
                       be retained.                               [default: 0]
  --p-max-frequency INTEGER
                       The maximum total frequency that a sample can have to
                       be retained. If no value is provided t

### 4. Get table for each twin pair and each stage --> find F values for twin column with ANCOM showing differences between individuals

#### Pair the twins:

In [29]:
# A twin pair is identified by the host id rounded to a whole number
# (e.g. 40.1 and 40.2 -> 40.0); keep each pair id once, in order of first appearance.
rounded_host_ids = host_numbers.round(0)
pair_numbers = list(pd.unique(rounded_host_ids))
pair_numbers

[42.0,
 27.0,
 28.0,
 39.0,
 8.0,
 29.0,
 40.0,
 35.0,
 47.0,
 4.0,
 3.0,
 30.0,
 36.0,
 6.0,
 33.0,
 43.0,
 44.0,
 45.0,
 5.0,
 37.0,
 46.0,
 48.0,
 32.0,
 12.0,
 13.0,
 14.0,
 10.0,
 15.0,
 16.0,
 25.0,
 26.0,
 11.0,
 2.0,
 20.0,
 21.0,
 23.0,
 19.0,
 17.0,
 18.0,
 24.0]

In [48]:
# Collect the samples of both twins of a pair in one table, keyed by pair number.
# The pair a sample belongs to is the integer part of its host_id (40.1 and 40.2 -> 40.0).
# Selecting on that integer part avoids the exact float comparison `host_id == pair + 0.1`,
# which silently gives an empty table whenever the floating-point sum differs from the
# stored id in its last bit.
pair_of_sample = metadata['host_id'] // 1
all_pairs = dict()
for pair in pair_numbers:  # loop through all unique pair ids
    # The stable sort puts twin .1 before twin .2 and keeps the original row order within
    # each twin, i.e. the same layout as concatenating the two twins' tables.
    all_pairs[pair] = metadata[pair_of_sample == pair].sort_values(
        by='host_id', kind='mergesort'
    )

In [None]:
#why was this necessary in the previous dictionary loop?
#new_name = 'df_pair_'+str(pair)
#locals()[new_name] = metadata[metadata['host_id']==host]

#### Get tables for each pair and each stage:

In [57]:
# Distinct diet stages (values of `diet_milk`), ignoring samples without a recorded stage.
stages = metadata['diet_milk'].dropna().unique()

# One metadata table per diet stage, keyed by the stage label.
all_stages = dict()

for stage in stages:
    new_name = 'df_stage_' + str(stage)
    # Filter once and reuse the result for both the dictionary and the named variable.
    stage_df = metadata[metadata['diet_milk'] == stage]
    all_stages[stage] = stage_df
    # Kept for backward compatibility with code that uses the df_stage_<label> variables;
    # written via globals() because modifying the dict returned by locals() is not supported.
    globals()[new_name] = stage_df

#### Check:

In [61]:
all_stages['fd']

Unnamed: 0,id,Library Layout,Instrument,collection_date,geo_location_name,geo_latitude,geo_longitude,host_id,age_days,weight_kg,...,birth_length_cm,sex,delivery_mode,zygosity,race,ethnicity,delivery_preterm,diet_milk,diet_weaning,age_months
0,ERR1314182,PAIRED,Illumina MiSeq,2011-11-11 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,42.1,232.0,,...,47.0,male,Cesarean,Monozygotic,Caucasian,Not Hispanic,True,fd,True,8.0
1,ERR1314183,PAIRED,Illumina MiSeq,2010-12-11 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,27.2,192.0,,...,45.0,female,Cesarean,Dizygotic,Caucasian,Hispanic,True,fd,True,6.0
8,ERR1314190,PAIRED,Illumina MiSeq,2012-02-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,324.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,11.0
9,ERR1314191,PAIRED,Illumina MiSeq,2012-02-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.2,325.0,,...,44.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,11.0
13,ERR1314198,PAIRED,Illumina MiSeq,2012-04-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,47.1,305.0,,...,46.0,male,Vaginal,Monozygotic,African-American,Not Hispanic,True,fd,True,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1661,ERR1310702,PAIRED,Illumina MiSeq,2011-04-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,25.2,335.0,9.696,...,48.0,female,Cesarean_emergency,Monozygotic,Caucasian,Hispanic,False,fd,True,11.0
1664,ERR1310705,PAIRED,Illumina MiSeq,2010-07-31 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,26.1,74.0,,...,49.0,female,Vaginal,Dizygotic,Caucasian,Not Hispanic,False,fd,False,2.0
1665,ERR1310707,PAIRED,Illumina MiSeq,2011-02-07 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,27.1,250.0,,...,45.0,female,Cesarean,Dizygotic,Caucasian,Hispanic,True,fd,True,8.0
1666,ERR1310708,PAIRED,Illumina MiSeq,2011-04-09 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,27.1,310.0,,...,45.0,female,Cesarean,Dizygotic,Caucasian,Hispanic,True,fd,True,10.0


#### To get a table with the samples that satisfy both conditions (a given diet stage and a given twin pair), we can inner-merge the corresponding tables from the two dictionaries like this:

In [62]:
# Samples of twin pair 40 taken during the 'fd' stage: rows present in both tables.
pd.merge(all_stages['fd'], all_pairs[40], how='inner')

Unnamed: 0,id,Library Layout,Instrument,collection_date,geo_location_name,geo_latitude,geo_longitude,host_id,age_days,weight_kg,...,birth_length_cm,sex,delivery_mode,zygosity,race,ethnicity,delivery_preterm,diet_milk,diet_weaning,age_months
0,ERR1314190,PAIRED,Illumina MiSeq,2012-02-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,324.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,11.0
1,ERR1314191,PAIRED,Illumina MiSeq,2012-02-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.2,325.0,,...,44.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,11.0
2,ERR1314250,PAIRED,Illumina MiSeq,2011-10-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,200.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,7.0
3,ERR1314054,PAIRED,Illumina MiSeq,2011-11-09 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,229.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,8.0
4,ERR1314055,PAIRED,Illumina MiSeq,2011-11-09 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.2,230.0,,...,44.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,8.0
5,ERR1314557,PAIRED,Illumina MiSeq,2011-09-14 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.2,173.0,,...,44.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,6.0
6,ERR1314295,PAIRED,Illumina MiSeq,2011-09-13 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,172.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,6.0
7,ERR1314716,PAIRED,Illumina MiSeq,2011-08-15 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.1,144.0,,...,49.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,5.0
8,ERR1314717,PAIRED,Illumina MiSeq,2011-08-15 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.2,143.0,,...,44.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,5.0
9,ERR1315635,PAIRED,Illumina MiSeq,2011-07-29 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,40.2,126.0,5.273,...,44.0,female,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,False,4.0


#### That way we can generate a new dictionary, keyed by (pair, stage), from which we can look up the samples of any pair at a certain diet stage:

In [63]:
# Samples of every twin pair at every diet stage, keyed by the tuple (pair, stage).
# Each entry holds the rows common to the stage table and the pair table.
pair_stages = {
    (pair, stage): all_stages[stage].merge(all_pairs[pair], how='inner')
    for pair in pair_numbers
    for stage in stages
}

#### Check:

In [67]:
pair_stages[3, 'fd']

Unnamed: 0,id,Library Layout,Instrument,collection_date,geo_location_name,geo_latitude,geo_longitude,host_id,age_days,weight_kg,...,birth_length_cm,sex,delivery_mode,zygosity,race,ethnicity,delivery_preterm,diet_milk,diet_weaning,age_months
0,ERR1314203,PAIRED,Illumina MiSeq,2010-06-12 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.1,151.0,,...,48.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,5.0
1,ERR1314448,PAIRED,Illumina MiSeq,2010-12-13 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.2,336.0,,...,49.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,11.0
2,ERR1314828,PAIRED,Illumina MiSeq,2010-04-17 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.1,95.0,,...,48.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,False,3.0
3,ERR1314888,PAIRED,Illumina MiSeq,2010-04-18 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.2,96.0,,...,49.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,False,3.0
4,ERR1314844,PAIRED,Illumina MiSeq,2010-06-17 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.2,156.0,,...,49.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,5.0
5,ERR1314860,PAIRED,Illumina MiSeq,2010-11-17 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.1,309.0,,...,48.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,10.0
6,ERR1314640,PAIRED,Illumina MiSeq,2010-05-15 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.2,123.0,6.606,...,49.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,4.0
7,ERR1315089,PAIRED,Illumina MiSeq,2010-12-21 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.1,344.0,,...,48.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,11.0
8,ERR1315188,PAIRED,Illumina MiSeq,2010-09-24 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.2,256.0,,...,49.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,8.0
9,ERR1315190,PAIRED,Illumina MiSeq,2010-10-24 00:00:00,"USA, Missouri, St. Louis",38.63699,-90.263794,3.2,286.0,,...,49.0,male,Vaginal,Monozygotic,Caucasian,Not Hispanic,True,fd,True,9.0


### 5. ANCOM for zygosity column --> find significance