# Primary analyses of our sequences
### Load modules

In [1]:
import os
import sys

In [None]:
!pip install geopandas

In [None]:
pip install cartopy

In [2]:
import pandas as pd
import qiime2 as q2
from qiime2 import Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import cartopy.crs as ccrs
%matplotlib inline

# Define the data directory (root folder for the notebook's input and output files).
# NOTE(review): the cells visible in this section spell out './data/...' paths directly
# instead of building them from this variable - confirm whether later cells use it.
data_dir = './data'

### Checking out the metadata

In [3]:
# Run `qiime metadata tabulate` on the raw metadata TSV and write the result
# as a QIIME 2 visualization (.qzv) next to the input file.
!qiime metadata tabulate \
  --m-input-file ./data/metadata/fungut_metadata.tsv \
  --o-visualization ./data/metadata/fungut_metadata.qzv

[32mSaved Visualization to: ./data/metadata/fungut_metadata.qzv[0m
[0m

In [None]:
# Load the .qzv written by the previous cell; as the last expression of the cell
# the returned object becomes the cell's output.
Visualization.load('./data/metadata/fungut_metadata.qzv')

# Preprocessing the metadata

### Removing rows where both disease variables are "Not provided"

In [5]:
# Load the tab-separated sample metadata into a DataFrame
metadata = pd.read_csv(
    "./data/metadata/fungut_metadata.tsv",
    delimiter="\t",
)

In [6]:
# Flag the samples for which BOTH response variables ("ibd_sample" and
# "gluten_sample") are "Not provided". The mask is computed once and reused, so
# the rows that are counted and the rows that are removed can never diverge.
both_missing = (
    (metadata['ibd_sample'] == "Not provided")
    & (metadata['gluten_sample'] == "Not provided")
)

# Keep the affected rows in their own frame so they can still be inspected
filtered_rows = metadata[both_missing]
print(len(filtered_rows), "samples have both response variables not provided")

# Remove the flagged samples from the working metadata
metadata = metadata[~both_missing]
print(len(filtered_rows), "samples have been removed")

4 samples have both response variables not provided
4 samples have been removed


### How many "Not Provided" do we have per sample?

In [7]:
# Number of "Not provided" entries in each sample (row). This does not depend
# on x, so it is computed once instead of on every loop iteration.
not_provided_per_sample = metadata.eq("Not provided").sum(axis=1)

# Count the samples that have at least x columns not provided
for x in range(1, 6):
    count = int((not_provided_per_sample >= x).sum())
    print(count, "samples have not provided", x, "columns")

62 samples have not provided 1 columns
10 samples have not provided 2 columns
5 samples have not provided 3 columns
2 samples have not provided 4 columns
1 samples have not provided 5 columns


#### We do not filter out any of these samples. Even the one sample with 5 missing columns is acceptable.

### Create new features "Symptoms vs No-Symptoms"

In [8]:
# Show every distinct answer that occurs in the gluten column
pd.unique(metadata['gluten_sample'])

array(['No',
       'I was diagnosed with gluten allergy (anti-gluten IgG), but not celiac disease',
       'I do not eat gluten because it makes me feel bad',
       'I was diagnosed with celiac disease', 'Not provided'],
      dtype=object)

In [9]:
# Show every distinct answer that occurs in the IBD column
pd.unique(metadata['ibd_sample'])

array(['I do not have this condition', 'Self-diagnosed',
       'Diagnosed by a medical professional (doctor, physician assistant)',
       'Not provided'], dtype=object)

We have 2 columns in which the participants in our data set state whether they have been diagnosed with Inflammatory Bowel Disease (IBD) and how they react to gluten. Because the answers are not easy to compare against each other, we create two new columns (one for gluten and one for IBD) that summarize whether a person shows symptoms or not. This gives us a binary variable that splits the samples into 2 groups, which are easier to compare. Another problem is that most people answered "No", while the other groups are much smaller. By grouping everyone who shows symptoms together, we get one bigger group instead of 3 smaller ones. Samples whose answer is "Not provided" keep that label in the new columns.

In [10]:
# Answers that are summarized as "no_symptoms"; every other provided answer
# is summarized as "symptoms"
no_symptoms = ['I do not have this condition']


def _classify_ibd(answer):
    """Map one raw 'ibd_sample' answer to 'Not provided', 'no_symptoms' or 'symptoms'."""
    if answer == 'Not provided':
        # Missing answers stay recognizable instead of being forced into a group
        return 'Not provided'
    if answer in no_symptoms:
        return 'no_symptoms'
    return 'symptoms'


# Create the new summary column from the raw answers
metadata['ibd_symptoms'] = metadata['ibd_sample'].apply(_classify_ibd)

In [11]:
# Answers that are summarized as "no_symptoms"; every other provided answer
# is summarized as "symptoms"
no_symptoms = ['No']


def _classify_gluten(answer):
    """Map one raw 'gluten_sample' answer to 'Not provided', 'no_symptoms' or 'symptoms'."""
    if answer == 'Not provided':
        # Missing answers stay recognizable instead of being forced into a group
        return 'Not provided'
    if answer in no_symptoms:
        return 'no_symptoms'
    return 'symptoms'


# Create the new summary column from the raw answers
metadata['gluten_symptoms'] = metadata['gluten_sample'].apply(_classify_gluten)

### Add new feature "rural_urban"
The metadata contains the longitude and latitude of each sample. From these we can (roughly) determine where a person lives. We want to distinguish between rural and urban areas, so we classify the location of every sample as rural or urban with the help of the packages 'geopandas' and 'cartopy'. The result is stored in the column `is_urban`.

In [12]:
# Boolean mask for rows whose longitude AND latitude were actually reported
valid_coords = (metadata['longitude_sample'] != "Not provided") & (metadata['latitude_sample'] != "Not provided")

# Default label for every sample; only rows with usable coordinates are
# overwritten further down
metadata['is_urban'] = "Not provided"

# Work on an explicit copy of the rows with coordinates, so that columns added
# to the GeoDataFrame never write into a view of `metadata`
valid_metadata = metadata.loc[valid_coords].copy()

# Build one point per sample (x = longitude, y = latitude) in EPSG:4326
gdf = gpd.GeoDataFrame(
    valid_metadata,
    geometry=gpd.points_from_xy(
        valid_metadata['longitude_sample'].astype(float),
        valid_metadata['latitude_sample'].astype(float),
    ),
    crs="EPSG:4326",
)

# Load the shapefile for urban areas
urban_areas = gpd.read_file('./data/urban_areas/ne_10m_urban_areas.shp')

# A spatial join compares raw coordinates, so both layers must share one CRS
if urban_areas.crs is not None and urban_areas.crs != gdf.crs:
    urban_areas = urban_areas.to_crs(gdf.crs)

# Left join keeps every sample; `index_right` is null when the point lies in
# no urban polygon. A point inside several overlapping polygons produces one
# joined row per polygon (duplicated index label), so the result is collapsed
# to a single boolean per sample before it is assigned back by index.
joined = gpd.sjoin(gdf, urban_areas, predicate='within', how='left')
gdf['is_urban'] = joined['index_right'].notnull().groupby(level=0).any()

# Update the 'is_urban' column in the original metadata DataFrame
# (assignment is aligned on the row index)
metadata.loc[valid_coords, 'is_urban'] = gdf['is_urban'].astype(bool)

### Save processed metadata as tsv file

In [13]:
# Write the processed metadata as a tab-separated file, without the row index
processed_metadata_path = './data/metadata/fungut_metadata_processed.tsv'
metadata.to_csv(processed_metadata_path, sep='\t', index=False)