In [55]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# from mlxtend.frequent_patterns import apriori
# from mlxtend.frequent_patterns import association_rules

In [2]:
# Raw input directories, one per sequencing platform (relative to the notebook).
ILUMINA_PATH = Path('../data/Ilumina')
NANOPORE_PATH = Path('../data/Nanopore')

# Cleaned CSV exports are written to a `clean` sub-folder next to the raw data.
ILUMINA_CLEAN_PATH = ILUMINA_PATH / 'clean'
NANOPORE_CLEAN_PATH = NANOPORE_PATH / 'clean'

In [106]:
# Source workbooks. Each one is opened a single time below and every sheet is
# read from the already-open handle, instead of re-parsing the file per sheet.
ILUMINA_WORKBOOK = ILUMINA_PATH / "Metadata_and_relative_abundance_of_seminal_microbiota_from_idiopathic_infertile_patients_and_donors.xlsx"
NANOPORE_WORKBOOK = NANOPORE_PATH / "Relative abundance of seminal microbiota from idiopathic infertile patients and donors using MinION sequencing.xlsx"

# Options shared by both reads of the metadata sheet: the first row and the
# last row of the sheet are skipped, and only the literal "null" and the empty
# string are parsed as missing (pandas' default NA markers are disabled).
metadata_read_options = dict(
    sheet_name="Sample info + Sperm quality",
    skiprows=1,
    skipfooter=1,
    keep_default_na=False,
    na_values=["null", ""],
)

# The metadata sheet is split by column position into two tables:
#   - SampleInfo:   columns 0-13, except column 4
#   - SpermQuality: columns 0-1 plus columns 14-30
sample_info_cols = [i for i in range(14) if i != 4]
sperm_quality_cols = [i for i in range(31) if i not in range(2, 14)]

with pd.ExcelFile(ILUMINA_WORKBOOK) as ilumina_xls:
    SampleInfo = pd.read_excel(ilumina_xls, usecols=sample_info_cols, **metadata_read_options)
    SpermQuality = pd.read_excel(ilumina_xls, usecols=sperm_quality_cols, **metadata_read_options)

    # Relative-abundance tables, one sheet per taxonomic level.
    # ("Pylum" is the sheet name as spelled in the workbook.)
    IluminaPylum = pd.read_excel(ilumina_xls, sheet_name="Pylum-level microbiota")
    IluminaFamily = pd.read_excel(ilumina_xls, sheet_name="Family-level microbiota")
    IluminaGenus = pd.read_excel(ilumina_xls, sheet_name="Genus-level microbiota")

with pd.ExcelFile(NANOPORE_WORKBOOK) as nanopore_xls:
    NanoporePylum = pd.read_excel(nanopore_xls, sheet_name="Pylum-level microbiota")
    NanoporeFamily = pd.read_excel(nanopore_xls, sheet_name="Family-level microbiota")
    NanoporeGenus = pd.read_excel(nanopore_xls, sheet_name="Genus-level microbiota")

In [137]:
# Add a binary "Fertil" column to every abundance table: 1 by default, 0 for
# rows whose sample ID starts with "UAB".
# (Presumably the UAB-prefixed IDs are the infertile patients -- confirm
# against the study metadata.)
#
# Each table is labelled from its OWN "Sample ID" column. Building the mask
# from a different table would label rows by position in that other table,
# which is only correct if both tables list the same samples in the same order.
# NOTE(review): assumes every sheet has a "Sample ID" column, as the
# phylum-level Ilumina sheet does -- a KeyError here means a sheet names its
# ID column differently.
abundance_tables = (
    IluminaPylum,
    IluminaFamily,
    IluminaGenus,
    NanoporePylum,
    NanoporeFamily,
    NanoporeGenus,
)

for abundance_table in abundance_tables:
    is_uab_sample = abundance_table["Sample ID"].str[:3] == "UAB"
    abundance_table["Fertil"] = 1
    abundance_table.loc[is_uab_sample, "Fertil"] = 0

In [141]:

# Ordinal discretiser: every column is cut into 5 equal-width bins and each
# value is replaced by its bin index.
encode = KBinsDiscretizer(
    n_bins=5,
    encode='ordinal',
    strategy='uniform',
    subsample=None,
    random_state=2707,
)

# `transformer.transform(X)` simply forwards to `encode.fit_transform(X)`, so
# the bin edges are re-fitted on every table that is passed through it.
transformer = FunctionTransformer(encode.fit_transform)

# The Ilumina tables are encoded on copies so the raw frames stay available.
IluminaPylumCod = IluminaPylum.copy()
IluminaFamilyCod = IluminaFamily.copy()
IluminaGenusCod = IluminaGenus.copy()

# NOTE(review): unlike the Ilumina tables, the Nanopore frames are overwritten
# in place. Re-running this cell therefore re-bins already-binned values, and
# any later cell that reads NanoporePylum/Family/Genus sees the binned data.
tables_to_encode = (
    IluminaPylumCod,
    IluminaFamilyCod,
    IluminaGenusCod,
    NanoporePylum,
    NanoporeFamily,
    NanoporeGenus,
)

for table in tables_to_encode:
    # Skip the first and the last column; in the displayed Ilumina phylum
    # table these are "Sample ID" and "Fertil".
    value_cols = range(1, table.shape[1] - 1)
    table.iloc[:, value_cols] = transformer.transform(table.iloc[:, value_cols])



In [146]:
# One-hot variant of the 5-bin equal-width discretiser (dense output: one
# indicator column per bin).
encodeOH = KBinsDiscretizer(
    n_bins=5,
    encode='onehot-dense',
    strategy='uniform',
    subsample=None,
    random_state=2707,
)

# Working copies intended to receive the one-hot columns.
IluminaPylumCodOH = IluminaPylumCod.copy()
IluminaFamilyCodOH = IluminaFamilyCod.copy()
IluminaGenusCodOH = IluminaGenusCod.copy()

# Every column of the raw phylum table except the first and the last one.
columns = IluminaPylum.columns[1:-1]

# NOTE(review): this loop looks unfinished. For each column it one-hot encodes
# the values and extracts the first indicator column into `aux`, but nothing is
# written back to the *CodOH frames, so only the last column's `colOH`/`aux`
# survive the loop.
for column in columns:
    colOH = encodeOH.fit_transform(IluminaPylum[[column]])
    aux = [indicator_row[0] for indicator_row in colOH]


In [109]:
# Write every table to its platform's `clean` folder as UTF-8 CSV without the
# row index.
# The output folders are created first: `to_csv` does not create missing
# parent directories and would otherwise fail on a fresh checkout.
ILUMINA_CLEAN_PATH.mkdir(parents=True, exist_ok=True)
NANOPORE_CLEAN_PATH.mkdir(parents=True, exist_ok=True)

# File name -> table, in the order the files are written.
ilumina_exports = {
    "SampleInfo.csv": SampleInfo,
    "SpermQuality.csv": SpermQuality,
    "IluminaPylumCod.csv": IluminaPylumCod,
    "IluminaFamilyCod.csv": IluminaFamilyCod,
    "IluminaGenusCod.csv": IluminaGenusCod,
    "IluminaPylum.csv": IluminaPylum,
    "IluminaFamily.csv": IluminaFamily,
    "IluminaGenus.csv": IluminaGenus,
}
nanopore_exports = {
    "NanoporePylum.csv": NanoporePylum,
    "NanoporeFamily.csv": NanoporeFamily,
    "NanoporeGenus.csv": NanoporeGenus,
}

for file_name, table in ilumina_exports.items():
    table.to_csv(ILUMINA_CLEAN_PATH / file_name, index=False, encoding='utf-8')

for file_name, table in nanopore_exports.items():
    table.to_csv(NANOPORE_CLEAN_PATH / file_name, index=False, encoding='utf-8')

In [112]:
# Display the ordinal-encoded phylum-level Ilumina table to inspect the binning result.
IluminaPylumCod

Unnamed: 0,Sample ID,Firmicutes,Proteobacteria,Actinobacteria,Tenericutes,Bacteroidetes,Armatimonadetes,Spirochaetes,Planctomycetes,Verrucomicrobia,...,Dictyoglomi,Balneolaeota,Fibrobacteres,Chlamydiae,Acidobacteria,Kiritimatiellaeota,Nitrospirae,candidate.division.Zixibacteria,Thermodesulfobacteria,Fertil
0,UAB 17001,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,UAB 17002,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0
2,UAB 17003,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,UAB 17004,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,UAB 17005,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0
5,UAB 17006,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
6,UAB 17007,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0
7,UAB 17008,4.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,UAB 17009,3.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
9,UAB 17010,3.0,2.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0
