In [90]:
import pandas as pd

otu_df = pd.read_csv('dataframes/OTU.txt', sep='\t')
hiera_blast = pd.read_csv('dataframes/hiera_BLAST.txt', sep='\t')
meta_map = pd.read_csv('dataframes/MCOSIS_map.txt', sep='\t')
meta = pd.read_csv('dataframes/antibiotics kazakh finished.csv', sep='\t')

In [91]:
meta = meta[['ID', 'sample', 'day of research', 'intervention', 'date of birth']]
meta_map_expanded = meta_map.set_index('#SampleID').fastqFile.str.split(',', expand=True).stack().reset_index(name='fastqFile').drop('level_1', axis=1)

# Соединение таблиц
merged = pd.merge(meta, meta_map_expanded, left_on='ID', right_on='fastqFile')

# Замена старых значений 'ID' на новые из '#SampleID'
meta['ID'] = merged['#SampleID']

# Показываем результат
meta.head()

Unnamed: 0,ID,sample,day of research,intervention,date of birth
0,SMPL0,1,40,yes,23.10.2012
1,SMPL1,1,45,yes,23.10.2012
2,SMPL2,1,50,yes,23.10.2012
3,SMPL3,1,55,yes,23.10.2012
4,SMPL4,1,60,yes,23.10.2012


In [92]:
# Удаление всех строк, которые не содержат 'Collinsella' в столбце 'Taxonomy'
meta_result = meta[meta['sample'].astype(str).str.contains('1|3|4', na=False)]
meta_result = meta_result[meta_result['day of research'].astype(str).str.contains('1|60', na=False)]
meta_result = meta_result[~meta_result['day of research'].astype(str).str.contains(r'\b10\b|\b15\b', na=False)]
meta_result = meta_result[~meta_result['ID'].astype(str).str.contains('SMPL78|SMPL80|SMPL68', na=False)]

meta_result

Unnamed: 0,ID,sample,day of research,intervention,date of birth
4,SMPL4,1,60,yes,23.10.2012
24,SMPL24,3,60,yes,19.11.2013
66,SMPL66,1,1,yes,23.10.2012
67,SMPL67,4,1,yes,19.05.2014
76,SMPL76,3,1,yes,19.11.2013
92,SMPL92,4,60,yes,19.05.2014


In [93]:
# Транспонирование otu_df
otu_df_transposed = otu_df.set_index('OTU').T
otu_df_transposed = otu_df_transposed.reset_index().rename(columns={'index': 'SampleID'})

# Переименование столбцов в meta_result для соответствия с otu_df_transposed
meta_result = meta_result.rename(columns={'ID': 'SampleID', 'intervention': 'GROUP'})

# Объединение таблиц
merged_df = pd.merge(meta_result, otu_df_transposed, on='SampleID')

# Переименование столбцов OTU
for col in merged_df.columns:
    if 'OTU' in col:
        merged_df = merged_df.rename(columns={col: col.replace('OTU_', 'OTU')})

# Вывод результата
merged_df = merged_df.drop(columns=['GROUP', 'date of birth'])
merged_df.to_csv('dataframes/output/otu/otu_starting.csv', sep='\t')
merged_df

Unnamed: 0,SampleID,sample,day of research,OTU1,OTU2,OTU3,OTU4,OTU5,OTU6,OTU7,...,OTU4190,OTU4191,OTU4192,OTU4193,OTU4194,OTU4195,OTU4196,OTU4197,OTU4198,OTU4199
0,SMPL4,1,60,15,3,2,8,123,2,1,...,0,0,0,0,0,0,0,0,0,0
1,SMPL24,3,60,13,0,36,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,SMPL66,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,SMPL67,4,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SMPL76,3,1,8,0,1,0,155,0,0,...,0,0,0,0,0,0,0,0,0,0
5,SMPL92,4,60,4,1,0,0,17,0,87,...,0,0,0,0,0,0,0,0,0,0


In [94]:
def modify_sample_id(row):
    if row['SampleID'] == 'SMPL4' and row['day of research'] == 60:
        return 'SMPL4_1_after'
    elif row['SampleID'] == 'SMPL66' and row['day of research'] == 1:
        return 'SMPL66_1_before'
    elif row['SampleID'] == 'SMPL24' and row['day of research'] == 60:
        return 'SMPL24_3_after'
    elif row['SampleID'] == 'SMPL76' and row['day of research'] == 1:
        return 'SMPL76_3_before'
    elif row['SampleID'] == 'SMPL67' and row['day of research'] == 1:
        return 'SMPL67_4_before'
    elif row['SampleID'] == 'SMPL92' and row['day of research'] == 60:
        return 'SMPL92_4_after'
    else:
        return row['SampleID']


merged_df['ModifiedSampleID'] = merged_df.apply(modify_sample_id, axis=1)
#merged_df = merged_df.drop(columns = ['sample', 'day of research'])
columns_order = ['SampleID', 'ModifiedSampleID'] + [col for col in merged_df.columns if col not in ['SampleID', 'ModifiedSampleID']]
merged_df = merged_df[columns_order]
merged_df

Unnamed: 0,SampleID,ModifiedSampleID,sample,day of research,OTU1,OTU2,OTU3,OTU4,OTU5,OTU6,...,OTU4190,OTU4191,OTU4192,OTU4193,OTU4194,OTU4195,OTU4196,OTU4197,OTU4198,OTU4199
0,SMPL4,SMPL4_1_after,1,60,15,3,2,8,123,2,...,0,0,0,0,0,0,0,0,0,0
1,SMPL24,SMPL24_3_after,3,60,13,0,36,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,SMPL66,SMPL66_1_before,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,SMPL67,SMPL67_4_before,4,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SMPL76,SMPL76_3_before,3,1,8,0,1,0,155,0,...,0,0,0,0,0,0,0,0,0,0
5,SMPL92,SMPL92_4_after,4,60,4,1,0,0,17,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
for col in hiera_blast.columns[1:]: 
    hiera_blast[col] = col[0].lower() + '_' + hiera_blast[col]

hiera_blast['OTU'] = hiera_blast['OTU'].str.replace('_', '')

hiera_blast['Taxonomy'] = hiera_blast['Domain'] + '|' + \
                        hiera_blast['Phylum'] + '|' + \
                        hiera_blast['Class'] + '|' + \
                        hiera_blast['Order'] + '|' + \
                        hiera_blast['Family'] + '|' + \
                        hiera_blast['Genus'] + '|' + \
                        hiera_blast['Species']

hiera_blast = hiera_blast[['OTU', 'Taxonomy']]

hiera_blast.head()

Unnamed: 0,OTU,Taxonomy
0,OTU4155,d_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...
1,OTU4149,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...
2,OTU4093,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...
3,OTU4078,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...
4,OTU4073,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...


In [96]:
long_merged_df = pd.melt(merged_df, id_vars=['SampleID','ModifiedSampleID'], var_name='OTU', value_name='Value')

# Объединяем hiera_blast с long_merged_df по столбцу 'OTU'
merged_result = pd.merge(hiera_blast, long_merged_df, on='OTU', how='inner')

merged_result_non_zero = merged_result[merged_result['Value'] > 0]
merged_result_non_zero

Unnamed: 0,OTU,Taxonomy,SampleID,ModifiedSampleID,Value
241,OTU3209,d_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...,SMPL24,SMPL24_3_after,1
515,OTU2410,d_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacil...,SMPL92,SMPL92_4_after,1
736,OTU1973,d_Bacteria|p_Proteobacteria|c_Deltaproteobacte...,SMPL76,SMPL76_3_before,1
859,OTU516,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...,SMPL24,SMPL24_3_after,5
1117,OTU242,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...,SMPL24,SMPL24_3_after,11
...,...,...,...,...,...
6030,OTU36,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...,SMPL4,SMPL4_1_after,6
6031,OTU36,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...,SMPL24,SMPL24_3_after,1
6034,OTU36,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...,SMPL76,SMPL76_3_before,2
6035,OTU36,d_Bacteria|p_Firmicutes|c_Clostridia|o_Clostri...,SMPL92,SMPL92_4_after,68


In [97]:
# Удаление всех строк, которые не содержат 'Collinsella' в столбце 'Taxonomy'
filtered_result_collinsella = merged_result[merged_result['Taxonomy'].str.contains('Collinsella', na=False)]

filtered_result_collinsella.to_csv('dataframes/output/filtered_result_collinsella.csv', sep='\t')

In [98]:
from scipy.stats import shapiro
shapiro_results = shapiro(merged_result_non_zero['Value'])

shapiro_results

ShapiroResult(statistic=0.16837453842163086, pvalue=5.951906877591947e-31)

In [99]:
from scipy.stats import mannwhitneyu, ttest_ind, kruskal, f_oneway
from mlxtend.evaluate import permutation_test
import numpy as np

def statics(df, sample_before, sample_after):
    # Фильтруем данные по группам
    group_before = df[df['ModifiedSampleID'] == sample_before]['Value']
    group_after = df[df['ModifiedSampleID'] == sample_after]['Value']

    # Проверка, что в каждой группе есть хотя бы два наблюдения
    if len(group_before) > 1 and len(group_after) > 1:
        # Расчет p-value с помощью Mann-Whitney U теста
        u_statistic, p_value = mannwhitneyu(group_before, group_after, alternative='two-sided')
        file.write(f'U-Statistic mannwhitneyu: {u_statistic}\n')
        file.write(f'p-value mannwhitneyu: {p_value}\n')
    else:
        file.write("Одна из групп содержит слишком мало значений для анализа.")

    t_stat, p_value_t_test = ttest_ind(group_before, group_after, equal_var=False)  # Welch's t-test
    file.write(f'T-Statistic Welch\'s t-test: {t_stat}\n')
    file.write(f'p-value (T-test) Welch\'s t-test: {p_value_t_test}\n')

    kruskal_stat, p_value_kruskal = kruskal(group_before, group_after)
    file.write(f'Kruskal-Wallis Statistic: {kruskal_stat}\n')
    file.write(f'p-value (Kruskal-Wallis): {p_value_kruskal}\n')

    f_stat, p_value_anova = f_oneway(group_before, group_after)
    file.write(f'F-Statistic (ANOVA): {f_stat}\n')
    file.write(f'p-value (ANOVA): {p_value_anova}\n')

    p_value_permutation = permutation_test(group_before, group_after, method='approximate', num_rounds=10000, seed=0)
    file.write(f'p-value (Permutation test): {p_value_permutation}\n')

with open('dataframes/output/saumal_after_before.txt', 'w') as file: 
    file.write(f'SMPL66_1_before vs SMPL4_1_after:\n')
    statics(merged_result_non_zero, 'SMPL66_1_before', 'SMPL4_1_after')
    file.write(f'\n\nSMPL76_3_before vs SMPL24_3_after:\n')
    statics(merged_result_non_zero, 'nSMPL76_3_before', 'SMPL24_3_after')
    file.write(f'\n\nSMPL67_4_before vs SMPL92_4_after:\n')
    statics(merged_result_non_zero, 'SMPL67_4_before', 'SMPL92_4_after')


  t_stat, p_value_t_test = ttest_ind(group_before, group_after, equal_var=False)  # Welch's t-test
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [101]:
import pandas as pd

data = merged_result_non_zero.copy()

# Создание нового столбца 'Sample_Group'
data['№'] = data['SampleID'] + '_' + data['ModifiedSampleID']

# Создание сводной таблицы
pivot_df = data.pivot_table(index='Taxonomy', columns='№', values='Value', aggfunc='sum', fill_value=0)

# Сброс индекса, чтобы 'Taxonomy' стал столбцом
pivot_df.reset_index(inplace=True)

# Вывод первых нескольких строк полученного DataFrame для проверки
pivot_df.head()

№,Taxonomy,SMPL24_SMPL24_3_after,SMPL4_SMPL4_1_after,SMPL66_SMPL66_1_before,SMPL67_SMPL67_4_before,SMPL76_SMPL76_3_before,SMPL92_SMPL92_4_after
0,d_Bacteria|p_Actinobacteria|c_Coriobacteriia|o...,3,0,0,0,0,0
1,d_Bacteria|p_Actinobacteria|c_Coriobacteriia|o...,0,1,0,0,0,0
2,d_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...,0,1,0,0,3,0
3,d_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...,1,10,0,2,0,0
4,d_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...,13,15,0,0,8,4


In [104]:
def rename_taxa(row):
    parts = row.split('|')
    cleaned_parts = [part for part in parts if not part.endswith('_?')]
    return '|'.join(cleaned_parts)

pivot_df['Taxonomy'] = pivot_df['Taxonomy'].apply(rename_taxa)
pivot_df['Taxonomy'] = pivot_df['Taxonomy'].str.split('|').str[-1]
pivot_df.to_csv('dataframes/output/pivot_non_zero.csv', sep='\t')