In [14]:
from ResearchMain import *

In [15]:
# Thresholds for the correlation export:
#   SIGN - minimum Weight (= 1 - p) an edge must reach to be exported.
#   COOR - minimum |r|. NOTE(review): no active code in this notebook section reads COOR;
#          confirm whether the |r| filter should be re-enabled or the constant dropped.
SIGN, COOR = 0.9, 0.5

# Point EXPORT_PATH at the Gephi sub-directory. The suffix is appended only when it is
# not already there, so re-running this cell does not keep extending the path.
_GEPHI_SUBDIR = 'Gephi/Beta_02/'
if not EXPORT_PATH.endswith(_GEPHI_SUBDIR):
    EXPORT_PATH += _GEPHI_SUBDIR

In [16]:
# 定义函数
def analyse_pearson_to_gephi(input_df):
    """
    Compute the pairwise Pearson correlation between the columns of a DataFrame
    and return the result as an edge list suitable for Gephi.

    Every unordered pair of columns is reported exactly once, in column order
    (column i is paired with every column j > i).

    :param input_df: DataFrame whose columns are the variables to correlate.
    :return: DataFrame with the columns ``Source``, ``Target``, ``r`` and ``Weight``.
             ``r`` is the Pearson coefficient (range -1..1) and ``Weight`` is ``1 - p``:
             the smaller the p-value, the more significant the correlation and the
             thicker the edge should be drawn in Gephi. The frame is empty (but still
             has the four columns) when ``input_df`` has fewer than two columns.
    """
    variable = input_df.columns.to_list()
    col_name = ['Source', 'Target', 'r', 'Weight']

    # Collect plain records and build the frame once at the end. Concatenating onto a
    # growing DataFrame inside the loop is quadratic, and starting from an empty
    # object-dtype frame makes the dtype of 'r'/'Weight' depend on the pandas version.
    records = []
    for i, source in enumerate(variable):
        for target in variable[i + 1:]:
            r, p = pearsonr(input_df[source], input_df[target])
            records.append((source, target, r, 1 - p))

    return pd.DataFrame(records, columns=col_name)


def read_table_to_pearson(path):
    """
    Read a table from disk and run the pairwise Pearson analysis on it.

    The first column of the file is used as the row index and the first row as the
    column header, for both supported formats.

    :param path: Path of the input file. ``.xlsx`` and ``.csv`` are supported
                 (the extension is matched case-insensitively).
    :return: The edge-list DataFrame produced by ``analyse_pearson_to_gephi``
             (``r`` is the correlation, ``Weight`` is ``1 - p``), or ``None`` when
             ``path`` is empty / ``None``.
    :raises ValueError: if the file has any other (or no) extension.
    """
    import os  # local import so the block does not depend on what the star-import provides

    if not path:
        return None

    lowered = path.lower()
    if lowered.endswith('.xlsx'):  # Excel workbook
        read_df = pd.read_excel(path, index_col=0)
    elif lowered.endswith('.csv'):  # CSV file
        # header must be a row number: read_csv rejects header=True with a TypeError.
        # index_col=0 mirrors the Excel branch so both formats yield the same layout.
        read_df = pd.read_csv(path, header=0, index_col=0)
    else:
        # splitext looks only at the final component, so dots in directory names
        # ("./export/x.txt") or a missing extension cannot produce a wrong or failing lookup.
        extension = os.path.splitext(path)[1].lstrip('.')
        raise ValueError(f"该函数不支持此种文件类型{extension}")
    return analyse_pearson_to_gephi(read_df)

In [17]:
# Preview the first 13 columns of all_df (positional slice). In the recorded output
# these are the columns from 'Bacterial 16S rRNA' through 'hzsB'.
all_df.iloc[:,:13]

Unnamed: 0,Bacterial 16S rRNA,Archaeal 16S rRNA,AOA_amoA,AOB_amoA,nxrA,narG,napA,nirK,nirS,nosZ,norB,hzsA,hzsB
D-N1,3.131311e+12,1.646282e+07,9.372749e+06,238737.749855,4.809669e+06,1.693294e+05,1.147444e+06,6.111955e+05,2.580789e+08,3.427432e+05,1.034718e+05,3.019290e+05,5.024014e+05
D-N2,2.310274e+12,1.606909e+07,1.310459e+07,338309.302429,6.175807e+06,5.203335e+05,3.427096e+06,7.079003e+05,4.167981e+08,9.668389e+05,3.084835e+05,8.599297e+05,4.908595e+05
D-N3,9.328554e+11,1.322307e+07,1.571698e+07,412887.196731,4.135236e+06,7.257937e+04,3.493181e+05,5.374574e+05,2.427963e+08,3.473164e+05,1.151906e+05,3.346185e+05,4.070752e+05
D-N4,9.369330e+11,4.875424e+07,1.643166e+07,489411.878787,1.537017e+07,5.395949e+04,2.081086e+05,5.704071e+05,2.891821e+08,1.213764e+06,1.467825e+05,4.220878e+05,1.392828e+05
D-N5,2.465170e+12,4.627955e+07,2.990348e+07,528491.109190,1.631022e+07,4.789525e+04,4.383332e+05,1.478592e+06,1.420427e+08,5.683137e+05,3.087835e+05,8.607312e+05,1.324880e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
W-H8,3.018167e+12,2.008959e+08,7.468539e+07,292694.160549,4.418316e+09,1.857077e+06,4.228224e+06,3.455924e+07,9.231036e+07,8.684252e+08,1.027672e+06,1.819500e+07,8.110777e+07
W-H9,3.239835e+12,1.181798e+08,4.200252e+07,465551.966622,2.781485e+09,1.705901e+06,2.498973e+06,3.240429e+07,6.205803e+07,2.022403e+08,2.581152e+05,1.196430e+07,4.873319e+07
W-H10,3.313568e+12,1.974228e+08,7.328582e+07,615337.809914,4.666949e+09,9.620983e+05,4.403474e+06,2.686501e+07,9.518481e+07,8.536139e+08,4.550655e+05,1.879367e+07,7.976105e+07
W-H11,3.901704e+12,1.908625e+08,7.064787e+07,487954.228922,5.364548e+09,7.454658e+05,3.090290e+06,2.896472e+07,7.285221e+07,3.245037e+08,1.317855e+06,1.417144e+07,7.721460e+07


In [18]:
# Sanity check: run the pairwise Pearson analysis on the first 13 columns of all_df
# and display the resulting Source/Target/r/Weight edge list.
analyse_pearson_to_gephi(all_df.iloc[:,:13])

Unnamed: 0,Source,Target,r,Weight
0,Bacterial 16S rRNA,Archaeal 16S rRNA,0.244583,0.979837
1,Bacterial 16S rRNA,AOA_amoA,0.080195,0.54757
2,Bacterial 16S rRNA,AOB_amoA,0.095019,0.626987
3,Bacterial 16S rRNA,nxrA,0.1466,0.832041
4,Bacterial 16S rRNA,narG,0.120484,0.742014
...,...,...,...,...
73,nosZ,hzsA,0.490807,0.999999
74,nosZ,hzsB,0.705947,1.0
75,norB,hzsA,0.382483,0.999801
76,norB,hzsB,0.344754,0.999124


In [19]:
# For every period/river combination: select its samples, compute the pairwise Pearson
# correlations, keep the significant edges and export two edge lists for Gephi.
for period in period_list:
    for river in river_list:
        # Sample IDs are "<period initial>-<river initial><number>", e.g. "D-N1".
        select_prefix = period[0] + '-' + river[0]

        # Keep samples 1-5 of this period/river. The complete numeric suffix is parsed:
        # testing only the last character would also let samples such as 10 and 11 through.
        # NOTE(review): confirm that "sample number <= 5" is the intended selection.
        selected_rows = [
            row_name for row_name in all_df.index
            if row_name.startswith(select_prefix)
            and row_name[len(select_prefix):].isdigit()
            and int(row_name[len(select_prefix):]) <= 5
        ]

        # Only string-named columns take part in the analysis.
        string_columns = [col_name for col_name in all_df.columns if isinstance(col_name, str)]

        # Columns excluded from the correlation: every '-Ф' / '-P' column plus a fixed list.
        unwanted_columns = [
            col_name for col_name in string_columns
            if col_name.endswith('-Ф') or col_name.endswith('-P')
        ] + ['Period', 'River', 'S-pH', 'W-PO_{4}^{3-}', 'W-TP']

        df_handle = all_df.loc[selected_rows, string_columns].drop(unwanted_columns, axis=1)

        # Missing values are replaced by 0 (rows are NOT dropped) so that every
        # selected sample stays in the correlation.
        df_handle = df_handle.replace(np.nan, 0)

        # Pairwise Pearson correlation on the pre-processed frame.
        df_pearson = analyse_pearson_to_gephi(df_handle)

        # Keep only significant edges (Weight = 1 - p). No |r| >= COOR filter is applied here.
        df_pearson = df_pearson[df_pearson['Weight'] >= SIGN]

        # Vectorised membership tests. Both endpoints are checked explicitly: the expression
        # "Source and Target in gene_list" only tests Target because of operator precedence.
        source_is_gene = df_pearson['Source'].isin(gene_list)
        target_is_gene = df_pearson['Target'].isin(gene_list)

        # Gene-gene correlations for this period / river.
        df_pearson[source_is_gene & target_is_gene].to_csv(
            EXPORT_PATH + f'{select_prefix} - 功能基因相关性.csv', index=False, header=True)

        # Gene-environment correlations for this period / river.
        df_pearson[source_is_gene & ~target_is_gene].to_csv(
            EXPORT_PATH + f'{select_prefix} - 功能基因与环境因子相关性.csv', index=False, header=True)

In [20]:
# Display the directory prefix used for the exported CSV files.
EXPORT_PATH

'./export/Gephi/Beta_02/'