In [113]:
from ResearchMain import *

In [114]:
# Analysis settings:
#   SEASON - season code: 'D' (dry), 'L' (normal) or 'W' (wet).
#   SIGN   - minimum significance (Weight = 1 - p) an edge must reach to be kept.
#   COOR   - minimum |r|; only referenced by an optional, commented-out filter.
SEASON, SIGN, COOR = 'W', 0.9, 0.5
# Guard the append so that re-running this cell does not produce
# '.../Gephi/Gephi/' (the original `+=` stacked the suffix on every run).
if not EXPORT_PATH.endswith('Gephi/'):
    EXPORT_PATH += 'Gephi/'

In [115]:
# Translate the season code into the label used in the exported file names.
# Unknown codes leave `season_name` as None and are reported through `hues`.
season_name = {'D': '枯水期', 'L': '平水期', 'W': '丰水期'}.get(SEASON)
if season_name is None:
    if SEASON is None:
        # No season chosen yet.
        hues.warn('请在上一个单元格内选择季节')
    else:
        # A value was given, but it is not one of D / L / W.
        hues.error('输入季节不合法！D：枯水期；L：平水期；W：丰水期。')

In [116]:
# 定义函数
def analyse_pearson_to_gephi(input_df):
    """
    Compute the Pearson correlation between every pair of columns and return
    the result as an edge list (one row per unordered column pair).

    Pairs are emitted in column order: each column is paired once with every
    column that follows it, so (A, B) appears but (B, A) does not.

    :param input_df: DataFrame whose columns are the variables to correlate.
    :return: DataFrame with columns ``Source``, ``Target``, ``r`` and ``Weight``
        and a fresh 0..n-1 index. ``r`` is the Pearson coefficient (-1..1) and
        ``Weight`` is ``1 - p``: a smaller p-value means stronger significance,
        which should show up as a thicker line in Gephi. With fewer than two
        columns the result is an empty frame that still has the four columns.
    """
    variable = input_df.columns.to_list()
    col_name = ['Source', 'Target', 'r', 'Weight']

    # Collect plain records and build the frame once. Growing a DataFrame with
    # pd.concat inside the loop is quadratic, and concatenating onto an empty
    # frame is deprecated in recent pandas.
    records = []
    for i, source in enumerate(variable):
        for target in variable[i + 1:]:
            r, p = pearsonr(input_df[source], input_df[target])
            records.append((source, target, r, 1 - p))

    return pd.DataFrame(records, columns=col_name)


def read_table_to_pearson(path):
    """
    Read a table from disk and run the pairwise Pearson analysis on it.

    :param path: Path of the file to read. ``.xlsx`` and ``.csv`` are
        supported (the extension is matched case-insensitively). A falsy
        value (``None`` or ``''``) means "nothing to read".
    :return: The edge list produced by ``analyse_pearson_to_gephi`` (``r`` is
        the correlation, ``Weight`` is ``1 - p``), or ``None`` when ``path``
        is falsy.
    :raises ValueError: If the file extension is not supported.
    """
    import os  # local import: the notebook's import cell is not touched

    if not path:
        return None

    # splitext looks only at the final extension, so dots inside directory
    # names ('./data/x.txt') or a missing extension no longer misreport or
    # raise IndexError.
    suffix = os.path.splitext(path)[1].lower()
    if suffix == '.xlsx':  # Excel file
        read_df = pd.read_excel(path, index_col=0)
    elif suffix == '.csv':  # csv file
        # header must be a row number; pandas rejects header=True with a
        # TypeError. index_col=0 mirrors the Excel branch.
        # NOTE(review): assumes the first CSV column holds the sample labels,
        # as in the Excel layout - confirm.
        read_df = pd.read_csv(path, header=0, index_col=0)
    else:
        raise ValueError(f"该函数不支持此种文件类型{suffix.lstrip('.') or path}")
    return analyse_pearson_to_gephi(read_df)

In [117]:
# Select the rows of the chosen season whose label ends in a digit <= 5, and
# only the columns that have string names.
# NOTE(review): only the LAST character of the label is compared, so labels
# such as 'W-N10'..'W-N12' also pass the `<= 5` test - confirm this is intended.
_ = all_df.loc[
    [row_name for row_name in all_df.index
     if row_name.startswith(f'{SEASON}-') and int(row_name[-1]) <= 5],
    [col_name for col_name in all_df.columns if isinstance(col_name, str)],
]

# Drop the columns that are not needed: every '-Ф' / '-P' column plus a fixed
# list of descriptive or unused variables.
df = _.drop(
    columns=[col_name for col_name in all_df.columns
             if isinstance(col_name, str) and col_name.endswith(('-Ф', '-P'))]
            + ['Period', 'River', 'S-pH', 'W-PO_{4}^{3-}', 'W-TP'],
)

# Treat zeros as missing values, then discard every row that has a missing value.
df = df.replace(0, np.nan).dropna()

df

Unnamed: 0,Bacterial 16S rRNA,Archaeal 16S rRNA,AOA_amoA,AOB_amoA,nxrA,narG,napA,nirK,nirS,nosZ,...,Ⅳ-Area,Ⅴ-Area,W-pH,W-DO,W-TN,W-NO_{2}^{-},W-NO_{3}^{-},W-NH_{4}^{+},W-COD,W-T
W-N1,2158495000000.0,58463630.0,19575280.0,653561.357376,2142571000.0,1794488.0,9513339.0,7817231.0,57171280.0,101008600.0,...,18839.68,831438.019,8.4,8.21,1.86,0.0248,0.102461,1.0177,12.97,21.2
W-N2,3320460000000.0,157985300.0,57548690.0,616844.287014,3460091000.0,7147990.0,31279100.0,48790650.0,69536620.0,685163600.0,...,18862.65,1021062.859,8.29,8.11,1.83,0.0206,0.179035,1.344966,11.63,21.2
W-N3,2043295000000.0,97543470.0,34108700.0,561942.319697,2015707000.0,2055391.0,19010430.0,31577250.0,72732980.0,167360800.0,...,16188.6,787915.619,8.5,8.01,2.54,0.0334,0.268371,1.975929,12.51,21.4
W-N4,2390863000000.0,77143620.0,26444140.0,545181.131191,2400762000.0,2792997.0,18072580.0,36603200.0,25173180.0,337833500.0,...,18327.64,755204.529,8.33,7.87,1.92,0.0236,0.162018,1.202982,12.09,21.5
W-N5,1262222000000.0,111563200.0,39457720.0,503490.851666,1179310000.0,7156789.0,19846660.0,29841690.0,46379050.0,191066700.0,...,12330.47,764032.039,8.15,7.58,2.48,0.0228,0.08119,1.9267,10.8,20.6
W-N10,1335841000000.0,96028440.0,33534400.0,330945.877537,1256103000.0,7049180.0,3169426.0,25160360.0,70508020.0,419287500.0,...,7082.29,503739.469,8.23,7.88,2.06,0.0222,0.162018,1.2386,12.31,22.4
W-N11,3179300000000.0,63743660.0,21500190.0,306597.571342,3296796000.0,1501891.0,4032482.0,20566360.0,73020770.0,110001800.0,...,9706.8,188339.739,8.26,7.96,2.39,0.0234,0.08119,1.3577,12.39,22.3
W-N12,2084212000000.0,94074610.0,32794910.0,284176.25782,2060677000.0,3550521.0,3206041.0,23821490.0,293876300.0,410871100.0,...,20780.52,1016141.949,8.37,8.24,1.71,0.0232,0.204559,1.2253,11.8,22.3
W-P1,3596260000000.0,50097050.0,16555750.0,505712.679612,1487514000.0,3264990.0,6266568.0,10569000.0,48617240.0,86735130.0,...,17445.29,423735.989,8.97,11.55,1.63,0.0557,0.132239,1.1592,11.9,28.9
W-P2,2699581000000.0,124388000.0,44401230.0,660347.075105,1286545000.0,1793372.0,11823200.0,41291490.0,119425000.0,541209200.0,...,22557.17,573778.489,9.14,10.36,1.81,0.0288,0.170526,0.6563,11.84,28.3


In [118]:
# Pairwise Pearson edge list for the selected samples.
_ = analyse_pearson_to_gephi(df)
# Keep only the edges whose significance (Weight = 1 - p) reaches SIGN.
# To additionally require a minimum correlation strength, use instead:
# _ = _[(_['Weight'] >= SIGN) & (np.abs(_['r']) >= COOR)]
_ = _[_['Weight'] >= SIGN]

# Whether each endpoint of an edge is listed in gene_list. Vectorised masks
# also behave correctly when no edge survives the filter above.
source_is_gene = _['Source'].isin(gene_list)
target_is_gene = _['Target'].isin(gene_list)

# Edges where BOTH endpoints are in gene_list. The original condition
# `s['Source'] and s['Target'] in gene_list` parsed as
# `s['Source'] and (s['Target'] in gene_list)`, so Source was never checked.
_.loc[source_is_gene & target_is_gene, :].to_csv(
    EXPORT_PATH + f'{season_name} - 功能基因相关性.csv', index=False, header=True)

# Edges whose Source is in gene_list and whose Target is not.
_.loc[source_is_gene & ~target_is_gene, :].to_csv(
    EXPORT_PATH + f'{season_name} - 功能基因与环境因子相关性.csv', index=False, header=True)