In [1]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# Load and inspect the MZmine feature table with bioassay results
# Change the name in the code below if needed (.CSV file in red)
# NB: Make sure to add the value of bioactivity in the second row.
file_path = "TEMPLATE_FILES/INPUT_FILE/features_quantification_matrix_edited_bioactivity_MZmine2.csv"
in_tab = pd.read_csv(file_path)

# Display the dimensions of the table
print(in_tab.shape)

# Display the first 5 rows of the table
in_tab.head()

(588, 17)


Unnamed: 0,row ID,row m/z,row retention time,Extract.mzXML Peak area,F_5.mzXML Peak area,F_6.mzXML Peak area,F_7.mzXML Peak area,F_8.mzXML Peak area,F_9.mzXML Peak area,F_10.mzXML Peak area,F_11.mzXML Peak area,F_12.mzXML Peak area,F_13.mzXML Peak area,F_14.mzXML Peak area,F_15.mzXML Peak area,F_16.mzXML Peak area,F_17.mzXML Peak area
0,BioactivityCHIKV,,,68.0,1.0,4.0,1.0,3.0,19.0,8.0,16.0,41.0,140.0,17.0,10.5,5.0,57.0
1,1,270.279,1698.0,5690331.0,246596025.8,186949600.0,362958600.0,160968300.0,100851800.0,68548846.0,141101400.0,152605600.0,50568990.0,40253160.0,51162990.0,49859530.0,49792600.0
2,2,271.283,1699.0,569378.1,40396489.5,30710030.0,63895440.0,26317670.0,16091010.0,11523241.45,22836560.0,25351370.0,7873325.0,5872291.0,6356980.0,5143813.0,8061990.0
3,3,279.174,1413.0,3172924.0,0.0,0.0,0.0,0.0,93954.37,0.0,1538466.0,62451930.0,410657.7,41184.01,118659.0,0.0,0.0
4,4,280.264,1420.0,1821883.0,94393467.5,676012.6,97160.14,149605.1,199154.6,0.0,0.0,131789.8,2366321.0,0.0,25537.35,13143600.0,2932929.0


In [5]:
# Drop the first 3 columns and transpose the dataframe
tab = in_tab.iloc[:, 3:].transpose()

# Create a new DataFrame with 'Sample_name' and the transposed data
tab = pd.DataFrame(tab)
tab.insert(0, 'Sample_name', tab.index.str.replace('.mzXML Peak area', ''))

# Set the new column names
bioactivity_name = 'BioactivityCHIKV'
new_colnames = [bioactivity_name] + ['{}_{}'.format(col, row) for col, row in in_tab.iloc[1:, 1:3].values]
tab.columns = ['Sample_name'] + new_colnames

# Reset the index
tab.reset_index(drop=True, inplace=True)

tab

Unnamed: 0,Sample_name,BioactivityCHIKV,270.279_1698.0,271.283_1699.0,279.174_1413.0,280.264_1420.0,281.19_1031.0,282.279_1660.0,297.185_1652.0,297.185_1556.0,...,895.407_1768.0,897.375_1333.0,903.335_1024.0,905.349_1085.0,906.39_1200.0,914.392_1610.0,916.412_1631.0,927.391_1213.0,983.559_1688.0,987.593_1777.0
0,Extract,68.0,5690331.0,569378.1,3172924.0,1821883.0,1786365.0,18655710.0,2756178.0,13434570.0,...,27447940.0,1044450.0,30550620.0,10349740.0,1727922.0,9702100.0,2784934.0,0.0,20688290.0,502689.8
1,F_5,1.0,246596000.0,40396490.0,0.0,94393470.0,0.0,869225800.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,218517.5,0.0
2,F_6,4.0,186949600.0,30710030.0,0.0,676012.6,978061.6,8307734.0,94191750.0,34654520.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1984789000.0,379780300.0
3,F_7,1.0,362958600.0,63895440.0,0.0,97160.14,34595230.0,3306292.0,9480675.0,63097220.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10330600.0,311336.5
4,F_8,3.0,160968300.0,26317670.0,0.0,149605.1,248809.1,2679013.0,220247.6,1264704.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,188105.8,0.0
5,F_9,19.0,100851800.0,16091010.0,93954.37,199154.6,0.0,3557114.0,0.0,161591.2,...,157919100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146742.8,0.0
6,F_10,8.0,68548850.0,11523240.0,0.0,0.0,0.0,951736.7,0.0,31515.84,...,86062930.0,0.0,24721.77,28569.09,0.0,0.0,0.0,0.0,144823.8,0.0
7,F_11,16.0,141101400.0,22836560.0,1538466.0,0.0,0.0,2045580.0,24716.11,1824419.0,...,1558383.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76445.87,0.0
8,F_12,41.0,152605600.0,25351370.0,62451930.0,131789.8,0.0,1364687.0,127001.8,1195893.0,...,996322.8,0.0,0.0,0.0,0.0,0.0,685796.9,0.0,62185.83,0.0
9,F_13,140.0,50568990.0,7873325.0,410657.7,2366321.0,0.0,27919740.0,0.0,0.0,...,83620.68,0.0,0.0,0.0,0.0,10365430.0,4787812.0,24264.46,0.0,0.0


In [6]:
# Check if there are blank rows and remove them
if tab.iloc[:, 1].isna().any():
    tab = tab[~tab.iloc[:, 1].isna()]

In [7]:
# Add 1 to all to help scaling feature intensities and Normalize the features by TIC
tab2 = tab.copy()
tab2.iloc[:, 2:] = tab2.iloc[:, 2:].apply(lambda x: (x + 1) / (x + 1).sum(), axis=1)
tab2

Unnamed: 0,Sample_name,BioactivityCHIKV,270.279_1698.0,271.283_1699.0,279.174_1413.0,280.264_1420.0,281.19_1031.0,282.279_1660.0,297.185_1652.0,297.185_1556.0,...,895.407_1768.0,897.375_1333.0,903.335_1024.0,905.349_1085.0,906.39_1200.0,914.392_1610.0,916.412_1631.0,927.391_1213.0,983.559_1688.0,987.593_1777.0
0,Extract,68.0,0.000139,1.4e-05,7.760803e-05,4.45623e-05,4.369353e-05,0.000456,6.741464e-05,0.0003286022,...,0.0006713617,2.55467e-05,0.0007472515,0.000253149,4.226406e-05,0.0002373081,6.811799e-05,2.445946e-11,0.0005060243,1.229554e-05
1,F_5,1.0,0.159408,0.026114,6.464336e-10,0.06101911,6.464336e-10,0.561897,6.464336e-10,6.464336e-10,...,6.464336e-10,6.464336e-10,6.464336e-10,6.464336e-10,6.464336e-10,6.464336e-10,6.464336e-10,6.464336e-10,0.0001412577,6.464336e-10
2,F_6,4.0,0.02284,0.003752,1.221705e-10,8.258893e-05,0.0001194904,0.001015,0.01150745,0.004233761,...,1.221705e-10,1.221705e-10,1.221705e-10,1.221705e-10,1.221705e-10,1.221705e-10,1.221705e-10,1.221705e-10,0.2424826,0.04639795
3,F_7,1.0,0.313976,0.055273,8.650469e-10,8.404894e-05,0.0299265,0.00286,0.008201229,0.05458206,...,8.650469e-10,8.650469e-10,8.650469e-10,8.650469e-10,8.650469e-10,8.650469e-10,8.650469e-10,8.650469e-10,0.008936452,0.0002693216
4,F_8,3.0,0.055109,0.00901,3.423586e-10,5.121893e-05,8.518229e-05,0.000917,7.5404e-05,0.0004329827,...,3.423586e-10,3.423586e-10,3.423586e-10,3.423586e-10,3.423586e-10,3.423586e-10,3.423586e-10,3.423586e-10,6.439997e-05,3.423586e-10
5,F_9,19.0,0.003704,0.000591,3.450476e-06,7.313916e-06,3.672463e-11,0.000131,3.672463e-11,5.934413e-06,...,0.00579952,3.672463e-11,3.672463e-11,3.672463e-11,3.672463e-11,3.672463e-11,3.672463e-11,3.672463e-11,5.389113e-06,3.672463e-11
6,F_10,8.0,0.0022,0.00037,3.209307e-11,3.209307e-11,3.209307e-11,3.1e-05,3.209307e-11,1.011472e-06,...,0.002762024,3.209307e-11,7.934297e-07,9.169021e-07,3.209307e-11,3.209307e-11,3.209307e-11,3.209307e-11,4.647872e-06,3.209307e-11
7,F_11,16.0,0.005111,0.000827,5.572151e-05,3.621885e-11,3.621885e-11,7.4e-05,8.952253e-07,6.607841e-05,...,5.644287e-05,3.621885e-11,3.621885e-11,3.621885e-11,3.621885e-11,3.621885e-11,3.621885e-11,3.621885e-11,2.768818e-06,3.621885e-11
8,F_12,41.0,0.00455,0.000756,0.001861987,3.929308e-06,2.981473e-11,4.1e-05,3.786554e-06,3.565525e-05,...,2.970513e-05,2.981473e-11,2.981473e-11,2.981473e-11,2.981473e-11,2.981473e-11,2.044688e-05,2.981473e-11,1.854084e-06,2.981473e-11
9,F_13,140.0,0.001584,0.000247,1.286466e-05,7.412951e-05,3.132689e-11,0.000875,3.132689e-11,3.132689e-11,...,2.619607e-06,3.132689e-11,3.132689e-11,3.132689e-11,3.132689e-11,0.0003247168,0.0001499873,7.601614e-07,3.132689e-11,3.132689e-11


In [8]:
# Scale the bioactivity and the first feature column
scaler = StandardScaler()
scaled_bioactivity = scaler.fit_transform(tab2.iloc[:, [1]])
scaled_feature = scaler.fit_transform(tab2.iloc[:, [2]])

# Calculate the correlation coefficient and p-value
correlation, p_value = pearsonr(scaled_bioactivity.flatten(), scaled_feature.flatten())

# Output the results
result = {"estimate": correlation, "p.value": p_value}
print(result)

{'estimate': -0.3243776067257055, 'p.value': 0.2578493567193687}


In [9]:
# Scale the bioactivity column
scaler = StandardScaler()
scaled_bioactivity = scaler.fit_transform(tab2.iloc[:, [1]])

# Calculate the correlation coefficient and p-value for all features
results = []
for col in range(2, tab2.shape[1]):
    scaled_feature = scaler.fit_transform(tab2.iloc[:, [col]])
    correlation, p_value = pearsonr(scaled_bioactivity.flatten(), scaled_feature.flatten())
    results.append([correlation, p_value])

# Convert the results to a DataFrame
ct = pd.DataFrame(results, columns=['estimate', 'p.value'])

# Display the first few rows of the results
print(ct.head())

   estimate   p.value
0 -0.324378  0.257849
1 -0.318375  0.267271
2  0.113450  0.699375
3 -0.200703  0.491453
4 -0.201330  0.490071


In [10]:
# Show the dimensions of tab2 and ct
print("Dimensions of tab2:", tab2.shape)
print("Dimensions of ct:", ct.shape)

Dimensions of tab2: (14, 589)
Dimensions of ct: (587, 2)


### Correct up until this point

In [24]:
# Convert the results to a DataFrame and add header
ct = pd.DataFrame(results, columns=['cor', 'p_value'])
header = pd.DataFrame([['cor', 'p_value'], [0, 0]], columns=['cor', 'p_value'])
ct = pd.concat([header, ct])

# Convert ct to a matrix
ct_matrix = ct.values.T

# Combine ct_matrix and tab2_matrix vertically
tab3_matrix = np.vstack([ct_matrix, tab2.values])

# Convert the combined matrix back to a DataFrame
tab3_df = pd.DataFrame(tab3_matrix)

# Set column headers from tab2 to tab3_df
tab3_df.columns = tab2.columns

# Write the DataFrame to a CSV file
output_file = "features_quantification_matrix_edited_with_correlation_GM.csv"
tab3_df.to_csv(output_file, index=False)

tab3_df

Unnamed: 0,Sample_name,BioactivityCHIKV,270.279_1698.0,271.283_1699.0,279.174_1413.0,280.264_1420.0,281.19_1031.0,282.279_1660.0,297.185_1652.0,297.185_1556.0,...,895.407_1768.0,897.375_1333.0,903.335_1024.0,905.349_1085.0,906.39_1200.0,914.392_1610.0,916.412_1631.0,927.391_1213.0,983.559_1688.0,987.593_1777.0
0,cor,0.0,-0.324378,-0.318375,0.11345,-0.200703,-0.20133,-0.201929,-0.271214,-0.214875,...,-0.09798,0.219606,0.229359,0.242771,-0.197018,-0.007504,0.047311,-0.171019,-0.185422,-0.179208
1,p_value,0.0,0.257849,0.267271,0.699375,0.491453,0.490071,0.488755,0.348284,0.460688,...,0.738959,0.450632,0.430245,0.402988,0.499605,0.979689,0.872404,0.558837,0.525661,0.539867
2,Extract,68.0,0.000139,1.4e-05,7.8e-05,4.5e-05,4.4e-05,0.000456,6.7e-05,0.000329,...,0.000671,2.6e-05,0.000747,0.000253,4.2e-05,0.000237,6.8e-05,0.0,0.000506,1.2e-05
3,F_5,1.0,0.159408,0.026114,0.0,0.061019,0.0,0.561897,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000141,0.0
4,F_6,4.0,0.02284,0.003752,0.0,8.3e-05,0.000119,0.001015,0.011507,0.004234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242483,0.046398
5,F_7,1.0,0.313976,0.055273,0.0,8.4e-05,0.029927,0.00286,0.008201,0.054582,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008936,0.000269
6,F_8,3.0,0.055109,0.00901,0.0,5.1e-05,8.5e-05,0.000917,7.5e-05,0.000433,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4e-05,0.0
7,F_9,19.0,0.003704,0.000591,3e-06,7e-06,0.0,0.000131,0.0,6e-06,...,0.0058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-06,0.0
8,F_10,8.0,0.0022,0.00037,0.0,0.0,0.0,3.1e-05,0.0,1e-06,...,0.002762,0.0,1e-06,1e-06,0.0,0.0,0.0,0.0,5e-06,0.0
9,F_11,16.0,0.005111,0.000827,5.6e-05,0.0,0.0,7.4e-05,1e-06,6.6e-05,...,5.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3e-06,0.0


May need to remove row names in .csv file above ^^^

In [34]:
# Transpose the DataFrame for molecular networking mapping
new = tab3_df.T

# Set column names from the first row
new.columns = new.iloc[0]

# Remove the first row (header row)
new = new[1:]

# Add row indices and sample names to the new DataFrame
new.insert(0, 'IDs', new.index)
new.insert(0, 'shared name', range(len(new)))

# Adjust the first cell in the 'shared name' column
new.iat[0, 0] = ''

# Reset the index to remove the old index
new = new.reset_index(drop=True)

# Rename the first two columns
new.columns.values[0:2] = ['shared name', 'IDs']

# Write the transposed DataFrame to a CSV file
output_file_transposed = "features_quantification_matrix_transposed_with_correlation_GM2.csv"
new.to_csv(output_file_transposed, index=False)

new

  new.iat[0, 0] = ''


Sample_name,shared name,IDs,cor,p_value,Extract,F_5,F_6,F_7,F_8,F_9,F_10,F_11,F_12,F_13,F_14,F_15,F_16,F_17
0,,BioactivityCHIKV,0,0,68.0,1.0,4.0,1.0,3.0,19.0,8.0,16.0,41.0,140.0,17.0,10.5,5.0,57.0
1,1,270.279_1698.0,-0.324378,0.257849,0.000139,0.159408,0.02284,0.313976,0.055109,0.003704,0.0022,0.005111,0.00455,0.001584,0.001186,0.001303,0.001188,0.000919
2,2,271.283_1699.0,-0.318375,0.267271,0.000014,0.026114,0.003752,0.055273,0.00901,0.000591,0.00037,0.000827,0.000756,0.000247,0.000173,0.000162,0.000123,0.000149
3,3,279.174_1413.0,0.11345,0.699375,0.000078,0.0,0.0,0.0,0.0,0.000003,0.0,0.000056,0.001862,0.000013,0.000001,0.000003,0.0,0.0
4,4,280.264_1420.0,-0.200703,0.491453,0.000045,0.061019,0.000083,0.000084,0.000051,0.000007,0.0,0.0,0.000004,0.000074,0.0,0.000001,0.000313,0.000054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,583,914.392_1610.0,-0.007504,0.979689,0.000237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000325,0.004268,0.000147,0.000062,0.00001
584,584,916.412_1631.0,0.047311,0.872404,0.000068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00002,0.00015,0.00111,0.000034,0.000007,0.0
585,585,927.391_1213.0,-0.171019,0.558837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000001,0.0,0.000008,0.001044,0.000001
586,586,983.559_1688.0,-0.185422,0.525661,0.000506,0.000141,0.242483,0.008936,0.000064,0.000005,0.000005,0.000003,0.000002,0.0,0.0,0.000001,0.000002,0.0


In [38]:
new.columns

Index(['shared name', 'IDs', 'cor', 'p_value', 'Extract', 'F_5', 'F_6', 'F_7',
       'F_8', 'F_9', 'F_10', 'F_11', 'F_12', 'F_13', 'F_14', 'F_15', 'F_16',
       'F_17'],
      dtype='object', name='Sample_name')

In [40]:
# Assuming `ct` is defined and contains the correlation test results

# Convert the 'p.value' column to numeric and exclude the first two rows
p_values = pd.to_numeric(ct['p_value'][2:])

# Get the indices where p-value < 0.05
significant_indices = p_values[p_values < 0.05].index

# Convert to 1-based index as in R (Python uses 0-based index)
significant_indices += 1

print(significant_indices)
print(len(significant_indices))

Index([ 17,  27,  29,  30,  94,  99, 100, 108, 115, 120, 137, 138, 157, 158,
       162, 171, 183, 201, 202, 207, 210, 239, 241, 255, 285, 292, 299, 306,
       323, 329, 331, 346, 357, 360, 366, 367, 370, 371, 390, 416, 431, 433,
       440, 446, 461, 462, 467, 469, 476, 482, 487, 496, 504, 512, 513, 518,
       524, 531, 543, 544, 545, 556],
      dtype='int64')
62


Back on track, just dont forget previous comment about column names in .csv

In [41]:
# Get the column names from tab (assuming tab is a DataFrame)
nm = tab.columns

# Convert the 'p.value' column to numeric and exclude the first two rows
p_values = pd.to_numeric(ct['p_value'][2:])

# Get the indices where p-value < 0.05
significant_indices = p_values[p_values < 0.05].index

# Filter column names based on significant indices (excluding the first two columns)
significant_feature_ids = nm[2:][significant_indices]

print(significant_feature_ids)

Index(['383.222_1524.0', '401.232_1290.0', '401.232_1619.0', '401.232_1527.0',
       '563.296_1291.0', '571.255_559.0', '571.301_1719.0', '577.265_998.0',
       '589.311_1520.0', '591.326_1626.0', '613.266_824.0', '613.266_874.0',
       '641.294_836.0', '641.294_742.0', '647.307_997.0', '661.287_945.0',
       '669.289_697.0', '681.29_558.0', '681.29_742.0', '683.303_823.0',
       '684.31_823.0', '701.314_827.0', '703.29_730.0', '709.284_697.0',
       '721.303_945.0', '723.299_822.0', '725.339_774.0', '730.34_898.0',
       '737.31_963.0', '739.319_785.0', '739.319_593.0', '747.321_773.0',
       '753.332_773.0', '757.285_870.0', '758.373_1047.0', '758.373_1257.0',
       '761.301_805.0', '761.301_595.0', '767.345_870.0', '780.353_1045.0',
       '789.332_870.0', '789.332_1014.0', '792.357_1201.0', '793.361_1211.0',
       '802.398_1055.0', '803.307_939.0', '806.335_1158.0', '806.335_1061.0',
       '814.337_1192.0', '817.321_1105.0', '820.349_1210.0', '824.377_1038.0',
       '82

correct, but may want to remove the'.0' at the end of the names

In [43]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.2-cp310-cp310-macosx_10_9_x86_64.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting patsy>=0.5.6
  Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.6 statsmodels-0.14.2


In [44]:
from statsmodels.stats.multitest import multipletests

# Assuming `ct` is defined and contains the correlation test results

# Convert the 'p.value' column to numeric and exclude the first two rows
p_values = pd.to_numeric(ct['p_value'][2:])

# Apply Bonferroni correction
_, corrected_p_values, _, _ = multipletests(p_values, method='bonferroni')

# Get the indices where corrected p-value < 0.05
significant_indices_bonferroni = np.where(corrected_p_values < 0.05)[0]

# Convert to 1-based index as in R (Python uses 0-based index)
significant_indices_bonferroni += 1

print(significant_indices_bonferroni)

[ 27  30  94  99 108 115 120 137 138 157 162 171 183 201 207 210 239 241
 255 285 299 331 346 357 360 366 367 371 390 416 431 433 446 461 462 469
 487 496 504 513 518 524 543 544 545 556]


In [45]:
# Get the column names from tab (assuming tab is a DataFrame)
nm = tab.columns

# Convert the 'p.value' column to numeric and exclude the first two rows
p_values = pd.to_numeric(ct['p_value'][2:])

# Apply Bonferroni correction
_, corrected_p_values, _, _ = multipletests(p_values, method='bonferroni')

# Get the indices where corrected p-value < 0.05
significant_indices_bonferroni = np.where(corrected_p_values < 0.05)[0]

# Filter column names based on significant indices (excluding the first two columns)
significant_feature_ids = nm[2:][significant_indices_bonferroni]

print(significant_feature_ids)

Index(['401.232_1290.0', '401.232_1527.0', '563.296_1291.0', '571.255_559.0',
       '577.265_998.0', '589.311_1520.0', '591.326_1626.0', '613.266_824.0',
       '613.266_874.0', '641.294_836.0', '647.307_997.0', '661.287_945.0',
       '669.289_697.0', '681.29_558.0', '683.303_823.0', '684.31_823.0',
       '701.314_827.0', '703.29_730.0', '709.284_697.0', '721.303_945.0',
       '725.339_774.0', '739.319_593.0', '747.321_773.0', '753.332_773.0',
       '757.285_870.0', '758.373_1047.0', '758.373_1257.0', '761.301_595.0',
       '767.345_870.0', '780.353_1045.0', '789.332_870.0', '789.332_1014.0',
       '793.361_1211.0', '802.398_1055.0', '803.307_939.0', '806.335_1061.0',
       '820.349_1210.0', '824.377_1038.0', '828.318_1067.0', '834.368_1463.0',
       '836.384_1150.0', '840.319_1120.0', '856.348_1463.0', '856.348_1245.0',
       '858.369_1147.0', '866.393_1330.0'],
      dtype='object')


In [47]:
# Convert the 'p.value' column to numeric and exclude the first two rows
p_values = pd.to_numeric(ct['p_value'][2:])

# Apply Bonferroni correction
_, corrected_p_values, _, _ = multipletests(p_values, method='bonferroni')

# Prepare the new table by combining necessary columns and adding the corrected p-values
# Select the first 5 columns and all columns after the first 5 from the transposed DataFrame `new`
new_table = pd.concat([new.iloc[:, :5], pd.Series([0] + list(corrected_p_values), name='p_value_corrected'), new.iloc[:, 5:]], axis=1)

# Rename the 6th column
new_table.columns.values[5] = 'p_value_corrected'

# Display the first 10 columns of the first few rows
new_table.iloc[:, :10].head()

Unnamed: 0,shared name,IDs,cor,p_value,Extract,p_value_corrected,F_5,F_6,F_7,F_8
0,,BioactivityCHIKV,0.0,0.0,68.0,0.0,1.0,4.0,1.0,3.0
1,1.0,270.279_1698.0,-0.324378,0.257849,0.000139,1.0,0.159408,0.02284,0.313976,0.055109
2,2.0,271.283_1699.0,-0.318375,0.267271,1.4e-05,1.0,0.026114,0.003752,0.055273,0.00901
3,3.0,279.174_1413.0,0.11345,0.699375,7.8e-05,1.0,0.0,0.0,0.0,0.0
4,4.0,280.264_1420.0,-0.200703,0.491453,4.5e-05,1.0,0.061019,8.3e-05,8.4e-05,5.1e-05


In [48]:
# Write the modified DataFrame to a CSV file if needed
output_file_modified = "features_quantification_matrix_modified_with_bonferroni_GM.csv"
new_table.to_csv(output_file_modified, index=False)