In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from adjustText import adjust_text

# Read the three input CSVs (paths are relative to the notebook's working directory).
# The generator is consumed left to right, so the files are read in the listed order.
_30C_Culture_Tubes, Biolector, name_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '241212_30C.csv',
        '241212_Biolector_PCK_pooledcontrols.csv',
        'proteomics_id_translator_240305.csv',
    )
)

In [None]:
# Step 1: Translate Protein.Group names to locus
# A label is translated only when it occurs, as a literal substring, in exactly one
# entry of name_df['original']; with zero or several matches the label is kept as-is
# and counted as not translated.
# NOTE(review): substring matching means a label that is a prefix of another ID matches
# both and is therefore left untranslated - confirm this is intended.
translator_dict = {}
nontranslated = 0
original_names = name_df['original']
for orig_label in _30C_Culture_Tubes['Protein.Group']:
    if isinstance(orig_label, str):
        # na=False makes rows with a missing 'original' count as "no match". Without it
        # the mask holds NaN, the match count becomes NaN, and since NaN != 1 every
        # protein would silently be left untranslated.
        matching_idx = original_names.str.contains(orig_label, regex=False, na=False)
        n_matches = int(matching_idx.sum())
    else:
        # Missing / non-text labels cannot be used as a search pattern.
        matching_idx = None
        n_matches = 0

    if n_matches == 1:
        translator_dict[orig_label] = name_df.loc[matching_idx, 'locus'].values[0]
    else:
        translator_dict[orig_label] = orig_label
        nontranslated += 1
print(f"In total, N = {nontranslated}/{len(_30C_Culture_Tubes['Protein.Group'])} proteins were not translated to locus names")


In [None]:
# Apply translation
# Series.map(dict) turns every label that is not a key of the dict into NaN.
# translator_dict is keyed only by the labels found in _30C_Culture_Tubes, so mapping
# with the bare dict would erase the label of any Biolector-only protein, and running
# this cell a second time would erase labels that were already translated.
# Falling back to the label itself keeps unknown labels unchanged in both cases.
def _translate_label(label):
    """Return the locus name for `label`, or `label` unchanged when it has no entry."""
    return translator_dict.get(label, label)


_30C_Culture_Tubes['Protein.Group'] = _30C_Culture_Tubes['Protein.Group'].map(_translate_label)
Biolector['Protein.Group'] = Biolector['Protein.Group'].map(_translate_label)

In [None]:
# Keep a protein when any abundance column exceeds 0.001 in at least one of the two tables.
# The abundance columns are picked from the column names of _30C_Culture_Tubes and the
# same names are then looked up in Biolector.
abundance_cols = [
    column_name
    for column_name in _30C_Culture_Tubes.columns
    if '%_of protein_abundance_Top3-method' in column_name
]

proteins_to_keep = set()
for table in (_30C_Culture_Tubes, Biolector):
    # Row-wise: True when at least one abundance column is above the cutoff
    above_cutoff = (table[abundance_cols] > 0.001).any(axis=1)
    proteins_to_keep |= set(table.loc[above_cutoff, 'Protein.Group'].tolist())

# Restrict both tables to the retained proteins
_30C_Culture_Tubes_filtered = _30C_Culture_Tubes.loc[
    _30C_Culture_Tubes['Protein.Group'].isin(proteins_to_keep)
]
Biolector_filtered = Biolector.loc[
    Biolector['Protein.Group'].isin(proteins_to_keep)
]

# Settings for the per-sample comparison and the volcano plots
ABUNDANCE_SUFFIX = '%_of protein_abundance_Top3-method'
LOG2_FC_THRESHOLD = 2      # |log2 fold change| a point must exceed to be labelled
P_VALUE_THRESHOLD = 0.05   # p-value a point must be below to be labelled


def _first_row_per_protein(table):
    """Return `table` indexed by 'Protein.Group' with exactly one row per protein.

    Rows without a protein label are dropped; when a protein occurs on several
    rows, only the first one is kept.
    """
    return (
        table
        .dropna(subset=['Protein.Group'])
        .drop_duplicates(subset='Protein.Group', keep='first')
        .set_index('Protein.Group')
    )


def _save_volcano_plot(sample_df, sample_name):
    """Draw the volcano plot of one sample and save it as 'volcano_plot_<sample_name>.png'.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Needs the columns 'Protein.Group', 'log2_fold_change' and '-log10_p_value'.
    sample_name : str
        Used in the plot title and in the output file name.
    """
    # Points with a NaN or infinite coordinate cannot be drawn, and an infinite
    # fold change would otherwise pass the |log2 FC| test and be handed to
    # adjust_text as a label position.
    plottable = sample_df[
        np.isfinite(sample_df['log2_fold_change'])
        & np.isfinite(sample_df['-log10_p_value'])
    ]

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.scatter(plottable['log2_fold_change'], plottable['-log10_p_value'], alpha=0.5)

        # Threshold guide lines
        ax.axvline(x=LOG2_FC_THRESHOLD, color='r', linestyle='--', alpha=0.3)
        ax.axvline(x=-LOG2_FC_THRESHOLD, color='r', linestyle='--', alpha=0.3)
        ax.axhline(y=-np.log10(P_VALUE_THRESHOLD), color='r', linestyle='--', alpha=0.3)

        # Label the points that pass both thresholds
        significant = plottable[
            (plottable['log2_fold_change'].abs() > LOG2_FC_THRESHOLD)
            & (plottable['-log10_p_value'] > -np.log10(P_VALUE_THRESHOLD))
        ]
        texts = [
            ax.text(x_pos, y_pos, label, fontsize=8)
            for x_pos, y_pos, label in zip(
                significant['log2_fold_change'],
                significant['-log10_p_value'],
                significant['Protein.Group'],
            )
        ]
        if texts:
            adjust_text(texts, ax=ax)

        ax.set_xlabel('Log2 Fold Change')
        ax.set_ylabel('-Log10 P-value')
        ax.set_title(f'Volcano Plot - {sample_name}')

        fig.savefig(f'volcano_plot_{sample_name}.png', dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)


# Index both tables by protein once, instead of scanning them for every protein
tubes_by_protein = _first_row_per_protein(_30C_Culture_Tubes_filtered)
biolector_by_protein = _first_row_per_protein(Biolector_filtered)

# Only proteins present in both tables can be compared. Sorting gives the result
# rows a stable order; iterating the set directly yields an order that can change
# from one kernel session to the next. key=str tolerates non-string labels.
shared_proteins = sorted(
    (
        protein
        for protein in proteins_to_keep
        if protein in tubes_by_protein.index and protein in biolector_by_protein.index
    ),
    key=str,
)

# Columns of the final results table, collected in order and assembled once at the end
result_columns = {}
if abundance_cols:
    result_columns['Protein.Group'] = shared_proteins

# Process each sample
for sample_col in abundance_cols:
    sample_name = sample_col.replace(ABUNDANCE_SUFFIX, '')
    print(f"Processing sample: {sample_name}")

    tube_abundance = tubes_by_protein.loc[shared_proteins, sample_col].to_numpy(dtype=float)
    biolector_abundance = biolector_by_protein.loc[shared_proteins, sample_col].to_numpy(dtype=float)

    # A zero abundance yields +/-inf (or NaN for 0/0). Those values are kept in the
    # results table and only left out of the plot, so the warnings are silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        log2_fc = np.log2(biolector_abundance / tube_abundance)

        # TODO: placeholder. Each comparison uses a single value per table, so no
        # p-value can be derived here; adjust based on the replicate structure.
        # Until then every '-log10_p_value' is NaN, the scatter has no points and
        # nothing is labelled.
        p_values = np.full(len(shared_proteins), np.nan)
        neg_log10_p = -np.log10(p_values)

    # Explicit columns keep the frame well-formed even when no protein is shared
    sample_df = pd.DataFrame({
        'Protein.Group': shared_proteins,
        'log2_fold_change': log2_fc,
        'p_value': p_values,
        '-log10_p_value': neg_log10_p,
    })

    _save_volcano_plot(sample_df, sample_name)

    result_columns[f'{sample_name}_log2_fc'] = log2_fc
    result_columns[f'{sample_name}_p_value'] = p_values

results_df = pd.DataFrame(result_columns)

# Save results
results_df.to_csv('protein_comparison_results.csv', index=False)
print("Analysis complete. Results saved to 'protein_comparison_results.csv'")

Found 1 samples to analyze
Processing sample: %_of protein_abundance_Top3-method
