In [None]:
import pandas as pd
import numpy as np

In [None]:
def match_and_merge_tsvs(tsv1_path, tsv2_path, output_path='matched_tsv.tsv'):
    """Annotate a cluster report with sample metadata and write it out as TSV.

    The first table must contain a 'SampleName' column and the numeric columns
    'Cluster1' .. 'Cluster4'. The second table must contain the columns
    'samples' (a ';'-separated list of sample identifiers), 'period', 'code'
    and 'sampleSource'.

    For every identifier listed in the second table, each row of the first
    table whose 'SampleName' contains that identifier as a literal substring
    receives that metadata row's 'period', 'code' and 'sampleSource'. When
    several identifiers match the same row, the last one processed wins.

    Added columns:
        period, sampleSource, code -- copied from the matching metadata row
            (left as None when nothing matches).
        UDG treated -- 'yes' when 'SampleName' contains the literal text
            'SG1.1', otherwise 'no'.
        PrimaryCluster -- name of the cluster column holding the row maximum.

    Parameters
    ----------
    tsv1_path : path or file-like
        Tab-separated cluster report, passed to ``pd.read_csv``.
    tsv2_path : path or file-like
        Tab-separated sample metadata, passed to ``pd.read_csv``.
    output_path : path or file-like, default 'matched_tsv.tsv'
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        The annotated table, sorted by 'PrimaryCluster', 'period', 'code'.

    Raises
    ------
    KeyError
        If one of the required columns is missing from either table.
    """
    # Read the TSV files into pandas DataFrames
    tsv1 = pd.read_csv(tsv1_path, sep='\t')
    tsv2 = pd.read_csv(tsv2_path, sep='\t')

    # Placeholder metadata columns; rows that never match keep None
    tsv1['period'] = None
    tsv1['sampleSource'] = None
    tsv1['code'] = None

    sample_names = tsv1['SampleName']

    # Literal (non-regex) match so the '.' in 'SG1.1' is not a wildcard.
    # na=False: a missing sample name counts as "not treated" instead of
    # producing NaN, which np.where would otherwise treat as truthy.
    tsv1['UDG treated'] = np.where(
        sample_names.str.contains('SG1.1', regex=False, na=False), 'yes', 'no'
    )

    # Walk the metadata rows and propagate their values to matching samples
    for _, row in tsv2.iterrows():
        raw_samples = row['samples']
        if pd.isna(raw_samples):
            # No sample identifiers listed for this metadata row
            continue

        period = row['period']
        code = row['code']
        sample_source = row['sampleSource']

        for sample in str(raw_samples).split(';'):
            sample = sample.strip()
            if not sample:
                # An empty token (trailing ';' or ';;') is a substring of
                # every name and would overwrite the metadata of all rows.
                continue

            # Rows of tsv1 whose 'SampleName' contains this identifier;
            # na=False keeps the mask strictly boolean for .loc
            matches = sample_names.str.contains(sample, regex=False, na=False)
            tsv1.loc[matches, 'period'] = period
            tsv1.loc[matches, 'code'] = code
            tsv1.loc[matches, 'sampleSource'] = sample_source

    # Primary cluster = name of the cluster column with the largest value
    cluster_columns = ['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4']
    tsv1['PrimaryCluster'] = tsv1[cluster_columns].idxmax(axis=1)

    # Sort by primary cluster, then by period and code within each cluster
    tsv1 = tsv1.sort_values(by=['PrimaryCluster', 'period', 'code'])

    # Write the updated table to a new TSV file
    tsv1.to_csv(output_path, sep='\t', index=False)
    print(f"Matched TSV saved to: {output_path}")

    return tsv1

# Example usage
# match_and_merge_tsvs('path_to_tsv1.tsv', 'path_to_tsv2.tsv', 'output_tsv.tsv')


In [None]:
# Run the merge on the k=4 cluster report and the Yates sample table;
# the annotated result is written to 'output_tsv.tsv' and shown as cell output.
match_and_merge_tsvs(
    tsv1_path='/GMM/k4/cluster_report_k4.tsv',
    tsv2_path='/figs/02empiricalGenome/yates_samples.tsv',
    output_path='output_tsv.tsv',
)