In [1]:
import pandas as pd
import re
from google.colab import files
from IPython.display import FileLink

# Step 1: Read the tab-separated input table into a DataFrame
# (change `sep` if the file uses a different delimiter)
df = pd.read_csv('/content/clinvar_pathogenic_nphs1.txt', sep='\t')

# Step 2: Amino acid dictionary (3-letter to 1-letter) for the 20 standard residues
aa_dict = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
    'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
    'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
    'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'
}

# Step 3: Convert 'p.Ala345Pro' or 'p.(Ala345Pro)' → 'A345P'
def convert_col1_to_1letter(entry):
    """Convert a 3-letter protein substitution into 1-letter notation.

    Args:
        entry: Any value; it is stringified and searched for a pattern of the
            form ``p.Ala345Pro`` or ``p.(Ala345Pro)`` anywhere in the text.

    Returns:
        A string such as ``'A345P'``, or ``None`` when no simple substitution
        between two residues listed in ``aa_dict`` is found.
    """
    entry = str(entry).strip()
    # The (?![A-Za-z]) lookahead requires the second residue code to end the
    # token. Without it 'p.Leu123ProfsTer5' would be read as the substitution
    # Leu123Pro because only the first three letters after the position are
    # consumed.
    match = re.search(
        r"p\.?\(?([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})(?![A-Za-z])\)?", entry
    )
    if match is None:
        return None

    from_aa, pos, to_aa = match.groups()
    from_code = aa_dict.get(from_aa)
    to_code = aa_dict.get(to_aa)
    # Codes missing from aa_dict (e.g. 'Ter') are reported as a failed
    # conversion rather than as a '?' placeholder: a value like 'R1160?' does
    # not match the position pattern applied to this column afterwards.
    if from_code is None or to_code is None:
        return None
    return f"{from_code}{pos}{to_code}"

# Step 4: Apply conversion to the first column of the table
df['AA_1letter_from_col1'] = df.iloc[:, 0].apply(convert_col1_to_1letter)

# Step 5: Drop rows where conversion failed.
# .copy() gives an independent frame; assigning a new column to the direct
# result of dropna() can trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['AA_1letter_from_col1']).copy()

# Step 6: Extract amino acid position as integer.
# Converted values that do not fit the letter-digits-letter pattern (for example
# 'R1160?' when a residue code was not recognised) produce NaN here. They are
# coerced and dropped so that astype(int) cannot fail on NaN.
df['AA_position'] = pd.to_numeric(
    df['AA_1letter_from_col1'].str.extract(r'[A-Z](\d+)[A-Z]')[0],
    errors='coerce',
)
df = df.dropna(subset=['AA_position']).copy()
df['AA_position'] = df['AA_position'].astype(int)

# Step 7: Save intermediate file
# NOTE(review): the benign cell further down writes to this same path, so this
# file is overwritten when that cell runs — confirm that is acceptable.
converted_filename = "/content/converted_from_column1.csv"
df.to_csv(converted_filename, index=False)
print("Download your file with the new column:")
display(FileLink(converted_filename))

# Step 8: Keep only rows in which every required field is present
required_cols = ['VariationID', 'Gene(s)', 'Protein change', 'Germline classification']
df = df.dropna(subset=required_cols)

# Step 9: Discard rows whose VariationID occurs more than once.
# NOTE(review): keep=False removes every copy of a repeated ID, including the
# first occurrence — confirm this is intended rather than keeping one row per ID.
df = df.loc[~df['VariationID'].duplicated(keep=False)].copy()

# Step 10: Order rows by gene, then by amino acid position within each gene
# (pass ['AA_position'] alone to skip the gene grouping)
sorted_data = df.sort_values(by=['Gene(s)', 'AA_position'])

# Step 11: Write the sorted table to disk
sorted_filename = '/content/sorted_clinvar_data.csv'
sorted_data.to_csv(sorted_filename, index=False)

# Step 12: Report how many rows fall in each germline classification
classification_counts = sorted_data['Germline classification'].value_counts()
print(
    "\n✅ File processed successfully!\n",
    "Classification counts:\n",
    classification_counts,
    sep="\n",
)

# Step 13: Offer the final file for download
print("\nDownload sorted final file:")
display(FileLink(sorted_filename))

Download your file with the new column:



✅ File processed successfully!

Classification counts:

Germline classification
Likely pathogenic               51
Pathogenic                      19
Pathogenic/Likely pathogenic    13
Name: count, dtype: int64

Download sorted final file:


In [2]:
import pandas as pd
import re
from google.colab import files
from IPython.display import FileLink

# Step 1: Read the tab-separated input table into a DataFrame
# (change `sep` if the file uses a different delimiter)
df = pd.read_csv('/content/clinvar_benign_nphs1.txt', sep='\t')

# Step 2: Amino acid dictionary (3-letter to 1-letter) for the 20 standard residues
aa_dict = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
    'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
    'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
    'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'
}

# Step 3: Convert 'p.Ala345Pro' or 'p.(Ala345Pro)' → 'A345P'
def convert_col1_to_1letter(entry):
    """Convert a 3-letter protein substitution into 1-letter notation.

    Args:
        entry: Any value; it is stringified and searched for a pattern of the
            form ``p.Ala345Pro`` or ``p.(Ala345Pro)`` anywhere in the text.

    Returns:
        A string such as ``'A345P'``, or ``None`` when no simple substitution
        between two residues listed in ``aa_dict`` is found.
    """
    entry = str(entry).strip()
    # The (?![A-Za-z]) lookahead requires the second residue code to end the
    # token. Without it 'p.Leu123ProfsTer5' would be read as the substitution
    # Leu123Pro because only the first three letters after the position are
    # consumed.
    match = re.search(
        r"p\.?\(?([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})(?![A-Za-z])\)?", entry
    )
    if match is None:
        return None

    from_aa, pos, to_aa = match.groups()
    from_code = aa_dict.get(from_aa)
    to_code = aa_dict.get(to_aa)
    # Codes missing from aa_dict (e.g. 'Ter') are reported as a failed
    # conversion rather than as a '?' placeholder: a value like 'R1160?' does
    # not match the position pattern applied to this column afterwards.
    if from_code is None or to_code is None:
        return None
    return f"{from_code}{pos}{to_code}"

# Step 4: Apply conversion to the first column of the table
df['AA_1letter_from_col1'] = df.iloc[:, 0].apply(convert_col1_to_1letter)

# Step 5: Drop rows where conversion failed.
# .copy() gives an independent frame; assigning a new column to the direct
# result of dropna() can trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['AA_1letter_from_col1']).copy()

# Step 6: Extract amino acid position as integer.
# Converted values that do not fit the letter-digits-letter pattern (for example
# 'R1160?' when a residue code was not recognised) produce NaN here. They are
# coerced and dropped so that astype(int) cannot fail on NaN.
df['AA_position'] = pd.to_numeric(
    df['AA_1letter_from_col1'].str.extract(r'[A-Z](\d+)[A-Z]')[0],
    errors='coerce',
)
df = df.dropna(subset=['AA_position']).copy()
df['AA_position'] = df['AA_position'].astype(int)

# Step 7: Save intermediate file
# NOTE(review): this is the same path the pathogenic cell writes to, so running
# this cell overwrites that cell's intermediate file — confirm that is acceptable
# or give this output its own filename.
converted_filename = "/content/converted_from_column1.csv"
df.to_csv(converted_filename, index=False)
print("Download your file with the new column:")
display(FileLink(converted_filename))

# Step 8: Keep only rows in which every required field is present
required_cols = ['VariationID', 'Gene(s)', 'Protein change', 'Germline classification']
df = df.dropna(subset=required_cols)

# Step 9: Discard rows whose VariationID occurs more than once.
# NOTE(review): keep=False removes every copy of a repeated ID, including the
# first occurrence — confirm this is intended rather than keeping one row per ID.
df = df.loc[~df['VariationID'].duplicated(keep=False)].copy()

# Step 10: Order rows by gene, then by amino acid position within each gene
# (pass ['AA_position'] alone to skip the gene grouping)
sorted_data = df.sort_values(by=['Gene(s)', 'AA_position'])

# Step 11: Write the sorted table to disk
sorted_filename = '/content/benign_clinvar_data.csv'
sorted_data.to_csv(sorted_filename, index=False)

# Step 12: Report how many rows fall in each germline classification
classification_counts = sorted_data['Germline classification'].value_counts()
print(
    "\n✅ File processed successfully!\n",
    "Classification counts:\n",
    classification_counts,
    sep="\n",
)

# Step 13: Offer the final file for download
print("\nDownload sorted final file:")
display(FileLink(sorted_filename))

Download your file with the new column:



✅ File processed successfully!

Classification counts:

Germline classification
Likely benign           31
Benign/Likely benign     6
Benign                   4
Name: count, dtype: int64

Download sorted final file:
