# Analysis of clinical variatns of ORC1 protein

## ORC1 regions

In [24]:
LLPS = [(360, 382), (412, 476), (577, 636), (677, 729), (757, 816)] # Chinees paper
IDR_total = [(183, 476)] # Chinees paper
G4_RNA = [(413, 511)] # Hoshina paper


In [25]:
def generate_list(x):
    '''Generate a list of all numbers in the given intervals'''
    generated_list = []
    for start, end in x:
        generated_list.extend(range(start, end + 1))  # `end + 1` to include the endpoint
    return generated_list

In [32]:
G4_RNA_positions = generate_list(G4_RNA)
LLPS_positions = generate_list(LLPS)

## Read data

In [26]:
import pandas as pd
import re
# Set option to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('clinvar_result.csv', sep = "\t")
df = df[df['Gene(s)'] == 'ORC1'] # save only ORC1 gene - delete 7 records
df.shape

(333, 25)

In [27]:
df.head()

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24
0,NM_001190818.1(ORC1):c.2392-249_*3804del,ORC1,,Meier-Gorlin syndrome 1,VCV001173067,1,52835048 - 52839295,1,52369376 - 52373623,1173067,1162237,,,Deletion,,Pathogenic,,"criteria provided, single submitter",,,,,,,
1,NM_004153.4(ORC1):c.*231C>T,ORC1,,Meier-Gorlin syndrome 1|not provided,VCV000297574,1,52838622,1,52372950,297574,282556,rs3087471,NC_000001.11:52372949:G:A,single nucleotide variant,3 prime UTR variant,Benign,"Jun 20, 2021","criteria provided, multiple submitters, no con...",,,,,,,
2,NM_004153.4(ORC1):c.*226G>C,ORC1,,Meier-Gorlin syndrome 1,VCV000297575,1,52838627,1,52372955,297575,282836,rs886046395,NC_000001.11:52372954:C:G,single nucleotide variant,3 prime UTR variant,Uncertain significance,"Jan 13, 2018","criteria provided, single submitter",,,,,,,
3,NM_004153.4(ORC1):c.*94C>T,ORC1,,Meier-Gorlin syndrome 1,VCV000297576,1,52838759,1,52373087,297576,281284,rs574908976,NC_000001.11:52373086:G:A,single nucleotide variant,3 prime UTR variant,Uncertain significance,"Jan 13, 2018","criteria provided, single submitter",,,,,,,
4,NM_004153.4(ORC1):c.2580C>T (p.Asp860=),ORC1,,ORC1-related disorder|Meier-Gorlin syndrome 1|...,VCV000703217,1,52838859,1,52373187,703217,690638,rs61756139,NC_000001.11:52373186:G:A,single nucleotide variant,synonymous variant,Conflicting classifications of pathogenicity,"Dec 13, 2023","criteria provided, conflicting classifications",,,,,,,


In [22]:
# Subset of data: only mutations in protein
df_proteins = df[df['Protein change'].apply(lambda x: isinstance(x, str))]

## Look at the different columns of df

In [None]:
df["Condition(s)"].value_counts()

In [None]:
df["Germline classification"].value_counts()

In [None]:
df['Germline review status'].value_counts()

## Get list of transcript IDs

In [None]:
IDs = df['Name'].apply(lambda x: x.split(":")[0])
IDs.value_counts()

## Rename Condition

In [None]:
df['Condition(s)'].unique()

In [28]:
def rename_condition(x):
    if x == 'Meier-Gorlin syndrome 1':
        return x
    elif 'Meier-Gorlin syndrome 1' in x:
        return "Meier-Gorlin syndrome 1 probaly"
    elif x == 'ORC1-related disorder':
        return x
    elif 'ORC1-related disorder' in x:
        return 'ORC1-related disorder probably'
    elif x == "Inborn genetic diseases":
        return x
    elif 'Inborn genetic diseases' in x:
        return 'Inborn genetic diseases probably'
    elif x == "not provided":
        return "not provided"
    elif "not provided" in x:
        return "not provided"
    else:
        return x

In [29]:
df['Condition_new'] = df['Condition(s)'].map(rename_condition, na_action='ignore')

In [30]:
df['Condition_new'].value_counts()

Condition_new
not provided                        206
Meier-Gorlin syndrome 1 probaly      42
Meier-Gorlin syndrome 1              30
Inborn genetic diseases              30
Inborn genetic diseases probably     12
ORC1-related disorder probably        6
not specified                         5
Name: count, dtype: int64

## Groupby

In [None]:
df.groupby("Germline review status")["Condition_new"].value_counts()

## Subsets

In [34]:
def mutation_analysis(sb, li):
    '''
    Input - subset of dataframe, list of position within interval of interest
    '''
    sb.dropna(inplace=True)
    sb = list(sb)
    interval_mutations = []
    no_interval_mutations = []
    for mutation in sb:
        positions = list(map(int, re.findall(r'\d+', mutation)))
        for position in positions:
            if position in li:
                interval_mutations.append(mutation)
            else:
                no_interval_mutations.append(mutation)
    print(f"Number of mutations within interaval of interest is {len(interval_mutations)}")
    print(f"Number of mutations outside interaval of interest is {len(no_interval_mutations)}")
    return(interval_mutations, no_interval_mutations)

In [29]:
G4_RNA_positions

In [87]:
# Protein change: criteria provided, multiple submitters, no conflicts + Meier-Gorlin syndrome 1 probaly
protein_mut = df['Protein change'][(df['Germline review status'] == 'criteria provided, multiple submitters, no conflicts') & (df['Condition_new']=='not provided')]

In [90]:
# Protein change + criteria provided, single submitter + Meier-Gorlin syndrome 1 probaly
protein_mut = df['Protein change'][(df['Germline review status'] == 'criteria provided, single submitter') & (df['Condition_new']=='not provided')]

In [91]:
yes, no = mutation_analysis(protein_mut, G4_RNA_positions)

Number of mutations within interaval of interest is 23
Number of mutations outside interaval of interest is 111


In [92]:
yes

['N511S, N516S',
 'I509V, I514V',
 'I509L, I514L',
 'E497Q, E502Q',
 'E497Q, E502Q',
 'V495A, V500A',
 'V495A, V500A',
 'S473N, S478N',
 'S473N, S478N',
 'R470H, R475H',
 'R470H, R475H',
 'R470C, R475C',
 'R470C, R475C',
 'R463C, R468C',
 'R463C, R468C',
 'K460del',
 'T455A',
 'R444L',
 'R444*',
 'R441M',
 'T438N',
 'P430A',
 'D418E']

In [93]:
no

['R841Q, R846Q',
 'R841Q, R846Q',
 'R839Q, R844Q',
 'R839Q, R844Q',
 'D840N, D835N',
 'D840N, D835N',
 'E830D, E835D',
 'E830D, E835D',
 'R825C, R830C',
 'R825C, R830C',
 'V822L, V817L',
 'V822L, V817L',
 'P809S, P814S',
 'P809S, P814S',
 'M803V, M808V',
 'M803V, M808V',
 'H802R, H797R',
 'H802R, H797R',
 'I793M, I798M',
 'I793M, I798M',
 'T789M, T794M',
 'T789M, T794M',
 'R786C, R781C',
 'R786C, R781C',
 'V767L, V772L',
 'V767L, V772L',
 'K763R, K768R',
 'K763R, K768R',
 'M753T, M758T',
 'M753T, M758T',
 'E748D, E753D',
 'E748D, E753D',
 'G739fs, G744fs',
 'G739fs, G744fs',
 'E726fs, E731fs',
 'E726fs, E731fs',
 'R728H, R723H',
 'R728H, R723H',
 'R728C, R723C',
 'R728C, R723C',
 'C721fs, C726fs',
 'C721fs, C726fs',
 'D697G, D702G',
 'D697G, D702G',
 'R688W, R693W',
 'R688W, R693W',
 'Y683F, Y678F',
 'Y683F, Y678F',
 'R666W, R661W',
 'R666W, R661W',
 'M659I, M664I',
 'M659I, M664I',
 'M658T, M663T',
 'M658T, M663T',
 'I647T, I652T',
 'I647T, I652T',
 'A640T, A645T',
 'A640T, A645T',
 '