# Analysis of clinical variants of ORC1 protein

In [8]:
from clinvar_functions import *
import pandas as pd

# Lift the row limit so displayed frames/series are never truncated
pd.options.display.max_rows = None

## ORC1 regions

In [2]:
# Residue intervals, as (start, end) pairs, for each annotated ORC1 feature.
# NOTE(review): presumably protein coordinates with inclusive ends — confirm
# against generate_list in clinvar_functions.
LLPS = [
    (360, 382),
    (412, 476),
    (577, 636),
    (677, 729),
    (757, 816),
]  # source: "Chinese paper" (TODO: add full citation)
IDR_total = [(183, 476)]  # source: "Chinese paper" (TODO: add full citation)
G4_RNA = [(413, 511)]  # source: Hoshina paper
CDCD6 = [(180, 240)]  # source: CDC6 paper
BP = [
    (354, 368),
    (378, 392),
]  # source: CDC6 paper

# Turn each interval list into its positions; generate_list is called once per
# feature, in the order G4_RNA, LLPS, CDCD6, BP.
G4_RNA_positions, LLPS_positions, CDCD6_positions, BP_positions = map(
    generate_list, (G4_RNA, LLPS, CDCD6, BP)
)

## Read data

In [16]:
# To also lift the column display limit, run:
#   pd.set_option('display.max_columns', None)

# Read the tab-separated ClinVar export and keep only the rows whose
# 'Gene(s)' value is exactly 'ORC1' (original note: this deletes 7 records).
df = (
    pd.read_csv('clinvar_result.csv', sep="\t")
    .loc[lambda table: table['Gene(s)'] == 'ORC1']
)
df.shape

(333, 25)

In [17]:
# Keep only the records that carry a protein change, i.e. rows whose
# 'Protein change' value is a string (missing values are not strings).
has_protein_change = df['Protein change'].map(lambda change: isinstance(change, str))
df = df[has_protein_change]
df.shape

(188, 25)

In [18]:
# Show the record(s) for the S440P protein change
df.loc[df['Protein change'].eq('S440P')]

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,...,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24
178,NM_004153.4(ORC1):c.1318T>C (p.Ser440Pro),ORC1,S440P,ORC1-related disorder|not specified|Meier-Gorl...,VCV000211798,1,52854179,1,52388507,211798,...,Conflicting classifications of pathogenicity,"Nov 27, 2023","criteria provided, conflicting classifications",,,,,,,


In [19]:
# Show the record(s) for the R396W protein change
df.loc[df['Protein change'].eq('R396W')]

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,...,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24
192,NM_004153.4(ORC1):c.1186C>T (p.Arg396Trp),ORC1,R396W,not specified|not provided|Meier-Gorlin syndro...,VCV000211797,1,52854890,1,52389218,211797,...,Benign/Likely benign,"Jan 30, 2024","criteria provided, multiple submitters, no con...",,,,,,,


## Look at the different columns of df

In [None]:
# Frequency of each distinct 'Condition(s)' value in df
condition_column = df["Condition(s)"]
condition_column.value_counts()

In [None]:
# Frequency of each distinct 'Germline classification' value in df
classification_column = df["Germline classification"]
classification_column.value_counts()

In [None]:
# Frequency of each distinct 'Germline review status' value in df
review_status_column = df['Germline review status']
review_status_column.value_counts()

## Get list of transcript IDs

In [None]:
# Transcript ID = the part of 'Name' before the first ':'.
# The vectorised .str accessor replaces the per-row lambda; a missing Name
# now yields NaN instead of raising AttributeError, and n=1 stops splitting
# after the first ':' since only the leading piece is used.
IDs = df['Name'].str.split(":", n=1).str[0]
IDs.value_counts()

## Add new column: Condition_new

In [None]:
# Add 'Condition_new' by passing every non-missing 'Condition(s)' value through
# rename_condition (missing values are left untouched via na_action='ignore').
# df comes from boolean filtering, so a column is added with assign(), which
# returns a new frame, rather than by item assignment, which can raise
# SettingWithCopyWarning on pandas < 3. Re-running the cell stays idempotent.
df = df.assign(
    Condition_new=df['Condition(s)'].map(rename_condition, na_action='ignore')
)

In [None]:
# Frequency of each distinct 'Condition_new' value in df
renamed_conditions = df['Condition_new']
renamed_conditions.value_counts()

## Groupby

In [None]:
# Count 'Condition_new' values separately within each 'Germline review status'
by_review_status = df.groupby("Germline review status")
by_review_status["Condition_new"].value_counts()

## Subsets

In [None]:
# Select the protein changes of the records that match one review status and
# one (renamed) condition, then analyse them against the BP positions.
criterion = 'criteria provided, single submitter'
disease = 'Inborn genetic diseases'

matches_criterion = df['Germline review status'] == criterion
matches_disease = df['Condition_new'] == disease
protein_mut = df.loc[matches_criterion & matches_disease, 'Protein change']

# NOTE(review): presumably `yes` holds the mutations inside BP_positions and
# `no` the rest — confirm against mutation_analysis in clinvar_functions.
yes, no = mutation_analysis(protein_mut, BP_positions)

In [None]:
# Display the first value returned by mutation_analysis(protein_mut, BP_positions).
# NOTE(review): presumably the mutations located in BP_positions — confirm.
yes

## Amino acids analysis

Analyse the percentage of arginine (R) and glycine (G) in different domains of the ORC1 protein

In [None]:
# Amino-acid sequences of the regions under study and of full-length ORC1
IDR = "AAKCQKPVRAKSKSAESPSWTPAEHVAKRIESRHSASKSRQTPTHPLTPRARKRLELGNLGNPQMSQQTSCASLDSPGRIKRKVAFSEITSPSKRSQPDKLQTLSPALKAPEKTRETGLSYTEDDKKASPEHRIILRTRIAASKTIDIREERTLTPISGGQRSSVVPSVILKPENIKKRDAKEAKAQNEATSTPHRIRRKSSVLTMNRIRQQLRFLGNSKSDQEEKEILPAAEISDSSSDEEEASTPPLPRRAPRTVSRNLRSSLKSSLHTLTKVPKKSLKPRTPRCAAPQIRS"
G4 = "AAEISDSSSDEEEASTPPLPRRAPRTVSRNLRSSLKSSLHTLTKVPKKSLKPRTPRCAAPQIRSRSLAAQEPASVLEEARLRLHVSAVPESLPCREQEF"
CDC6 = "QESAAKCQKPVRAKSKSAESPSWTPAEHVAKRIESRHSASKSRQTPTHPLTPRARKRLELG"
ORC = "MAHYPTRLKTRKTYSWVGRPLLDRKLHYQTYREMCVKTEGCSTEIHIQIGQFVLIEGDDDENPYVAKLLELFEDDSDPPPKKRARVQWFVRFCEVPACKRHLLGRKPGAQEIFWYDYPACDSNINAETIIGLVRVIPLAPKDVVPTNLKNEKTLFVKLSWNEKKFRPLSSELFAELNKPQESAAKCQKPVRAKSKSAESPSWTPAEHVAKRIESRHSASKSRQTPTHPLTPRARKRLELGNLGNPQMSQQTSCASLDSPGRIKRKVAFSEITSPSKRSQPDKLQTLSPALKAPEKTRETGLSYTEDDKKASPEHRIILRTRIAASKTIDIREERTLTPISGGQRSSVVPSVILKPENIKKRDAKEAKAQNEATSTPHRIRRKSSVLTMNRIRQQLRFLGNSKSDQEEKEILPAAEISDSSSDEEEASTPPLPRRAPRTVSRNLRSSLKSSLHTLTKVPKKSLKPRTPRCAAPQIRSRSLAAQEPASVLEEARLRLHVSAVPESLPCREQEFQDIYNFVESKLLDHTGGCMYISGVPGTGKTATVHEVIRCLQQAAQANDVPPFQYIEVNGMKLTEPHQVYVQILQKLTGQKATANHAAELLAKQFCTRGSPQETTVLLVDELDLLWTHKQDIMYNLFDWPTHKEARLVVLAIANTMDLPERIMMNRVSSRLGLTRMCFQPYTYSQLQQILRSRLKHLKAFEDDAIQLVARKVAALSGDARRCLDICRRATEICEFSQQKPDSPGLVTIAHSMEAVDEMFSSSYITAIKNSSVLEQSFLRAILAEFRRSGLEEATFQQIYSQHVALCRMEGLPYPTMSETMAVCSHLGSCRLLLVEPSRNDLLLRVRLNVSQDDVLYALKDE"

# Sequences in reporting order: whole protein, IDR, G4-binding, CDC6-binding
region_sequences = (ORC, IDR, G4, CDC6)

# percantage_aa is called once per sequence for arginine, then once per
# sequence for glycine.
# NOTE(review): presumably it returns the share of residue A in seq — confirm
# against clinvar_functions.
ORC1_R, IDR_R, G4_R, CDC6_R = (
    percantage_aa(seq=region, A="R") for region in region_sequences
)
ORC1_G, IDR_G, G4_G, CDC6_G = (
    percantage_aa(seq=region, A="G") for region in region_sequences
)

# Report both residues in the same layout
print("Arginines in different ORC1 domains:")
print(f"ORC1: {ORC1_R} ; IDR: {IDR_R} ; G4 binding: {G4_R} ; CDC6 binding: {CDC6_R}")

print("Glycines in different ORC1 domains:")
print(f"ORC1: {ORC1_G} ; IDR: {IDR_G} ; G4 binding: {G4_G} ; CDC6 binding: {CDC6_G}")