# Analysis of clinical variatns of ORC1 protein

In [1]:
from clinvar_functions import *
import pandas as pd

## ORC1 regions

In [2]:
# List of intervals for particular feature
LLPS = [(360, 382), (412, 476), (577, 636), (677, 729), (757, 816)] # Chinees paper
IDR_total = [(183, 476)] # Chinees paper
G4_RNA = [(413, 511)] # Hoshina paper
CDCD6=[(180, 240)] # CDC6 paper

# Positions
G4_RNA_positions = generate_list(G4_RNA)
LLPS_positions = generate_list(LLPS)
CDCD6_positions = generate_list(CDCD6)

## Read data

In [3]:
# Set option to display all rows and columns
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

df = pd.read_csv('clinvar_result.csv', sep = "\t")
df = df[df['Gene(s)'] == 'ORC1'] # save only ORC1 gene - delete 7 records
df.shape

(333, 25)

In [4]:
# Subset of data: only mutations in protein
df_proteins = df[df['Protein change'].apply(lambda x: isinstance(x, str))]

## Look at the different columns of df

In [None]:
df["Condition(s)"].value_counts()

In [None]:
df["Germline classification"].value_counts()

In [None]:
df['Germline review status'].value_counts()

## Get list of transcript IDs

In [None]:
IDs = df['Name'].apply(lambda x: x.split(":")[0])
IDs.value_counts()

## Add new column: Condition_new

In [None]:
df['Condition(s)'].unique()

In [5]:
df_proteins['Condition_new'] = df_proteins['Condition(s)'].map(rename_condition, na_action='ignore')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_proteins['Condition_new'] = df_proteins['Condition(s)'].map(rename_condition, na_action='ignore')


In [6]:
df_proteins

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,...,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Condition_new
6,NM_004153.4(ORC1):c.2570C>T (p.Ala857Val),ORC1,"A852V, A857V",not provided|Meier-Gorlin syndrome 1,VCV000873948,1,52838869,1,52373197,873948,...,"Aug 19, 2022","criteria provided, multiple submitters, no con...",,,,,,,,Meier-Gorlin syndrome 1 probaly
8,NM_004153.4(ORC1):c.2554G>T (p.Asp852Tyr),ORC1,"D847Y, D852Y",Inborn genetic diseases,VCV003206932,1,52838885,1,52373213,3206932,...,"Feb 6, 2024","criteria provided, single submitter",,,,,,,,Inborn genetic diseases
10,NM_004153.4(ORC1):c.2537G>A (p.Arg846Gln),ORC1,"R841Q, R846Q",not provided,VCV001486128,1,52838902,1,52373230,1486128,...,"Jul 1, 2022","criteria provided, single submitter",,,,,,,,not provided
11,NM_004153.4(ORC1):c.2536C>T (p.Arg846Trp),ORC1,"R846W, R841W",Meier-Gorlin syndrome 1,VCV000873949,1,52838903,1,52373231,873949,...,"Jan 12, 2018","criteria provided, single submitter",,,,,,,,Meier-Gorlin syndrome 1
12,NM_004153.4(ORC1):c.2531G>A (p.Arg844Gln),ORC1,"R839Q, R844Q",not provided,VCV001909061,1,52838908,1,52373236,1909061,...,"Jul 15, 2022","criteria provided, single submitter",,,,,,,,not provided
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,NM_004153.4(ORC1):c.83A>G (p.Tyr28Cys),ORC1,Y28C,Inborn genetic diseases,VCV003206936,1,52867813,1,52402141,3206936,...,"Dec 8, 2023","criteria provided, single submitter",,,,,,,,Inborn genetic diseases
324,NM_004153.4(ORC1):c.71G>A (p.Arg24Gln),ORC1,R24Q,Inborn genetic diseases,VCV002618990,1,52867825,1,52402153,2618990,...,"Aug 15, 2023","criteria provided, single submitter",,,,,,,,Inborn genetic diseases
326,NM_004153.4(ORC1):c.57G>C (p.Arg19Ser),ORC1,R19S,not specified|Meier-Gorlin syndrome 1|not prov...,VCV000259242,1,52867839,1,52402167,259242,...,"Jun 1, 2024","criteria provided, multiple submitters, no con...",,,,,,,,Meier-Gorlin syndrome 1 probaly
328,NM_004153.4(ORC1):c.35A>G (p.Lys12Arg),ORC1,K12R,Inborn genetic diseases,VCV003303495,1,52867861,1,52402189,3303495,...,"Jun 17, 2024","criteria provided, single submitter",,,,,,,,Inborn genetic diseases


In [9]:
df_proteins['Condition_new'].value_counts()

Condition_new
not provided                        92
Inborn genetic diseases             29
Meier-Gorlin syndrome 1 probaly     27
Meier-Gorlin syndrome 1             21
Inborn genetic diseases probably    11
not specified                        4
ORC1-related disorder probably       2
Name: count, dtype: int64

## Groupby

In [15]:
df_proteins.groupby("Germline review status")["Condition_new"].value_counts()

KeyError: 'Column not found: Condition_new'

## Subsets

In [14]:
criterion = 'criteria provided, multiple submitters, no conflicts'
disease = 'Meier-Gorlin syndrome 1 probably'
protein_mut = df_proteins['Protein change'][(df_proteins['Germline review status'] == criterion) & (df_proteins['Condition_new']==disease)]
print(protein_mut)

Series([], Name: Protein change, dtype: object)


In [None]:
yes, no = mutation_analysis(protein_mut, G4_RNA_positions)  

In [None]:
yes

In [None]:
no