# Analysis of clinical variants of the ORC1 protein

In [1]:
from clinvar_functions import *
import pandas as pd

## ORC1 regions

In [2]:
# Annotated ORC1 regions, each given as a list of (start, end) intervals
LLPS = [  # Chinese paper
    (360, 382),
    (412, 476),
    (577, 636),
    (677, 729),
    (757, 816),
]
IDR_total = [(183, 476)]  # Chinese paper
G4_RNA = [(413, 511)]  # Hoshina paper
CDCD6 = [(180, 240)]  # CDC6 paper

# Expand the intervals into explicit position lists with generate_list
# (for G4_RNA the displayed result runs from 413 through 511, i.e. both ends included)
G4_RNA_positions = generate_list(G4_RNA)
LLPS_positions = generate_list(LLPS)
CDCD6_positions = generate_list(CDCD6)

In [4]:
# Display the expanded G4_RNA position list to check the interval expansion
G4_RNA_positions

[413,
 414,
 415,
 416,
 417,
 418,
 419,
 420,
 421,
 422,
 423,
 424,
 425,
 426,
 427,
 428,
 429,
 430,
 431,
 432,
 433,
 434,
 435,
 436,
 437,
 438,
 439,
 440,
 441,
 442,
 443,
 444,
 445,
 446,
 447,
 448,
 449,
 450,
 451,
 452,
 453,
 454,
 455,
 456,
 457,
 458,
 459,
 460,
 461,
 462,
 463,
 464,
 465,
 466,
 467,
 468,
 469,
 470,
 471,
 472,
 473,
 474,
 475,
 476,
 477,
 478,
 479,
 480,
 481,
 482,
 483,
 484,
 485,
 486,
 487,
 488,
 489,
 490,
 491,
 492,
 493,
 494,
 495,
 496,
 497,
 498,
 499,
 500,
 501,
 502,
 503,
 504,
 505,
 506,
 507,
 508,
 509,
 510,
 511]

## Read data

In [5]:
# Optional: uncomment to make pandas display every row and column
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# The ClinVar export is tab-separated even though the file is named .csv.
# Only rows whose 'Gene(s)' value is exactly 'ORC1' are kept
# (original note: this removes 7 records).
df = (
    pd.read_csv('clinvar_result.csv', sep='\t')
    .loc[lambda table: table['Gene(s)'].eq('ORC1')]
)
df.shape

(333, 25)

In [6]:
# Restrict to records with a protein-level change: a row is kept only when its
# 'Protein change' entry is a string (non-string entries, such as NaN for
# empty cells, are dropped)
df = df.loc[[isinstance(change, str) for change in df['Protein change']]]
df.shape

(188, 25)

## Look at the different columns of df

In [None]:
# How often each distinct value occurs in the 'Condition(s)' column
df["Condition(s)"].value_counts()

In [None]:
# How often each distinct value occurs in the 'Germline classification' column
df["Germline classification"].value_counts()

In [None]:
# How often each distinct value occurs in the 'Germline review status' column
df['Germline review status'].value_counts()

## Get list of transcript IDs

In [None]:
# The transcript ID is the part of 'Name' that precedes the first ':'
IDs = df['Name'].map(lambda name: name.partition(":")[0])
IDs.value_counts()

## Add new column: Condition_new

In [7]:
# Derive 'Condition_new' by passing every 'Condition(s)' value through
# rename_condition (presumably provided by clinvar_functions — TODO confirm);
# na_action='ignore' leaves missing values as they are.
# `df` is the result of boolean filtering at this point, and writing a column
# into such a frame in place can trigger pandas' SettingWithCopyWarning.
# assign() builds a new frame with the extra column instead.
df = df.assign(
    Condition_new=df['Condition(s)'].map(rename_condition, na_action='ignore')
)

In [None]:
# How often each distinct value occurs in the derived 'Condition_new' column
df['Condition_new'].value_counts()

## Groupby

In [8]:
# Count each 'Condition_new' value separately within every 'Germline review status' group
df.groupby("Germline review status")["Condition_new"].value_counts()

Germline review status                                Condition_new                   
criteria provided, conflicting classifications        Meier-Gorlin syndrome 1 probably     6
                                                      Inborn genetic diseases probably     3
                                                      ORC1-related disorder probably       1
criteria provided, multiple submitters, no conflicts  Meier-Gorlin syndrome 1 probably    21
                                                      Inborn genetic diseases probably     8
                                                      not provided                         4
                                                      ORC1-related disorder probably       1
criteria provided, single submitter                   not provided                        90
                                                      Inborn genetic diseases             29
                                                      Meier-Gorlin syndrome 

## Subsets

In [22]:
# Pick the protein changes for one review-status / condition combination
criterion = 'criteria provided, single submitter'
disease = 'Inborn genetic diseases'
protein_mut = df.loc[
    df['Germline review status'].eq(criterion) & df['Condition_new'].eq(disease),
    'Protein change',
]
# mutation_analysis prints a summary of the counts; `yes` receives the mutations
# inside the G4_RNA positions (`no` presumably the remaining ones — confirm
# against mutation_analysis)
yes, no = mutation_analysis(protein_mut, G4_RNA_positions)

Total number of mutations is: 40
Number of mutations within interaval of interest is 4
Number of mutations outside interaval of interest is 36


In [23]:
# Mutations that mutation_analysis reported as lying within the interval of interest
yes

['E502D', 'E497D', 'L451V', 'A426G']