# Analysis of clinical variants of ORC1 protein

In [1]:
from clinvar_functions import *
import pandas as pd

## ORC1 regions

In [15]:
# Intervals, as (start, end) pairs, for each annotated ORC1 feature.
# The trailing comment on each line names the source of the annotation.
LLPS = [
    (360, 382),
    (412, 476),
    (577, 636),
    (677, 729),
    (757, 816),
]  # Chinese paper
IDR_total = [(183, 476)]  # Chinese paper
G4_RNA = [(413, 511)]  # Hoshina paper
# NOTE(review): "CDCD6" looks like a typo for CDC6; the name is kept because
# other cells may reference it.
CDCD6 = [(180, 240)]  # CDC6 paper
BP = [(354, 368), (378, 392)]  # CDC6 paper

# Expand every interval list into its individual positions.
# IDR_total is intentionally left unexpanded, as in the original cell.
G4_RNA_positions, LLPS_positions, CDCD6_positions, BP_positions = (
    generate_list(intervals) for intervals in (G4_RNA, LLPS, CDCD6, BP)
)

In [3]:
# Display the expanded G4_RNA position list; the output shows that
# generate_list includes both endpoints of the (413, 511) interval.
G4_RNA_positions

[413,
 414,
 415,
 416,
 417,
 418,
 419,
 420,
 421,
 422,
 423,
 424,
 425,
 426,
 427,
 428,
 429,
 430,
 431,
 432,
 433,
 434,
 435,
 436,
 437,
 438,
 439,
 440,
 441,
 442,
 443,
 444,
 445,
 446,
 447,
 448,
 449,
 450,
 451,
 452,
 453,
 454,
 455,
 456,
 457,
 458,
 459,
 460,
 461,
 462,
 463,
 464,
 465,
 466,
 467,
 468,
 469,
 470,
 471,
 472,
 473,
 474,
 475,
 476,
 477,
 478,
 479,
 480,
 481,
 482,
 483,
 484,
 485,
 486,
 487,
 488,
 489,
 490,
 491,
 492,
 493,
 494,
 495,
 496,
 497,
 498,
 499,
 500,
 501,
 502,
 503,
 504,
 505,
 506,
 507,
 508,
 509,
 510,
 511]

## Read data

In [4]:
# Uncomment to display all rows and columns:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# The ClinVar export is tab-separated despite its .csv extension.
clinvar_raw = pd.read_csv('clinvar_result.csv', sep='\t')

# Keep only rows whose gene field is exactly ORC1
# (the original note records that this removes 7 records).
is_orc1 = clinvar_raw['Gene(s)'] == 'ORC1'
df = clinvar_raw.loc[is_orc1]
df.shape

(333, 25)

In [5]:
# Keep only variants that carry a protein-level change, i.e. rows whose
# 'Protein change' entry is a string rather than a missing value.
has_protein_change = df['Protein change'].map(lambda value: isinstance(value, str))
df = df[has_protein_change]
df.shape

(188, 25)

## Look at the different columns of df

In [6]:
# Count rows per raw 'Condition(s)' label (labels may be '|'-joined combinations).
df["Condition(s)"].value_counts()

Condition(s)
not provided                                                                89
Inborn genetic diseases                                                     29
Meier-Gorlin syndrome 1                                                     21
not provided|Inborn genetic diseases                                         6
not provided|Meier-Gorlin syndrome 1                                         5
Inborn genetic diseases|not provided                                         5
not specified                                                                4
Meier-Gorlin syndrome 1|not provided                                         3
Meier-Gorlin syndrome 1|not provided|Inborn genetic diseases                 3
Inborn genetic diseases|Meier-Gorlin syndrome 1                              2
not provided|not specified|Meier-Gorlin syndrome 1                           2
not specified|not provided|Meier-Gorlin syndrome 1                           2
not provided|not specified             

In [7]:
# Count rows per germline pathogenicity classification.
df["Germline classification"].value_counts()

Germline classification
Uncertain significance                          142
Pathogenic                                       13
Conflicting classifications of pathogenicity     10
Benign                                            6
Likely pathogenic                                 5
Benign/Likely benign                              5
Likely benign                                     4
no classification for the single variant          2
Pathogenic/Likely pathogenic                      1
Name: count, dtype: int64

In [8]:
# Count rows per germline review status (level of submitter agreement/evidence).
df['Germline review status'].value_counts()

Germline review status
criteria provided, single submitter                     137
criteria provided, multiple submitters, no conflicts     34
criteria provided, conflicting classifications           10
no assertion criteria provided                            5
no classification for the single variant                  2
Name: count, dtype: int64

## Get list of transcript IDs

In [9]:
# Extract the prefix of each variant name: everything before the first ":".
# The vectorised .str accessor propagates a missing name as NaN, whereas a
# per-row lambda calling x.split() would raise on the first non-string value.
IDs = df['Name'].str.split(":", n=1).str[0]
# A single distinct value here means all variants share one transcript ID.
IDs.value_counts()

Name
NM_004153.4(ORC1)    188
Name: count, dtype: int64

## Add new column: Condition_new

In [10]:
# Derive the simplified condition label with rename_condition (imported from
# clinvar_functions); na_action='ignore' leaves missing conditions untouched.
# assign() builds a new frame instead of writing a column into the filtered
# frame, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
df = df.assign(
    Condition_new=df['Condition(s)'].map(rename_condition, na_action='ignore')
)

In [11]:
# Count rows per simplified condition label produced by rename_condition.
df['Condition_new'].value_counts()

Condition_new
not provided                        96
Inborn genetic diseases             29
Meier-Gorlin syndrome 1 probably    27
Meier-Gorlin syndrome 1             21
Inborn genetic diseases probably    11
ORC1-related disorder probably       2
Name: count, dtype: int64

## Groupby

In [12]:
# Cross-tabulate: within each germline review status, count rows per simplified condition.
df.groupby("Germline review status")["Condition_new"].value_counts()

Germline review status                                Condition_new                   
criteria provided, conflicting classifications        Meier-Gorlin syndrome 1 probably     6
                                                      Inborn genetic diseases probably     3
                                                      ORC1-related disorder probably       1
criteria provided, multiple submitters, no conflicts  Meier-Gorlin syndrome 1 probably    21
                                                      Inborn genetic diseases probably     8
                                                      not provided                         4
                                                      ORC1-related disorder probably       1
criteria provided, single submitter                   not provided                        90
                                                      Inborn genetic diseases             29
                                                      Meier-Gorlin syndrome 

## Subsets

In [25]:
# Pick the protein changes for one (review status, condition) combination
# and pass them to mutation_analysis together with the BP positions.
# Judging by the printed summary, the two returned values are presumably the
# mutations inside and outside the interval of interest (helper defined elsewhere).
criterion = 'criteria provided, single submitter'
disease = 'Inborn genetic diseases'
in_subset = (df['Germline review status'] == criterion) & (df['Condition_new'] == disease)
protein_mut = df.loc[in_subset, 'Protein change']
yes, no = mutation_analysis(protein_mut, BP_positions)

Total number of mutations is: 40
Number of mutations within interaval of interest is 0
Number of mutations outside interaval of interest is 40


In [24]:
# Show the first value returned by mutation_analysis (presumably the mutations
# inside the interval of interest; the helper is defined elsewhere).
# NOTE(review): this cell's execution count (24) precedes the analysis cell's (25),
# and the stored output disagrees with the "0 within interval" summary printed
# there, so the output is stale. Restart the kernel and run all cells to refresh it.
yes

['R380L']