# Analysis of clinical variants of the ORC1 protein

In [1]:
from clinvar_functions import *
import pandas as pd

## ORC1 regions

In [2]:
# (start, end) intervals of ORC1 positions for each annotated feature,
# with the source of each annotation.
LLPS = [
    (360, 382),
    (412, 476),
    (577, 636),
    (677, 729),
    (757, 816),
]  # Chinese paper
IDR_total = [(183, 476)]  # Chinese paper
G4_RNA = [(413, 511)]  # Hoshina paper
# NOTE(review): the name looks like a typo for CDC6; kept because later cells use it.
CDCD6 = [(180, 240)]  # CDC6 paper

# Position collections built from the intervals by generate_list
# (presumably one entry per position; whether the interval end is included
# depends on generate_list in clinvar_functions — TODO confirm).
G4_RNA_positions = generate_list(G4_RNA)
LLPS_positions = generate_list(LLPS)
CDCD6_positions = generate_list(CDCD6)

## Read data

In [3]:
# Optional while exploring: lift pandas' display limits to show every row and column.
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# The file is tab-delimited even though it is named .csv.
df = pd.read_csv('clinvar_result.csv', sep="\t")
# Keep only rows whose gene annotation is exactly ORC1
# (original author's note: this removes 7 records).
df = df.loc[df['Gene(s)'].eq('ORC1')]
df.shape

(333, 25)

In [4]:
# Keep only variants with a protein-level change: rows whose 'Protein change'
# value is a string (missing values are not strings and are filtered out).
# .copy() makes the subset an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or write into a temporary slice.
df = df[df['Protein change'].apply(lambda x: isinstance(x, str))].copy()
df.shape

(188, 25)

## Look at the different columns of df

In [None]:
# Count how many rows carry each distinct value of the 'Condition(s)' column.
df["Condition(s)"].value_counts()

In [None]:
# Count how many rows carry each distinct germline classification.
df["Germline classification"].value_counts()

In [None]:
# Count how many rows carry each distinct germline review status.
df['Germline review status'].value_counts()

## Get list of transcript IDs

In [None]:
# Transcript ID = the part of 'Name' before the first ":".
# The vectorized .str accessor leaves a missing name as NaN instead of raising
# AttributeError, which a per-row x.split(...) would do.
IDs = df['Name'].str.split(":").str[0]
IDs.value_counts()

## Add new column: Condition_new

In [5]:
# Derive a simplified condition label by passing each 'Condition(s)' value through
# rename_condition (presumably provided by the star import from clinvar_functions —
# its exact mapping rules are not visible here). na_action='ignore' keeps missing
# conditions as NaN instead of passing them to the function.
df['Condition_new'] = df['Condition(s)'].map(rename_condition, na_action='ignore')

In [6]:
# Count how many rows carry each simplified condition label.
df['Condition_new'].value_counts()

Condition_new
not provided                        92
Inborn genetic diseases             29
Meier-Gorlin syndrome 1 probably    27
Meier-Gorlin syndrome 1             21
Inborn genetic diseases probably    11
not specified                        4
ORC1-related disorder probably       2
Name: count, dtype: int64

## Groupby

In [7]:
# Within each germline review status, count rows per simplified condition label.
df.groupby("Germline review status")["Condition_new"].value_counts()

Germline review status                                Condition_new                   
criteria provided, conflicting classifications        Meier-Gorlin syndrome 1 probably     6
                                                      Inborn genetic diseases probably     3
                                                      ORC1-related disorder probably       1
criteria provided, multiple submitters, no conflicts  Meier-Gorlin syndrome 1 probably    21
                                                      Inborn genetic diseases probably     8
                                                      not provided                         4
                                                      ORC1-related disorder probably       1
criteria provided, single submitter                   not provided                        86
                                                      Inborn genetic diseases             29
                                                      Meier-Gorlin syndrome 

## Subsets

In [14]:
# Pick the protein changes for one review status / condition pair, then split
# them by whether they fall within the CDCD6 positions.
criterion = 'criteria provided, multiple submitters, no conflicts'
disease = 'Meier-Gorlin syndrome 1 probably'
protein_mut = df.loc[
    (df['Germline review status'] == criterion) & (df['Condition_new'] == disease),
    'Protein change',
]
print(f"Total number of mutations is: {len(protein_mut)}")
yes, no = mutation_analysis(protein_mut, CDCD6_positions)

Total number of mutations is: 21
Number of mutations within interaval of interest is 4
Number of mutations outside interaval of interest is 26


In [None]:
def mutation_analysis(sb, li):
    '''
    Split protein mutations into those inside and outside a set of positions.

    Parameters
    ----------
    sb : pd.Series
        Mutation strings to test. One entry may hold several mutations
        separated by ", " (e.g. "R105Q, R100Q"). Missing entries are skipped.
        The Series itself is left unmodified.
    li : iterable of int
        Positions that make up the interval(s) of interest.

    Returns
    -------
    tuple of (list, list)
        First list: mutations with at least one position in ``li``.
        Second list: mutations whose positions all lie outside ``li``.
        Each mutation is classified once; a mutation string containing no
        digits cannot be placed and appears in neither list.
    '''
    import re  # local import: do not depend on a star import to provide `re`

    positions_of_interest = set(li)  # set gives constant-time membership tests

    # dropna() returns a new Series, so the caller's data is not altered.
    # Each entry is split into individual mutations and the result flattened.
    mutations = [
        mutation
        for entry in sb.dropna()
        for mutation in entry.split(", ")
    ]

    interval_mutations = []
    no_interval_mutations = []
    for mutation in mutations:
        positions = [int(number) for number in re.findall(r'\d+', mutation)]
        if not positions:
            # No numeric position to test against the interval.
            continue
        # Classify per mutation, not per number, so a mutation spanning several
        # positions is never counted twice or placed in both lists.
        if any(position in positions_of_interest for position in positions):
            interval_mutations.append(mutation)
        else:
            no_interval_mutations.append(mutation)

    print(f"Total number of mutations is: {len(mutations)}")
    print(f"Number of mutations within interaval of interest is {len(interval_mutations)}")
    print(f"Number of mutations outside interaval of interest is {len(no_interval_mutations)}")
    return interval_mutations, no_interval_mutations