### Imports

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style

# Apply matplotlib's 'ggplot' style sheet to every figure created below.
style.use('ggplot')

### Data loading

In [2]:
# Tab-separated variant table that load_data() reads.
DATA_FILE = 'CPCT02220079.annotated.processed.tsv'
# Directory that receives the per-chromosome histogram PNGs.
OUTPUT_FOLDER = 'zadanie4_pngs'

In [3]:
# Create the output directory if needed. `exist_ok=True` makes the cell safe to
# re-run and avoids the check-then-create race; makedirs also creates any
# missing parent directories should OUTPUT_FOLDER ever become a nested path.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [4]:
def load_data(file):
    """Read the variant table from a tab-separated file.

    Only the CHROM, POS, REF and ALT columns are loaded. CHROM is forced to
    text so that numeric and non-numeric chromosome names share one dtype.

    Parameters
    ----------
    file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per record of the input file, restricted to the four columns.
    """
    wanted_columns = ['CHROM', 'POS', 'REF', 'ALT']
    column_types = {'CHROM': str}
    return pd.read_csv(
        file,
        sep='\t',
        usecols=wanted_columns,
        dtype=column_types,
    )

In [5]:
# Load the variant table (CHROM, POS, REF, ALT columns) into memory.
df = load_data(DATA_FILE)

### Variant classification

In [6]:
def classify_variant(s):
    """Label a variant by comparing the lengths of its REF and ALT strings.

    Parameters
    ----------
    s : mapping-like (e.g. a DataFrame row) with 'REF' and 'ALT' entries
        that support ``len``.

    Returns
    -------
    str or None
        'INS' when ALT is longer than REF, 'DEL' when it is shorter,
        'SNP' when both have length 1, and None for any other
        equal-length pair.
    """
    n_ref, n_alt = len(s['REF']), len(s['ALT'])

    # Different lengths: the longer side decides insertion vs deletion.
    if n_ref != n_alt:
        return 'INS' if n_alt > n_ref else 'DEL'

    # Equal lengths: only the single-base case receives a label.
    return 'SNP' if n_ref == 1 else None

In [7]:
# Label every row with its variant class; classify_variant receives one row at a time.
df['VARIANT_CLASS'] = df.apply(classify_variant, axis='columns')

In [8]:
# Keep only insertions and deletions; copy so columns can be added without touching df.
indels = df.loc[df['VARIANT_CLASS'].isin(['INS', 'DEL'])].copy()

In [9]:
# Indel length = absolute difference between the ALT and REF string lengths.
# Vectorized .str.len() / .abs() replace the per-element .apply(len) / .apply(abs).
indels['indel_len'] = (indels['ALT'].str.len() - indels['REF'].str.len()).abs()

In [10]:
# Sanity-check the indel subset: row count, column dtypes and memory footprint.
indels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 915373 entries, 0 to 5062751
Data columns (total 6 columns):
CHROM            915373 non-null object
POS              915373 non-null int64
REF              915373 non-null object
ALT              915373 non-null object
VARIANT_CLASS    915373 non-null object
indel_len        915373 non-null int64
dtypes: int64(2), object(4)
memory usage: 48.9+ MB


In [11]:
# List the distinct chromosome names present among the indels.
indels['CHROM'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X',
       'Y', 'MT'], dtype=object)

### Indel length histogram for every chromosome

In [14]:
# Every histogram spans at least lengths 1..MIN_LONGEST_LENGTH so that
# chromosomes with only short indels still get a comparable x-range.
MIN_LONGEST_LENGTH = 50

# One PNG per chromosome: histogram of indel lengths on a log-scaled y axis.
for name, group in indels[['CHROM', 'indel_len']].groupby('CHROM'):
    longest = max(int(group['indel_len'].max()), MIN_LONGEST_LENGTH)

    # One unit-wide bin per integer length: edges 1, 2, ..., longest + 1.
    # The final edge must be longest + 1; stopping the edges earlier drops
    # the longest indels and merges the two preceding lengths into the
    # closed last bin.
    bin_edges = range(1, longest + 2)

    fig, ax = plt.subplots()
    ax.hist(group['indel_len'], bins=bin_edges)
    ax.set_yscale('log')
    ax.set_title(f'Chromosome {name}')
    ax.set_xlabel('Sequence length')
    ax.set_ylabel('Frequency')
    fig.savefig(os.path.join(OUTPUT_FOLDER, f'{name}.png'))
    # Release the figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)