# Create lists of human genes within Copy Number Variable Regions (CNVRs) from the Zarrei et al. CNVR map

## Analysis

In [1]:
import pandas as pd

### Prepare CNV data

##### Extract the columns from the CNV map that contain chromosome, start, and end to make BED files

In [1]:
%%bash
# Build 3-column BED files from the Zarrei et al. CNVR map tables: drop the
# header row and keep columns 1-3 for every stringency level / variant type.
#
# Abort on the first failure. Without pipefail a missing input only fails
# `tail`; `cut` still exits 0, so an empty .bed file is written and the cell
# appears to succeed.
set -euo pipefail

map_dir=../../datasets/ZarreiEtAlCNVRMap

# Each entry is "<source table stem>:<output stem>". The combined Gain+Loss
# tables map to the plain "<level>Regions" output name.
for pair in \
    Inclusive.Gain:inclusiveGainRegions \
    Inclusive.Loss:inclusiveLossRegions \
    Inclusive.Gain+Loss:inclusiveRegions \
    Stringent.Gain:stringentGainRegions \
    Stringent.Loss:stringentLossRegions \
    Stringent.Gain+Loss:stringentRegions
do
    src="${map_dir}/${pair%%:*}.hg19.2015-02-03.txt"
    dest="${map_dir}/${pair##*:}.bed"
    tail -n+2 "$src" | cut -f 1,2,3 > "$dest"
done

##### Number of regions for each type

In [2]:
%%bash
# Print the line count of each CNVR BED file (one line per extracted region).
# `wc` is invoked once per file so no "total" row is appended to the output.
cnvr_dir=../../datasets/ZarreiEtAlCNVRMap

for stem in inclusiveGain inclusiveLoss inclusive stringentGain stringentLoss stringent; do
    wc -l "${cnvr_dir}/${stem}Regions.bed"
done

    3132 ../../datasets/ZarreiEtAlCNVRMap/inclusiveGainRegions.bed
   23438 ../../datasets/ZarreiEtAlCNVRMap/inclusiveLossRegions.bed
   24032 ../../datasets/ZarreiEtAlCNVRMap/inclusiveRegions.bed
    1169 ../../datasets/ZarreiEtAlCNVRMap/stringentGainRegions.bed
   11530 ../../datasets/ZarreiEtAlCNVRMap/stringentLossRegions.bed
   11732 ../../datasets/ZarreiEtAlCNVRMap/stringentRegions.bed


##### Create protein-coding gene lists for CNV regions

In [7]:
%%bash
# Convert the Ensembl v75 protein-coding gene table into a 4-column,
# tab-separated BED file: "chr" + field 2, field 3, field 4, field 1.
# The header row is dropped.
#
# pipefail turns a missing/unreadable input into an error; otherwise `tail`
# fails, awk still exits 0, and an empty BED file is produced silently.
#
# NOTE(review): fields 3 and 4 are copied through unchanged. If the table uses
# 1-based start coordinates (typical for Ensembl exports), a strict BED start
# would be $3 - 1 -- confirm against the source table before changing.
set -euo pipefail

gene_dir=../../datasets/geneLists/Ensembl

tail -n+2 "${gene_dir}/EnsV75ProteinCodingGenes1-Y.txt" \
    | awk 'BEGIN { OFS = "\t" } { print "chr" $2, $3, $4, $1 }' \
    > "${gene_dir}/EnsV75ProteinCodingGenes1-Y.bed"

In [9]:
%%bash
# For each CNVR set, write the names (column 4 of the gene BED) of the genes
# that overlap at least one region. With `-u`, bedtools reports each -a entry
# once if it has any overlap in -b, so a gene spanning several regions is
# listed only once.
#
# Abort on the first failure. Without pipefail a missing or failing `bedtools`
# is masked by `cut` exiting 0, leaving silently empty gene lists.
set -euo pipefail

genes_bed=../../datasets/geneLists/Ensembl/EnsV75ProteinCodingGenes1-Y.bed
cnvr_dir=../../datasets/ZarreiEtAlCNVRMap
out_dir=../../datasets/geneLists/ZarreiEtAlCNVRMap

# The output directory is distinct from every input directory, so make sure
# it exists before redirecting into it.
mkdir -p "$out_dir"

for stem in inclusiveGain inclusiveLoss inclusive stringentGain stringentLoss stringent; do
    bedtools intersect -a "$genes_bed" -b "${cnvr_dir}/${stem}Regions.bed" -u \
        | cut -f 4 > "${out_dir}/${stem}RegionsChr1-YGenes.txt"
done

In [10]:
%%bash
# Print the number of lines (one gene per line) in each per-CNVR-set gene list.
# `wc` is invoked once per file so no "total" row is appended to the output.
gene_list_dir=../../datasets/geneLists/ZarreiEtAlCNVRMap

for stem in inclusiveGain inclusiveLoss inclusive stringentGain stringentLoss stringent; do
    wc -l "${gene_list_dir}/${stem}RegionsChr1-YGenes.txt"
done

    2216 ../../datasets/geneLists/ZarreiEtAlCNVRMap/inclusiveGainRegionsChr1-YGenes.txt
    6884 ../../datasets/geneLists/ZarreiEtAlCNVRMap/inclusiveLossRegionsChr1-YGenes.txt
    7604 ../../datasets/geneLists/ZarreiEtAlCNVRMap/inclusiveRegionsChr1-YGenes.txt
    1042 ../../datasets/geneLists/ZarreiEtAlCNVRMap/stringentGainRegionsChr1-YGenes.txt
    3909 ../../datasets/geneLists/ZarreiEtAlCNVRMap/stringentLossRegionsChr1-YGenes.txt
    4295 ../../datasets/geneLists/ZarreiEtAlCNVRMap/stringentRegionsChr1-YGenes.txt
