Integrative Analysis of Synthetic Copy Number Variation and Gene Expression Data

In [None]:
# import library
import pandas as pd # for tabular data processing

In [None]:
from google.colab import files
# Upload files from the local machine to the Colab environment.
# The returned object is a dictionary: {filename: file_content}
uploaded = files.upload()

Saving cnv_data.txt to cnv_data (1).txt
Saving gene_expression.txt to gene_expression (1).txt


In [None]:
# Read the first uploaded file as CNV (Copy Number Variation) data.
# Assumes the first file corresponds to CNV information.
cnv_data = pd.read_csv(list(uploaded.keys())[0])

# Inspect the first few rows to verify successful loading and structure.
print(cnv_data.head())

    sample  chromosome     start       end  segment_mean
0  S1_rep1           1   1000000   5000000          0.25
1  S1_rep1           8  12000000  18000000          0.60
2  S1_rep1          17  40000000  45000000         -0.45
3  S1_rep2           1   1000000   5000000          0.30
4  S1_rep2           8  12000000  18000000          0.55


In [None]:
# Read the second uploaded file as gene expression data.
# Assumes the second file corresponds to expression measurements.
expression_data = pd.read_csv(list(uploaded.keys())[1])

# Preview expression data to confirm correct import.
print(expression_data.head())

    sample   gene  expression
0  S1_rep1   TP53         8.2
1  S1_rep1   EGFR         5.4
2  S1_rep1    MYC         7.1
3  S1_rep1  BRCA1         6.3
4  S1_rep2   TP53         8.5


In [None]:
# Filtering at chromosome level
chr1_cnv = cnv_data[cnv_data['chromosome'] == 1]

# Inspect chromosome 1 CNV entries.
print(chr1_cnv.head())

    sample  chromosome    start      end  segment_mean
0  S1_rep1           1  1000000  5000000          0.25
3  S1_rep2           1  1000000  5000000          0.30
6  S2_rep1           1  1000000  5000000         -0.20
9  S2_rep2           1  1000000  5000000         -0.15


In [None]:
# Filtering at region level
region_1p = cnv_data[cnv_data['start'] < 5_000_000]

# Display all CNV segments in the 1p region.
print(region_1p)

    sample  chromosome    start      end  segment_mean
0  S1_rep1           1  1000000  5000000          0.25
3  S1_rep2           1  1000000  5000000          0.30
6  S2_rep1           1  1000000  5000000         -0.20
9  S2_rep2           1  1000000  5000000         -0.15


In [None]:
# Filtering by gene-level region
gene_level_cnv = cnv_data[
    (cnv_data['start'] <= 2_000_000) &
    (cnv_data['end'] >= 2_000_000)
]

# Inspect CNV segments affecting the target locus.
print(gene_level_cnv)

    sample  chromosome    start      end  segment_mean
0  S1_rep1           1  1000000  5000000          0.25
3  S1_rep2           1  1000000  5000000          0.30
6  S2_rep1           1  1000000  5000000         -0.20
9  S2_rep2           1  1000000  5000000         -0.15


In [None]:
# Integrate CNV and gene expression data by sample ID.
integrated_data = pd.merge(
    cnv_data,
    expression_data,
    on='sample',
    how='inner'
)

# Inspect the merged dataset to verify alignment.
print(integrated_data)

     sample  chromosome     start       end  segment_mean   gene  expression
0   S1_rep1           1   1000000   5000000          0.25   TP53         8.2
1   S1_rep1           1   1000000   5000000          0.25   EGFR         5.4
2   S1_rep1           1   1000000   5000000          0.25    MYC         7.1
3   S1_rep1           1   1000000   5000000          0.25  BRCA1         6.3
4   S1_rep1           8  12000000  18000000          0.60   TP53         8.2
..      ...         ...       ...       ...           ...    ...         ...
67  S3_rep2          17  40000000  45000000         -0.30  BRCA1         7.3
68  S3_rep2           8  12000000  18000000          0.25   TP53         5.6
69  S3_rep2           8  12000000  18000000          0.25   EGFR         6.0
70  S3_rep2           8  12000000  18000000          0.25    MYC         6.9
71  S3_rep2           8  12000000  18000000          0.25  BRCA1         7.3

[72 rows x 7 columns]


In [None]:
# Filter for high-amplification CNV events.
high_amp = integrated_data[integrated_data['segment_mean'] > 0.5]

# Review samples and genes with strong CNV amplification.
print(high_amp)

     sample  chromosome     start       end  segment_mean   gene  expression
4   S1_rep1           8  12000000  18000000          0.60   TP53         8.2
5   S1_rep1           8  12000000  18000000          0.60   EGFR         5.4
6   S1_rep1           8  12000000  18000000          0.60    MYC         7.1
7   S1_rep1           8  12000000  18000000          0.60  BRCA1         6.3
16  S1_rep2           8  12000000  18000000          0.55   TP53         8.5
17  S1_rep2           8  12000000  18000000          0.55   EGFR         5.6
18  S1_rep2           8  12000000  18000000          0.55    MYC         7.0
19  S1_rep2           8  12000000  18000000          0.55  BRCA1         6.1
28  S2_rep1           7  55000000  60000000          0.80   TP53         6.9
29  S2_rep1           7  55000000  60000000          0.80   EGFR         7.8
30  S2_rep1           7  55000000  60000000          0.80    MYC         8.2
31  S2_rep1           7  55000000  60000000          0.80  BRCA1         5.9

In [None]:
# Compute the correlation between CNV amplification values and gene expression.
# Grouped by gene to assess gene-specific CNV–expression relationships.
correlation = (
    high_amp
    .groupby('gene')[['segment_mean', 'expression']]
    .corr()
    .unstack()['segment_mean']['expression']
)

# Output the CNV–expression correlation per gene.
print(correlation)

gene
BRCA1   -0.639064
EGFR     0.814889
MYC      0.825398
TP53    -0.836575
Name: expression, dtype: float64
