In [1]:
import pandas as pd
from Bio import SeqIO
from scipy import stats

# Load the unaffected genes list
unique_genes = pd.read_csv('Unaffected_Genes.csv')

# Load genome file
genome_file = 'Homo_sapiens.GRCh38.dna.primary_assembly.fa'
genome = SeqIO.to_dict(SeqIO.parse(genome_file, 'fasta'))


# Load Exon BED file
exon_bed_file = 'exons.bed'
exons_bed = pd.read_csv(exon_bed_file, sep='\t', header=None, 
                        names=['seqname', 'start', 'end', 'strand', 'transcript', "gene_id", "length"])
exons_bed['seqname'] = exons_bed['seqname'].astype(str)

#Remove duplicate exons
exons_bed = exons_bed.drop_duplicates(subset=['seqname', 'start', 'end'])


  gtf = pd.read_csv(gtf_file, sep='\t', comment='#', header=None,
  exons_bed = pd.read_csv(exon_bed_file, sep='\t', header=None,


In [2]:
# Function to get exon sequences for a gene
def get_exon_sequences(gene_id, exons_bed, genome):
    # Filter exons for the specific gene ID
    gene_exons = exons_bed[exons_bed['gene_id'] == gene_id]
    exon_sequences = []
    for _, row in gene_exons.iterrows():
        seqname, start, end = row['seqname'], row['start'], row['end']
        
        if seqname not in genome:
            print(f"Chromosome {seqname} not found in genome for gene_id {gene_id}")
            continue
        
        # Extract and concatenate exon sequences
        exon_seq = genome[seqname].seq[start:end]
        exon_sequences.append(str(exon_seq))
    
    # Concatenate all exon sequences for the gene
    full_exon_sequence = ''.join(exon_sequences)
    
    return full_exon_sequence



# function for calculating GC content 
def calculate_gc_content(sequence):
    gc_count = sequence.count('G') + sequence.count('C')
    return (gc_count / len(sequence)) * 100 if len(sequence) > 0 else 0

In [3]:
# testing the function
test = "ENSG00000141959"
print (get_exon_sequences(test, exons_bed, genome))

GCGACGCGGCGCAGGCGGCGGGAGTGCGAGCTGGGCCCGTGTTTCGGCCGCCGCCATGGCCGCGGTGGACCTGGAGAAGCTGCGGGCGTCGGGCGCGGGCAAGGCCATCGGCGTCCTGACCAGCGGCGGCGACGCGCAAGGTCCCCTGACAAGCCCACCAGGCCCCCTGCTGAGATGGCTGTGACCCTGGGCTGACCCGCCCAGTGGCACATTGACTCCGCCTGGAGCTGGGGAGACCAGAGAGGCCCTGTGGTTGGACGGTGGCCTGGGTGCGCTGCTCCTGCCCTCTCCTTGCCCTGCCTCAGCTGCTGCCTGCCAGAGGCGTGGCACCTCACCTCACACCTGCTCCCTGCTGCTGAGCCCCACGCCAAGCTGGAGAGCGGATGAGAAGCATGTGTAACCAGGGTAGAGGTCGAGAGTCCTCTCGTGGGGGTCTCCATGTTCAAGGGAGCTGCCGAGGCTTGAGCAGGAGCCCCCAGCAGGAAACTGGCTTTGCCAAGGCCCCCGCTGGGACAGACTGTTTCTTTCACTGCAGTCCTGGGAGCCGAGGGCAAGGGGACAGGAAAGAGGAAGTGACCTCAGAGCCTGGTGGCACCAGCATCATGTCCAGGCTGGGGGGCATGAACGCTGCTGTCCGGGCTGTGACGCGCATGGGCATTTATGTGGGTGCCAAAGTCTTCCTCATCTACGAGGGCTATGAGGGCCTCGTGGAGGGAGGTGAGAACATCAAGCAGGCCAACTGGCTGAGCGTCTCCAACATCATCCAGCTGGGCGGCACTATCATTGGCAGCGCTCGCTGCAAGGCCTTTACCACCAGGGAGGGGCGCCGGGCAGCGGCCTACAACCTGGTCCAGCACGGCATCACCAACCTGTGCGTCATCGGCGGGGATGGCAGCCTCACAGGTGCCAACATCTTCCGCAGCGAGTGGGGCAGCCTGCTGGAGGAGCTGGTGGCGGAAGGTAAGATCTCAGAGACTACAGCCCGGACCTACTCGCACCT

In [4]:
# testing the function
test = "ENSG00000141959"
seq_test = get_exon_sequences(test, exons_bed, genome)
print (calculate_gc_content(seq_test))

64.60165870821814


In [None]:
# The results were compared to the datasets using human BLAT search and Ensembl genome browser.
# The function proved to be working perfectly and giving the correct results.

In [3]:
# Calculate GC content for exons of the unaffected genes
gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_ExonsUnaffected.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

               gene_id  gc_content
0      ENSG00000284662   45.728643
1      ENSG00000186827   69.876003
2      ENSG00000260179   55.712452
3      ENSG00000234396   60.122699
4      ENSG00000225972   44.354839
...                ...         ...
53252  ENSG00000276289   51.491003
53253  ENSG00000288711   53.292683
53254  ENSG00000159200   48.058838
53255  ENSG00000142197   51.295283
53256  ENSG00000276076   62.980992

[53257 rows x 2 columns]
count    53230.000000
mean        47.706387
std          8.244582
min         16.326531
25%         41.584158
50%         46.623182
75%         53.216491
max         92.857143
Name: gc_content, dtype: float64


In [None]:


# Load the significant genes list
unique_genes = pd.read_csv('unique_gene_ids.csv')

# Load genome file
genome_file = 'Homo_sapiens.GRCh38.dna.primary_assembly.fa'
genome = SeqIO.to_dict(SeqIO.parse(genome_file, 'fasta'))



# Load Exon BED file
exon_bed_file = 'exons.bed'
exons_bed = pd.read_csv(exon_bed_file, sep='\t', header=None, 
                        names=['seqname', 'start', 'end', 'strand', 'transcript', "gene_id", "length"])
exons_bed['seqname'] = exons_bed['seqname'].astype(str)

#Remove duplicate exons
exons_bed = exons_bed.drop_duplicates(subset=['seqname', 'start', 'end'])


In [40]:
# Calculate GC content for exons of significant genes
gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_ExonsSignificant.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000141959   64.601659
1     ENSG00000100211   51.824818
2     ENSG00000133460   54.351412
3     ENSG00000093009   51.514046
4     ENSG00000183506   58.956462
...               ...         ...
7307  ENSG00000133110   35.049205
7308  ENSG00000276087   42.917957
7309  ENSG00000196312   48.431492
7310  ENSG00000185989   55.593153
7311  ENSG00000186184   45.790977

[7312 rows x 2 columns]
count    7311.000000
mean       49.815231
std         8.401445
min        33.506045
25%        42.650754
50%        48.859613
75%        56.835111
max        73.391210
Name: gc_content, dtype: float64


In [66]:
# T-test
# Load the data
unaffected_data = pd.read_csv('GC_ExonsUnaffected.csv')
significant_data = pd.read_csv('GC_ExonsSignificant.csv')

# Extract the GC content values
gc_unaffected = unaffected_data['gc_content']
gc_significant = significant_data['gc_content']

# Remove the rows that have no values 
gc_unaffected = unaffected_data['gc_content'].dropna()
gc_significant = significant_data['gc_content'].dropna()

# Perform Levene's test for equal variances
levene_stat, levene_p = stats.levene(gc_unaffected, gc_significant)
print(f"Levene's test - Statistic: {levene_stat}, P-value: {levene_p}")


# Based on Levene's test result, perform the appropriate t-test
if levene_p > 0.05:
    # Equal variances
    t_statistic_equal, p_value_equal = stats.ttest_ind(gc_unaffected, gc_significant, equal_var=True)
    print(f"Equal variances - T-statistic: {t_statistic_equal}, P-value: {p_value_equal}")
else:
    # Unequal variances
    t_statistic_unequal, p_value_unequal = stats.ttest_ind(gc_unaffected, gc_significant, equal_var=False)
    print(f"Unequal variances - T-statistic: {t_statistic_unequal}, P-value: {p_value_unequal}")

Levene's test - Statistic: 87.9708622800676, P-value: 6.863036871780596e-21
Unequal variances - T-statistic: -20.16993992566417, P-value: 1.355937427978016e-88


In [63]:
# Identify rows with missing gc_content in unaffected data 
missing_gc_unaffected = unaffected_data[unaffected_data['gc_content'].isnull()]
print(missing_gc_unaffected['gene_id'])



2758     ENSG00000263390
7012     ENSG00000283637
7044     ENSG00000263468
7969     ENSG00000264684
8195     ENSG00000211596
10771    ENSG00000250939
13256    ENSG00000255072
18602    ENSG00000278104
18603    ENSG00000275856
20366    ENSG00000283237
20375    ENSG00000283461
22767    ENSG00000283334
25511    ENSG00000207704
25646    ENSG00000254690
31385    ENSG00000279239
32385    ENSG00000283558
32725    ENSG00000273767
35359    ENSG00000207546
43549    ENSG00000286030
43960    ENSG00000285057
44359    ENSG00000273948
44431    ENSG00000283935
44530    ENSG00000207688
45106    ENSG00000272920
50379    ENSG00000288671
51891    ENSG00000274170
51983    ENSG00000283366
Name: gene_id, dtype: object


In [64]:
# Identify rows with missing gc_content in significant data 
missing_gc_unaffected = significant_data[significant_data['gc_content'].isnull()]
print(missing_gc_unaffected['gene_id'])

3771    ENSG00000271254
Name: gene_id, dtype: object


In [None]:
# Now we calculate the GC content based on the different alternative splicing events. 

In [7]:
# Calculate GC content for exons of the significant genes with skipped exons alternative splicing

unique_genes = pd.read_csv('SkippedExon_Genes.txt', header=None, names=['GeneID'])

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_SkippedExons.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000271254         NaN
1     ENSG00000186866   58.602910
2     ENSG00000185917   48.528302
3     ENSG00000160216   55.485285
4     ENSG00000182670   41.354420
...               ...         ...
5926  ENSG00000276087   42.917957
5927  ENSG00000196312   48.431492
5928  ENSG00000185989   55.593153
5929  ENSG00000186184   45.790977
5930  ENSG00000134900   39.569601

[5931 rows x 2 columns]
count    5930.000000
mean       49.364622
std         8.316400
min        33.701188
25%        42.393955
50%        48.123423
75%        56.135080
max        73.391210
Name: gc_content, dtype: float64


In [8]:
# Calculate GC content for exons of the significant genes with mutually exclsive alternative splicing


unique_genes = pd.read_csv('MutuallyExclusive_Genes.txt', header=None, names=['GeneID'])

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_MutuallyExclusive.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000185437   49.154135
1     ENSG00000100209   52.623688
2     ENSG00000243156   55.185351
3     ENSG00000100031   62.178970
4     ENSG00000239382   59.704861
...               ...         ...
1356  ENSG00000145781   39.507990
1357  ENSG00000002016   49.255114
1358  ENSG00000186815   58.011842
1359  ENSG00000123415   54.229623
1360  ENSG00000136146   39.245728

[1361 rows x 2 columns]
count    1361.000000
mean       48.811028
std         8.320953
min        33.506045
25%        41.906460
50%        47.408120
75%        55.345251
max        72.020569
Name: gc_content, dtype: float64


In [9]:
# Calculate GC content for exons of the significant genes with intron retention alternative splicing


unique_genes = pd.read_csv('IntronRetention_Genes.txt', header=None, names=['GeneID'])

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_IntronRetention.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000235123   43.698469
1     ENSG00000160294   52.441340
2     ENSG00000206140   62.682464
3     ENSG00000025770   62.269562
4     ENSG00000100276   67.401842
...               ...         ...
1323  ENSG00000139044   59.332850
1324  ENSG00000135407   47.710633
1325  ENSG00000211451   52.362056
1326  ENSG00000123384   59.026798
1327  ENSG00000111077   62.090644

[1328 rows x 2 columns]
count    1328.000000
mean       53.912634
std         8.416187
min        34.428050
25%        47.606459
50%        55.423161
75%        60.754296
max        71.039326
Name: gc_content, dtype: float64


In [10]:
# Calculate GC content for exons of the significant genes with 3 and 5 prime alternative splicing


unique_genes = pd.read_csv('3_5_PrimeGenes.txt', header=None, names=['GeneID'])

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_3_5_Prime.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000141959   64.601659
1     ENSG00000100211   51.824818
2     ENSG00000133460   54.351412
3     ENSG00000093009   51.514046
4     ENSG00000183506   58.956462
...               ...         ...
2214  ENSG00000120662   39.449686
2215  ENSG00000198715   54.986577
2216  ENSG00000139116   38.635905
2217  ENSG00000185046   45.067117
2218  ENSG00000060656   61.751349

[2219 rows x 2 columns]
count    2219.000000
mean       50.999993
std         8.451088
min        34.136334
25%        43.563681
50%        50.790590
75%        58.388437
max        70.720346
Name: gc_content, dtype: float64


In [None]:
# Now we calculate the GC content based on the datasets of cancer cell lines. 

In [6]:
# Calculate GC content for exons of the AU565 dataset


unique_genes = pd.read_csv('AU565_Genes.csv')

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_AU565.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000141959   64.601659
1     ENSG00000100211   51.824818
2     ENSG00000133460   54.351412
3     ENSG00000093009   51.514046
4     ENSG00000183506   58.956462
...               ...         ...
2859  ENSG00000137822   44.945589
2860  ENSG00000139668   43.490054
2861  ENSG00000166529   49.849962
2862  ENSG00000133104   37.865443
2863  ENSG00000005810   43.821066

[2864 rows x 2 columns]
count    2863.000000
mean       50.647673
std         8.533137
min        33.701188
25%        43.284007
50%        49.990377
75%        58.153797
max        70.123967
Name: gc_content, dtype: float64


In [7]:
# Calculate GC content for exons of the BULK_MCF dataset.


unique_genes = pd.read_csv('BULK_MCF_Genes.csv')

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_BULK_MCF.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000141959   64.601659
1     ENSG00000133460   54.351412
2     ENSG00000248751   62.273684
3     ENSG00000138867   58.025024
4     ENSG00000100401   58.630453
...               ...         ...
3121  ENSG00000275066   45.204628
3122  ENSG00000133104   37.865443
3123  ENSG00000139793   35.691180
3124  ENSG00000005810   43.821066
3125  ENSG00000226453   43.078461

[3126 rows x 2 columns]
count    3125.000000
mean       50.301064
std         8.578404
min        33.722839
25%        42.861671
50%        49.667534
75%        57.706367
max        73.391210
Name: gc_content, dtype: float64


In [8]:
# Calculate GC content for exons of the Ito dataset.


unique_genes = pd.read_csv('Ito_Genes.csv')

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_Ito.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000248751   62.273684
1     ENSG00000100417   59.425837
2     ENSG00000100401   58.630453
3     ENSG00000198089   56.058673
4     ENSG00000274602   58.078163
...               ...         ...
4405  ENSG00000133104   37.865443
4406  ENSG00000139793   35.691180
4407  ENSG00000127022   43.266146
4408  ENSG00000005810   43.821066
4409  ENSG00000136100   39.356815

[4410 rows x 2 columns]
count    4410.000000
mean       49.444098
std         8.419025
min        33.506045
25%        42.351537
50%        48.055537
75%        56.370814
max        73.391210
Name: gc_content, dtype: float64


In [9]:
# Calculate GC content for exons of the MCF7 dataset.


unique_genes = pd.read_csv('MCF7_Genes.csv')

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_MCF7.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000183570   54.887093
1     ENSG00000100401   58.630453
2     ENSG00000127616   55.624801
3     ENSG00000076928   61.627990
4     ENSG00000198300   43.790728
...               ...         ...
2150  ENSG00000139793   35.691180
2151  ENSG00000286724   46.164793
2152  ENSG00000127022   43.266146
2153  ENSG00000005810   43.821066
2154  ENSG00000136114   53.567488

[2155 rows x 2 columns]
count    2155.000000
mean       49.926808
std         8.435269
min        34.237605
25%        42.600826
50%        49.165012
75%        57.065833
max        72.020569
Name: gc_content, dtype: float64


In [5]:
# Calculate GC content for exons of the MCF10 dataset


unique_genes = pd.read_csv('MCF10_Genes.csv')

gc_content = {}
for gene_id in unique_genes['GeneID']:
    try:
        seq = get_exon_sequences(gene_id, exons_bed, genome)
        if seq:
            gc_content[gene_id] = calculate_gc_content(seq)
        else:
            gc_content[gene_id] = None
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")
        gc_content[gene_id] = None

# Convert GC content dictionary to DataFrame
gc_content_df = pd.DataFrame(gc_content.items(), columns=['gene_id', 'gc_content'])
print(gc_content_df)

# Save the DataFrame to a CSV file
gc_content_df.to_csv('GC_MCF10.csv', index=False)

# Print summary statistics for GC content
summary_GC_content = gc_content_df["gc_content"].describe()
print(summary_GC_content)

              gene_id  gc_content
0     ENSG00000182871   65.897481
1     ENSG00000142173   65.308919
2     ENSG00000141959   64.601659
3     ENSG00000100211   51.824818
4     ENSG00000100417   59.425837
...               ...         ...
3290  ENSG00000133104   37.865443
3291  ENSG00000139793   35.691180
3292  ENSG00000151849   41.032258
3293  ENSG00000005810   43.821066
3294  ENSG00000071564   58.623489

[3295 rows x 2 columns]
count    3295.000000
mean       50.335877
std         8.498414
min        33.853238
25%        42.836642
50%        49.831683
75%        57.739295
max        70.964029
Name: gc_content, dtype: float64
