In [2]:
import pandas as pd
import re

from Bio import SeqIO

In [3]:


# Reading files
splicing_file = 'splice_regions.bed'
genome_file = 'Homo_sapiens.GRCh38.dna.primary_assembly.fa'
significant_genes_file = 'unique_gene_ids.csv'


# Load the files
significant_genes = pd.read_csv(significant_genes_file)
genome = SeqIO.to_dict(SeqIO.parse(genome_file, 'fasta'))




# Define column names for the BED file
column_names = ['Chromosome', 'Start', 'End', 'Region', 'Score', 'Strand', 'Type', 'Transcript', 'Gene']


# Read the BED file with headers
bed_data = pd.read_csv(splicing_file, sep='\t', header=None, names=column_names, dtype=str)
bed_data['Chromosome'] = bed_data['Chromosome'].astype(str)
# Display the first few rows of the dataframe with headers
print(bed_data.head())

  Chromosome    Start      End             Region Score Strand      Type  \
0          1  1211697  1211706  1:1211625-1211703     0      -     donor   
1          1  1211622  1211645  1:1211625-1211703     0      -  acceptor   
2          1  1211935  1211944  1:1211832-1211941     0      -     donor   
3          1  1211829  1211852  1:1211832-1211941     0      -  acceptor   
4          1  1212631  1212640  1:1212138-1212637     0      -     donor   

        Transcript             Gene  
0  ENST00000379236  ENSG00000186827  
1  ENST00000379236  ENSG00000186827  
2  ENST00000379236  ENSG00000186827  
3  ENST00000379236  ENSG00000186827  
4  ENST00000379236  ENSG00000186827  


In [4]:
# Filtering the data to include only the unique of splice sites
unique_splice_sites = bed_data.drop_duplicates(subset=['Chromosome', 'Start', 'End', 'Strand', 'Type'])


# Display the DataFrame to verify
print(unique_splice_sites)

        Chromosome     Start       End                Region Score Strand  \
0                1   1211697   1211706     1:1211625-1211703     0      -   
1                1   1211622   1211645     1:1211625-1211703     0      -   
2                1   1211935   1211944     1:1211832-1211941     0      -   
3                1   1211829   1211852     1:1211832-1211941     0      -   
4                1   1212631   1212640     1:1212138-1212637     0      -   
...            ...       ...       ...                   ...   ...    ...   
2522493         21  31464032  31464055  21:31464035-31558926     0      -   
2522495         21  31118667  31118690  21:31118670-31124521     0      -   
2522522         21  31160438  31160447  21:31154426-31160444     0      -   
2522542         21  31559912  31559921  21:31464035-31559918     0      -   
2522548         21  31344131  31344140  21:31339422-31344137     0      -   

             Type       Transcript             Gene  
0           donor  EN

In [53]:
# testing on gene ENSG00000186827  

test_data = bed_data[bed_data['Gene'] == 'ENSG00000186827']

# Print the filtered rows
print(test_data)

   Chromosome    Start      End             Region Score Strand      Type  \
0           1  1211697  1211706  1:1211625-1211703     0      -     donor   
1           1  1211622  1211645  1:1211625-1211703     0      -  acceptor   
2           1  1211935  1211944  1:1211832-1211941     0      -     donor   
3           1  1211829  1211852  1:1211832-1211941     0      -  acceptor   
4           1  1212631  1212640  1:1212138-1212637     0      -     donor   
5           1  1212135  1212158  1:1212138-1212637     0      -  acceptor   
6           1  1212985  1212994  1:1212704-1212991     0      -     donor   
7           1  1212701  1212724  1:1212704-1212991     0      -  acceptor   
8           1  1213656  1213665  1:1213093-1213662     0      -     donor   
9           1  1213090  1213113  1:1213093-1213662     0      -  acceptor   
10          1  1213976  1213985  1:1213785-1213982     0      -     donor   
11          1  1213782  1213805  1:1213785-1213982     0      -  acceptor   

In [54]:
# now we check the unique splice sites of gene ENSG00000186827
test_data2 = unique_splice_sites[bed_data['Gene'] == 'ENSG00000186827']


print(test_data2)

   Chromosome    Start      End             Region Score Strand      Type  \
0           1  1211697  1211706  1:1211625-1211703     0      -     donor   
1           1  1211622  1211645  1:1211625-1211703     0      -  acceptor   
2           1  1211935  1211944  1:1211832-1211941     0      -     donor   
3           1  1211829  1211852  1:1211832-1211941     0      -  acceptor   
4           1  1212631  1212640  1:1212138-1212637     0      -     donor   
5           1  1212135  1212158  1:1212138-1212637     0      -  acceptor   
6           1  1212985  1212994  1:1212704-1212991     0      -     donor   
7           1  1212701  1212724  1:1212704-1212991     0      -  acceptor   
8           1  1213656  1213665  1:1213093-1213662     0      -     donor   
9           1  1213090  1213113  1:1213093-1213662     0      -  acceptor   
10          1  1213976  1213985  1:1213785-1213982     0      -     donor   
11          1  1213782  1213805  1:1213785-1213982     0      -  acceptor   

  test_data2 = unique_splice_sites[bed_data['Gene'] == 'ENSG00000186827']


In [5]:
# function to sequence
def get_sequence(chromosome, start, end, strand):
    sequence = genome[chromosome].seq[start:end]
    if strand == '-':
        sequence = sequence.reverse_complement()
    return str(sequence)

In [6]:
# function to extract the 5 prime sequences
def extract_5_prime_sequences(gene_id):
    sequences = []
    gene_splice_sites = unique_splice_sites[(unique_splice_sites['Gene'] == gene_id) & (unique_splice_sites['Type'] == 'donor')]
    for idx, row in gene_splice_sites.iterrows():
        sequence = get_sequence(row['Chromosome'], int(row['Start']), int(row['End']), row['Strand'])
        sequences.append(sequence)
    return sequences

In [57]:
all_5_prime_sequences = []

for gene_id in significant_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

In [58]:
# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_sequences_Significant.txt', index=False, sep='\t')

In [7]:
# function to extract the 3 prime sequences
def extract_3_prime_sequences(gene_id):
    sequences = []
    gene_splice_sites = unique_splice_sites[(unique_splice_sites['Gene'] == gene_id) & (unique_splice_sites['Type'] == 'acceptor')]
    for idx, row in gene_splice_sites.iterrows():
        sequence = get_sequence(row['Chromosome'], int(row['Start']), int(row['End']), row['Strand'])
        sequences.append(sequence)
    return sequences

In [60]:
all_3_prime_sequences = []

for gene_id in significant_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_sequences_Significant.txt', index=False, sep='\t')

In [30]:
# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 3_prime_sequences_Significant.txt and 5_prime_sequences_Significant.txt

In [None]:
# We extract the values from both files, and then we combine them into one file so we can compare them to the unaffected genes.

In [62]:

# Read the prime 5 file content
with open('5_prime_sequences_Significant.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5significantGenes.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')

# SSS = Splice Site Strength

In [63]:
# Read the prime 3 file content
with open('3_prime_sequences_Significant.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3significantGenes.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')

In [64]:
# combine the two files
file1 = 'SSS_3significantGenes.txt'
file2 = 'SSS_5significantGenes.txt'
combined_file = 'SSS_significantGenes.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [65]:
# The same process is done for the unaffected genes

unaffected_genes_file = 'Unaffected_Genes.csv'

# Load the files
unaffected_genes = pd.read_csv(unaffected_genes_file)




In [66]:
#extracting the 5 prime splice sites sequences of the unaffected genes

all_5_prime_sequences = []

for gene_id in unaffected_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_sequences_Unaffected.txt', index=False, sep='\t')

In [43]:
all_3_prime_sequences = []

for gene_id in unaffected_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_sequences_Unaffected.txt', index=False, sep='\t')

In [None]:
# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 3_prime_sequences_Unaffected.txt and 5_prime_sequences_Unaffected.txt
# We extract the values from both files, and then we combine them into one file so we can compare them to the significant genes.

In [67]:
# Read the prime 5 file content
with open('5_prime_sequences_Unaffected.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5UnaffectedGenes.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


In [69]:
# Read the prime 3 file content
with open('3_prime_sequences_Unaffected.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3UnaffectedGenes.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')

In [70]:
# combine the two files
file1 = 'SSS_3UnaffectedGenes.txt'
file2 = 'SSS_5UnaffectedGenes.txt'
combined_file = 'SSS_UnaffectedGenes.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [None]:
# Now we calculate the splice sites strength based on the different alternative splicing events of the significant genes. 

In [8]:
# skipped exon alternative splicing.

unique_genes = pd.read_csv('SkippedExon_Genes.txt', header=None, names=['GeneID'])




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_SKippedExons.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_SkippedExons.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 5_prime_SKippedExons.txt and 3_prime_SKippedExons.txt
# We extract the values from both files, and then we combine them into one file 



In [10]:
# Read the prime 5 file content
with open('5_prime_SKippedExons.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5SkippedExons.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_SKippedExons.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3SkippedExons.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5SkippedExons.txt'
file2 = 'SSS_3SkippedExons.txt'
combined_file = 'SSS_SkippedExons.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [11]:
# Mutually Exclusive alternative splicing.

unique_genes = pd.read_csv('MutuallyExclusive_Genes.txt', header=None, names=['GeneID'])




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_MutuallyExclusive.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_MutuallyExclusive.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 3_prime_MutuallyExclusive.txt and 5_prime_MutuallyExclusive.txt
# We extract the values from both files, and then we combine them into one file 




In [12]:
# Read the prime 5 file content
with open('5_prime_MutuallyExclusive.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5MutuallyExclusive.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_MutuallyExclusive.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3MutuallyExclusive.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5MutuallyExclusive.txt'
file2 = 'SSS_3MutuallyExclusive.txt'
combined_file = 'SSS_MutuallyExclusive.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [13]:
#  intron retention alternative splicing.

unique_genes = pd.read_csv('IntronRetention_Genes.txt', header=None, names=['GeneID'])




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_IntronRetention.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_IntronRetention.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 3_prime_IntronRetention.txt and 5_prime_IntronRetention.txt
# We extract the values from both files, and then we combine them into one file 




In [14]:
# Read the prime 5 file content
with open('5_prime_IntronRetention.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5IntronRetention.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_IntronRetention.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3IntronRetention.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5IntronRetention.txt'
file2 = 'SSS_3IntronRetention.txt'
combined_file = 'SSS_IntronRetention.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [15]:
# 3' and 5' alternative splicing.

unique_genes = pd.read_csv('3_5_PrimeGenes.txt', header=None, names=['GeneID'])




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_3_5_Prime.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_3_5_Prime.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 5_prime_3_5_Prime.txt and 3_prime_3_5_Prime.txt
# We extract the values from both files, and then we combine them into one file 




In [16]:
# Read the prime 5 file content
with open('5_prime_3_5_Prime.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5_3_5_Prime.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_3_5_Prime.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3_3_5_Prime.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5_3_5_Prime.txt'
file2 = 'SSS_3_3_5_Prime.txt'
combined_file = 'SSS_3_5_Prime.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [None]:
# Now we calculate the splice sites strength based on the different datasets of cell lines 
# Datasets are AU565, BULK_MCF, Ito, MCF7, and MCF10.  

In [8]:
# AU565 dataset

unique_genes = pd.read_csv('AU565_Genes.csv')




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_AU565.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_AU565.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 5_prime_AU565.txt and 3_prime_AU565.txt
# We extract the values from both files, and then we combine them into one file 

In [9]:
# Read the prime 5 file content
with open('5_prime_AU565.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5_AU565.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_AU565.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3_AU565.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5_AU565.txt'
file2 = 'SSS_3_AU565.txt'
combined_file = 'SSS_AU565.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [10]:
# BULK_MCF dataset

unique_genes = pd.read_csv('BULK_MCF_Genes.csv')




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_BULK_MCF.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_BULK_MCF.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 5_prime_BULK_MCF.txt and 3_prime_BULK_MCF.txt
# We extract the values from both files, and then we combine them into one file 

In [11]:
# Read the prime 5 file content
with open('5_prime_BULK_MCF.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5_BULK_MCF.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_BULK_MCF.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3_BULK_MCF.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5_BULK_MCF.txt'
file2 = 'SSS_3_BULK_MCF.txt'
combined_file = 'SSS_BULK_MCF.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [12]:
# Ito dataset

unique_genes = pd.read_csv('Ito_Genes.csv')




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_Ito.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_Ito.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 5_prime_Ito.txt and 3_prime_Ito.txt
# We extract the values from both files, and then we combine them into one file 

In [13]:
# Read the prime 5 file content
with open('5_prime_Ito.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5_Ito.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_Ito.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3_Ito.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5_Ito.txt'
file2 = 'SSS_3_Ito.txt'
combined_file = 'SSS_Ito.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [14]:
# MCF7 dataset

unique_genes = pd.read_csv('MCF7_Genes.csv')




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_MCF7.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_MCF7.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 5_prime_MCF7.txt and 3_prime_MCF7.txt
# We extract the values from both files, and then we combine them into one file 

In [15]:
# Read the prime 5 file content
with open('5_prime_MCF7.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5_MCF7.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_MCF7.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3_MCF7.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5_MCF7.txt'
file2 = 'SSS_3_MCF7.txt'
combined_file = 'SSS_MCF7.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)

In [17]:
# MCF10 dataset

unique_genes = pd.read_csv('MCF10_Genes.csv')




#extracting the 5 prime splice sites sequences 
all_5_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_5_prime_sequences(gene_id)
        all_5_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")



# Convert to DataFrame
sequences_df = pd.DataFrame(all_5_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('5_prime_MCF10.txt', index=False, sep='\t')




#extracting the 3 prime splice sites sequences
all_3_prime_sequences = []

for gene_id in unique_genes['GeneID']:
    try:
        sequences = extract_3_prime_sequences(gene_id)
        all_3_prime_sequences.extend(sequences)  # Only add the sequences
    except Exception as e:
        print(f"Error processing {gene_id}: {e}")

# Convert to DataFrame
sequences_df = pd.DataFrame(all_3_prime_sequences, columns=['sequence'])

# Save to TXT
sequences_df.to_csv('3_prime_MCF10.txt', index=False, sep='\t')




# The 3' and 5' splice site sequences are analyzed for splice site strength using the following two websites.
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
# http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html
# the results are saved in 5_prime_MCF10.txt and 3_prime_MCF10.txt
# We extract the values from both files, and then we combine them into one file 

In [18]:
# Read the prime 5 file content
with open('5_prime_MCF10.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_5_MCF10.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')


# Read the prime 3 file content
with open('3_prime_MCF10.txt', 'r') as file:
    content = file.read()

# Use regular expression to find all MAXENT values
pattern = r'MAXENT:\s*(-?\d*\.?\d+)'

values = re.findall(pattern, content)


# Convert values to float 
values = [float(value) for value in values]

with open('SSS_3_MCF10.txt', 'w') as output_file:
    for value in values:
        output_file.write(f'{value}\n')



# combine the two files
file1 = 'SSS_5_MCF10.txt'
file2 = 'SSS_3_MCF10.txt'
combined_file = 'SSS_MCF10.txt'

# Open the files in read mode
with open(file1, 'r') as f1, open(file2, 'r') as f2:
    # Read the contents of the files
    file1_contents = f1.readlines()
    file2_contents = f2.readlines()

# Open the combined file in write mode
with open(combined_file, 'w') as f_combined:
    # Write the contents of the first file
    f_combined.writelines(file1_contents)
    # Write the contents of the second file
    f_combined.writelines(file2_contents)