In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

path = "../results/aligned_genes_summary.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(path)

# Convert the 'Mapped Chromosome' to a categorical type for proper sorting
df['Mapped Chromosome'] = pd.Categorical(df['Mapped Chromosome'], 
                                          categories=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 
                                                      'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 
                                                      'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY'], 
                                          ordered=True)

# Group by Gene
grouped = df.groupby('Gene')

# Function to display specific gene tables
def display_gene_table(gene=None):
    if gene:
        # Display only the selected gene
        if gene in grouped.groups:
            group = grouped.get_group(gene)
            sorted_group = group.sort_values(by='Mapped Chromosome')
            if len(sorted_group) > 1:  # Ensure that there are at least two rows
                print(f"\nGene: {gene}, ({sorted_group.iloc[1][['Chromosome', 'Position']].to_dict()})")
            else:
                print(f"\nGene: {gene} does not have enough rows to display.")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position']])
        else:
            print(f"Gene '{gene}' not found.")
    else:
        # Display all genes if no specific gene is chosen
        for gene, group in grouped:
            sorted_group = group.sort_values(by='Mapped Chromosome')
            if len(sorted_group) > 1:  # Ensure that there are at least two rows
                print(f"\nGene: {gene} ({sorted_group.iloc[1][['Chromosome', 'Position']].to_dict()})")
            else:
                print(f"\nGene: {gene} does not have enough rows to display.")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position']])

# Example usage:
# To print all genes
display_gene_table()

# To print a specific gene (e.g., 'BLACE')
# display_gene_table('RNU6-9')



Gene: BLACE ({'Chromosome': 'chr7', 'Position': '155367809-155367933'})
   Mapped Chromosome  Mapped Position
0               chr1         34246237
11              chr2        230124893
14              chr3        109484133
15              chr4         86253850
16              chr5         44688516
17              chr6        129640543
18              chr7        160791435
19              chr8         31930484
20              chr9         95861852
1              chr10        100403328
2              chr11         76641106
3              chr12         98899542
4              chr13         46452310
5              chr14         66529225
6              chr15         91751148
7              chr16          5444152
8              chr17         34021640
9              chr18         13474122
10             chr19         22846119
12             chr20         49102948
13             chr22         41883530

Gene: FAM30A does not have enough rows to display.
   Mapped Chromosome  Mapped Position
2

In [37]:
import pandas as pd

# Path to your input CSV file
path = "../data/BLASTn_results_summary.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(path)

# Extract mapped chromosome from 'Hit Title'
df['Mapped Chromosome'] = df['Hit Title'].str.extract(r'chromosome (\d+|X|Y)', expand=False)

# Convert 'Mapped Chromosome' to a categorical type to sort chromosomes numerically and 'X', 'Y' last
chromosome_order = [str(i) for i in range(1, 23)] + ['X', 'Y']
df['Mapped Chromosome'] = pd.Categorical(df['Mapped Chromosome'], categories=chromosome_order, ordered=True)

# Convert 'Mapped Position' to numeric if needed (e.g., if it's stored as a string in the CSV)
df['Mapped Position'] = pd.to_numeric(df['Mapped Position'], errors='coerce')

# Clean up other columns
df['Query Cover'] = df['Query Cover'].str.rstrip('%').astype(float)
df['E value'] = df['E value'].astype(float)
df['Per. ident'] = df['Per. ident'].astype(float)

# Group by Gene
grouped = df.groupby('GENE')

# Function to display specific gene tables
def display_gene_table(gene=None):
    if gene:
        # Display only the selected gene
        if gene in grouped.groups:
            group = grouped.get_group(gene)
            sorted_group = group.sort_values(by=['Mapped Chromosome', 'Mapped Position'], na_position='last')
            print(f"\nGene: {gene}")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position', 'Query Cover', 'E value', 'Per. ident']])
        else:
            print(f"Gene '{gene}' not found.")
    else:
        # Display all genes if no specific gene is chosen
        for gene, group in grouped:
            sorted_group = group.sort_values(by=['Mapped Chromosome', 'Mapped Position'], na_position='last')
            print(f"\nGene: {gene}")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position', 'Query Cover', 'E value', 'Per. ident']])

# Example usage:
# To print all genes
display_gene_table()

# To print a specific gene (e.g., 'BLACE')
# display_gene_table('RNU6-9')



Gene: BLACE
    Mapped Chromosome  Mapped Position  Query Cover       E value  Per. ident
214                 1        143637945        100.0  2.230540e-57        99.2
216                 2        222897069        100.0  1.037770e-55        99.2
221                 3        183608210        100.0  1.342440e-54        99.2
217                 4         76916963        100.0  1.037770e-55        98.4
229                 5         43088267        100.0  8.079390e-52        96.8
227                 6          5037910        100.0  2.246370e-52        97.6
213                 7        155367809        100.0  4.794220e-59       100.0
218                 8         30247028        100.0  1.037770e-55        98.4
233                 9         92831559        100.0  3.758970e-50        97.6
231                10         69992977        100.0  1.045130e-50        96.0
230                11          8978287        100.0  2.905860e-51        97.6
222                12         26785216        100.0