In [17]:
# Pangenome mapping of genes:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

path = "../results/aligned_genes_summary.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(path)

# Convert the 'Mapped Chromosome' to a categorical type for proper sorting
df['Mapped Chromosome'] = pd.Categorical(df['Mapped Chromosome'], 
                                          categories=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 
                                                      'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 
                                                      'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY'], 
                                          ordered=True)

# Group by Gene
grouped = df.groupby('Gene')

# Function to display specific gene tables
def display_gene_table(gene=None):
    if gene:
        # Display only the selected gene
        if gene in grouped.groups:
            group = grouped.get_group(gene)
            sorted_group = group.sort_values(by='Mapped Chromosome')
            print(f"\nGene: {gene}, ({sorted_group.iloc[1][['Chromosome', 'Position']].to_dict()})")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position']])
        else:
            print(f"Gene '{gene}' not found.")
    else:
        # Display all genes if no specific gene is chosen
        for gene, group in grouped:
            sorted_group = group.sort_values(by='Mapped Chromosome')
            print(f"\nGene: {gene}({sorted_group.iloc[1][['Chromosome', 'Position']].to_dict()})")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position']])

# Example usage:
# To print all genes
# display_gene_table()

# To print a specific gene (e.g., 'BLACE')
display_gene_table('BLACE')




Gene: BLACE, ({'Chromosome': 'chr7', 'Position': '155367809-155367933'})
   Mapped Chromosome  Mapped Position
0               chr1         34246237
11              chr2        230124893
14              chr3        109484133
15              chr4         86253850
16              chr5         44688516
17              chr6        129640543
18              chr7        160791435
19              chr8         31930484
20              chr9         95861852
1              chr10        100403328
2              chr11         76641106
3              chr12         98899542
4              chr13         46452310
5              chr14         66529225
6              chr15         91751148
7              chr16          5444152
8              chr17         34021640
9              chr18         13474122
10             chr19         22846119
12             chr20         49102948
13             chr22         41883530


In [None]:
import pandas as pd

# Path to your input CSV file
path = "../data/BLASTn_results_summary.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(path)

# Extract mapped chromosome from 'Hit Title'
df['Mapped Chromosome'] = df['Hit Title'].str.extract(r'chromosome (\d+|X|Y)', expand=False)

# Convert 'Mapped Position' to numeric if needed (e.g., if it's stored as a string in the CSV)
df['Mapped Position'] = pd.to_numeric(df['Mapped Position'], errors='coerce')

# Clean up other columns
df['Query Cover'] = df['Query Cover'].str.rstrip('%').astype(float)
df['E value'] = df['E value'].astype(float)
df['Per. ident'] = df['Per. ident'].astype(float)

# Group by Gene
grouped = df.groupby('GENE')

# Function to display specific gene tables
def display_gene_table(gene=None):
    if gene:
        # Display only the selected gene
        if gene in grouped.groups:
            group = grouped.get_group(gene)
            sorted_group = group.sort_values(by=['Mapped Chromosome', 'Mapped Position'])
            print(f"\nGene: {gene}")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position', 'Query Cover', 'E value', 'Per. ident']])
        else:
            print(f"Gene '{gene}' not found.")
    else:
        # Display all genes if no specific gene is chosen
        for gene, group in grouped:
            sorted_group = group.sort_values(by=['Mapped Chromosome', 'Mapped Position'])
            print(f"\nGene: {gene}")
            print(sorted_group[['Mapped Chromosome', 'Mapped Position', 'Query Cover', 'E value', 'Per. ident']])

# Example usage:
# To print all genes
# display_gene_table()

# To print a specific gene (e.g., 'BLACE')
display_gene_table('BLACE')



Gene: BLACE
    Mapped Chromosome Mapped Position  Query Cover       E value  Per. ident
214                 1            None        100.0  2.230540e-57        99.2
231                10            None        100.0  1.045130e-50        96.0
230                11            None        100.0  2.905860e-51        97.6
222                12            None        100.0  4.828260e-54        98.4
232                13            None        100.0  1.045130e-50        96.8
223                14            None        100.0  4.828260e-54        98.4
224                15            None        100.0  4.828260e-54        97.6
219                16            None        100.0  1.037770e-55        98.4
215                17            None        100.0  2.230540e-57        99.2
234                18            None        100.0  2.262320e-47        94.4
228                19            None        100.0  2.246370e-52        96.8
216                 2            None        100.0  1.037770e-5