In [3]:
import csv
import math
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [33]:
def _load_uniqueness_table(file):
    """Read a count file and return one row per k-mer size with its unique fraction.

    The file is parsed as header-less, whitespace-separated ``Type Value``
    pairs. Rows are consumed in consecutive groups of three whose ``Value``
    entries are taken, in order, as the k-mer size, the total count and the
    unique count. The ``Type`` column is read but never inspected.

    Parameters
    ----------
    file : str or path-like
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``kmerSize``, ``total``, ``unique`` and ``fraction``
        (``unique / total``), one row per group of three input rows.

    Raises
    ------
    IndexError
        If the number of rows is not a multiple of three (a truncated record).
    """
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    data = pd.read_csv(file, header=None, sep=r'\s+', names=['Type', 'Value'])

    if len(data) % 3 != 0:
        # A truncated record used to surface as an out-of-bounds positional lookup;
        # the exception type is kept, only the message is made actionable.
        raise IndexError(
            f"{file}: expected rows in groups of 3 (k-mer size, total, unique), "
            f"got {len(data)} rows"
        )

    # Strided slices pick every 1st/2nd/3rd row of each group in one pass,
    # instead of growing a DataFrame row by row.
    values = data['Value'].to_numpy()
    df = pd.DataFrame({
        'kmerSize': values[0::3],
        'total': values[1::3],
        'unique': values[2::3],
    })
    df['total'] = pd.to_numeric(df['total'])
    df['unique'] = pd.to_numeric(df['unique'])
    df['fraction'] = df['unique'] / df['total']
    return df


def plot_fraction_economist_style(file1, file2, number):
    """Plot, save and show the unique/total fraction per k-mer size for two count files.

    Both files are parsed with ``_load_uniqueness_table`` and drawn as two
    bar series on one figure: ``file1`` labelled ``k-mer`` and ``file2``
    labelled ``ry-mer``. The figure is written to
    ``kmer_uniqueness/kmer_vs_rymer_{number}.png`` (relative to the current
    working directory; the directory is created if missing) and then shown.

    Parameters
    ----------
    file1 : str or path-like
        Count file plotted as the ``k-mer`` series.
    file2 : str or path-like
        Count file plotted as the ``ry-mer`` series.
    number : object
        Interpolated into the plot title and the output file name.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If either file does not contain a whole number of three-row records.
    """
    df1 = _load_uniqueness_table(file1)
    df2 = _load_uniqueness_table(file2)

    plt.figure(figsize=(8, 5))
    # These seaborn calls change global style/palette state, not just this figure.
    sns.set_style("whitegrid")
    sns.set_palette("pastel")

    # Both series use the k-mer size as x position, so bars for a size present in
    # both files are drawn on top of each other; alpha=0.6 makes them translucent.
    plt.bar(df1['kmerSize'], df1['fraction'], color="skyblue", edgecolor='black', alpha=0.6, label='k-mer')
    plt.bar(df2['kmerSize'], df2['fraction'], color="orange", edgecolor='black', alpha=0.6, label='ry-mer')

    plt.xlabel("k-mer size", fontsize=16)
    plt.ylabel('Fraction (unique/total)', fontsize=16)
    plt.title(f"Fraction of unique k-mers and RY-mers in {number} genomes", fontsize=16, fontweight='bold')
    # Ticks are fixed at the even values 8, 10, ..., 40 regardless of the data.
    plt.xticks(np.arange(8, 42, 2), rotation=0)
    plt.legend(fontsize=14)

    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists.
    out_dir = 'kmer_uniqueness'
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(f'{out_dir}/kmer_vs_rymer_{number}.png', dpi=300, bbox_inches="tight")
    plt.show()

In [None]:
# Render the k-mer vs RY-mer uniqueness plot for the bac1k count files;
# "1000" is interpolated into the title and the output file name.
plot_fraction_economist_style(
    '/home/projects/metagnm_asm/paper/countKmers/kmercounts/bac1k/kmer_out.txt',
    '/home/projects/metagnm_asm/paper/countKmers/kmercounts/bac1k/rymer_out.txt',
    "1000",
)