In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

This notebook analyzes context-specific mutation distributions in high-LLR (sum_llrs > 6) SARS-CoV-2 sequences, using Australia (a MOV-associated country) and France (a non-MOV-associated country) as examples. It also includes a temporal analysis of the proportion of high-LLR sequences across all countries, highlighting a peak in 2023 that is consistent with MOV activity.

In [3]:
# Load the three tab-separated inputs. read_table uses "\t" as its default
# separator, so no explicit sep is needed.
# NOTE(review): the *_t6 files appear to hold only sequences with sum_llrs > 6
# (see the previews below) — confirm against how they were generated.
fra_df = pd.read_table("/Users/reem/Mov/france_seqs_t6.tsv")
aus_df = pd.read_table("/Users/reem/Mov/aus_seqs_t6.tsv")
df = pd.read_table("/Users/reem/Mov/sum_llrs_country.tsv")


In [None]:
# Preview the first five rows of the France table
fra_df.head(n=5)


Unnamed: 0.1,Unnamed: 0,seqName,LLR,G>A_llr,C>T_llr,A>G_llr,T>C_llr,sum_llrs,country,year
0,534816,hCoV-19/France/ARA-HMN-22072210310/2022,4.269077,,1.219969,,0.545081,6.034127,France,2022
1,1942573,hCoV-19/France/PDL-IPP17744/2021,4.339576,1.461931,1.040008,,,6.841515,France,2021
2,2034542,hCoV-19/France/PAC-LBZCentre-E22062900069/2022,5.919112,1.507319,0.073663,0.165894,0.545081,8.211068,France,2022
3,3021504,hCoV-19/France/ARA-ERFB-2071606492/2021,2.957069,1.532373,0.48903,0.680822,0.545081,6.204374,France,2021
4,3762862,hCoV-19/France/PAC-IHU-23023-N1/2021,2.986585,4.048358,1.361237,0.089892,,8.486071,France,2021


In [5]:
# Preview the first five rows of the Australia table
aus_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,seqName,LLR,G>A_llr,C>T_llr,A>G_llr,T>C_llr,sum_llrs,country,year
0,27375,hCoV-19/Australia/VIC70446/2022,6.312438,4.260251,0.579955,2.402591,1.429786,14.985023,Australia,2022
1,30691,hCoV-19/Australia/NSW-ICPMR-47243/2023,6.579249,0.274257,0.482328,-0.573054,0.211257,6.974036,Australia,2023
2,51330,hCoV-19/Australia/VIC72734/2022,10.556882,1.799706,0.668536,0.165894,-0.333824,12.857194,Australia,2022
3,66864,hCoV-19/Australia/NSW-SAVID-14313/2022,8.076283,6.980604,1.367416,0.205329,-0.239566,16.390067,Australia,2022
4,136872,hCoV-19/Australia/NSW-SAVID-9772/2022,7.19548,3.644763,1.92479,1.114358,0.995856,14.875247,Australia,2022


In [6]:
# Per-sequence total of the four context-specific LLR columns.
# DataFrame.sum skips NaN by default, so a missing context contributes 0.
context_llr_cols = ["G>A_llr", "C>T_llr", "A>G_llr", "T>C_llr"]
df["sum_contexts"] = df[context_llr_cols].sum(axis=1)
fra_df["sum_contexts"] = fra_df[context_llr_cols].sum(axis=1)

# Only the last expression of a cell is rendered, so the preview goes last
# (previously it sat mid-cell and was never displayed).
fra_df["sum_contexts"]

In [8]:
# Row-wise total of the four context-specific LLR columns for the Australia
# table (NaN entries are skipped by sum), followed by a preview of the result.
aus_context_llrs = aus_df[["G>A_llr", "C>T_llr", "A>G_llr", "T>C_llr"]]
aus_df["sum_contexts"] = aus_context_llrs.sum(axis="columns")
aus_df["sum_contexts"]

0      8.672584
1      0.394787
2      2.300312
3      8.313784
4      7.679767
         ...   
533    4.736295
534    3.314053
535    8.737511
536    6.023327
537    3.303506
Name: sum_contexts, Length: 538, dtype: float64

In [4]:
# Number of rows per country, showing the 20 most frequent countries
country_counts = df['country'].value_counts()
country_counts.head(20)

country
USA            5299802
England        2466851
Germany         958052
Canada          819120
Japan           700725
France          690124
Denmark         680655
Scotland        389962
India           301993
Spain           297082
Sweden          286858
Austria         268556
Brazil          267434
Wales           254104
Australia       249818
Italy           207812
Netherlands     204001
Belgium         183599
Switzerland     168994
Israel          161133
Name: count, dtype: int64

In [51]:
# Overlaid, density-normalised histograms of sum_contexts for the Australian
# and French rows of df whose sum_llrs exceeds 6.
in_aus_or_fra = df["country"].isin(["Australia", "France"])
is_high_llr = df['sum_llrs'] > 6

fig = px.histogram(
    df[in_aus_or_fra & is_high_llr],
    x="sum_contexts",
    color="country",
    nbins=40,
    histnorm="probability density",
    barmode="overlay",
)

# Figure styling is kept together so it can be tweaked in one place
layout_options = {
    "title": "Distribution of sum_contexts in Australia vs France",
    "xaxis_title": "sum_contexts",
    "yaxis_title": "Density",
    "template": 'simple_white',
    "font": {"family": 'Arial', "size": 14},
    "width": 800,
    "height": 500,
    "legend": {"title": None},
}
fig.update_layout(**layout_options)

fig.show()

In [50]:

# Normalise the year column to a four-character string in 2019-2025.
# Values containing none of those years become NaN.
# expand=False makes str.extract return a Series for the single capture group.
year_text = df["year"].astype(str)
df["year"] = year_text.str.extract(r"(2019|2020|2021|2022|2023|2024|2025)", expand=False)
print(df['year'].unique())


['2021' '2022' '2023' '2020' '2025' '2024' nan '2019']


In [69]:
# List every distinct value found in the country column
distinct_countries = df['country'].unique()
print(distinct_countries)

['USA' 'Spain' 'Germany' 'France' 'Sweden' 'India' 'England' 'Bulgaria'
 'Czech Republic' 'Slovenia' 'Canada' 'Japan' 'South Korea' 'Belgium'
 'Mexico' 'Australia' 'Denmark' 'Italy' 'Russia' 'Lithuania' 'Israel'
 'Ireland' 'Switzerland' 'Turkey' 'Myanmar' 'Austria' 'Greece' 'Croatia'
 'Wales' 'Thailand' 'Gibraltar' 'Slovakia' 'Netherlands' 'Malaysia'
 'Norway' 'Northern Ireland' 'Scotland' 'Qatar' 'Estonia' 'Crimea'
 'Namibia' 'Tunisia' 'Brazil' 'Poland' 'Bonaire' 'Botswana'
 'Inner Mongolia' 'Hubei' 'Vietnam' 'New Zealand' 'Peru' 'Indonesia'
 'Congo' 'Singapore' 'Finland' 'Portugal' 'Ecuador' 'Guatemala' 'Bahrain'
 'Latvia' 'French Guiana' 'Ghana' 'Chongqing' 'Reunion' 'South Africa'
 'Iceland' 'Philippines' 'Guangdong' 'Nigeria' 'Burkina Faso' 'Sichuan'
 'Lebanon' 'Chile' 'Senegal' 'env' 'Liaoning' 'Yunnan' 'Montenegro'
 'Taiwan' 'Ukraine' 'Hong Kong' 'Cambodia' 'Puerto Rico' 'Egypt' 'Romania'
 'Liechtenstein' 'Africa' 'Morocco' 'Qinghai' 'Shanghai' 'Luxembourg'
 'Venezuela' 'Seychel

In [18]:
# (total rows, rows with sum_llrs >= 6).
# Returned as one tuple because only a cell's last expression is rendered;
# a bare len(df) on its own line was never shown.
# Summing the boolean mask avoids copying the filtered frame just to count it.
len(df), int((df['sum_llrs'] >= 6).sum())

2200

In [None]:

def calculate_Mov_proportions(df, threshold=6):
    """Return, per country, the fraction of rows whose ``sum_llrs`` exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``country`` and ``sum_llrs``.
    threshold : float, default 6
        Rows with ``sum_llrs`` strictly greater than this value are counted
        in the numerator.

    Returns
    -------
    pandas.Series
        Indexed by country. Countries with no row above the threshold get 0.
        Rows with a missing ``sum_llrs`` count toward the denominator only.
    """
    high_llr_rows = df[df['sum_llrs'] > threshold]
    total_per_country = df.groupby('country').size()
    high_per_country = high_llr_rows.groupby('country').size()
    # Countries without any high-LLR row are absent from high_per_country, so
    # the index-aligned division yields NaN for them; report those as 0.
    return (high_per_country / total_per_country).fillna(0)

# Proportion of sequences above the LLR threshold for every (year, country).
# groupby drops rows whose year is NaN by default.
result = df.groupby('year').apply(calculate_Mov_proportions)

# groupby.apply returns a wide DataFrame (year rows x country columns) instead
# of a (year, country)-indexed Series when every year yields the same country
# index. Stack it so the result is long-format in both cases; otherwise
# rename() with a string would fail on a DataFrame.
if isinstance(result, pd.DataFrame):
    result = result.stack()

results = result.rename('Mov_proportions')
results.to_csv("/Users/reem/Mov/results.tsv", sep="\t")







In [41]:
# Spot check for Brazil in 2022: total rows vs. rows with sum_llrs > 6.
# The comparison with the string '2022' relies on year having been
# normalised to strings beforehand.
Brazil = df[(df['country'] == 'Brazil') & (df['year'] == '2022')]
print(len(Brazil))
# Brazil is already restricted to 2022, so only the LLR threshold is applied
Brazil_Mov = Brazil[Brazil['sum_llrs'] > 6]
print(len(Brazil_Mov))

102808
1


In [14]:
# Count the rows at or above the threshold, then report both figures
high_llr_row_count = (df['sum_llrs'] >= 6).sum()
total_row_count = len(df)
print("Rows with sum_llrs >= 6:", high_llr_row_count)
print("Total rows:", total_row_count)

Rows with sum_llrs >= 6: 2200
Total rows: 16857152


In [13]:
# Show the distinct values currently held in the year column
years_present = df['year'].unique()
print(years_present)

[2021 2022 2023 2020 2025 2024]


In [78]:
# Preview the first five rows of the all-countries table
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,seqName,LLR,G>A_llr,C>T_llr,A>G_llr,T>C_llr,sum_llrs,country,year,sum_contexts
0,0,hCoV-19/USA/CA-CDPH-500004296/2021,0.133011,,0.069622,0.004225,0.156313,0.36317,USA,2021,0.230159
1,1,hCoV-19/Spain/CL-COV01948/2021,-1.049009,,0.480353,0.319879,,-0.248777,Spain,2021,0.800232
2,2,hCoV-19/USA/OR-OHSU-213401246/2021,1.231103,,-0.505227,-1.811061,-0.447496,-1.532681,USA,2021,-2.763784
3,3,hCoV-19/Germany/SL-RKI-I-1077947/2022,-0.210447,,,0.004225,,-0.206223,Germany,2022,0.004225
4,4,hCoV-19/USA/MA-CDCBI-CRSP_HGQQM7RZS5PYBHGU/2022,-1.07355,,-0.489601,,,-1.563151,USA,2022,-0.489601


In [None]:
# Read the saved proportions back in and draw one line per country over the years
plotly_df = pd.read_csv("/Users/reem/Mov/results.tsv", sep="\t")

line_options = {
    "x": 'year',
    "y": 'Mov_proportions',
    "color": 'country',
    "title": 'Proportion of sequences with sum_llrs > 6 per country',
    "render_mode": 'svg',
}
fig = px.line(plotly_df, **line_options)
fig.show()


In [55]:
# Same line plot, restricted to a hand-picked set of countries
countries = ['USA', 'England', 'Australia', 'Germany', 'France','Japan']
is_selected = plotly_df['country'].isin(countries)
newdf = plotly_df[is_selected]

fig = px.line(
    data_frame=newdf,
    title='Proportion of Mov_sequences per country',
    x='year',
    y='Mov_proportions',
    color='country',
)
fig.show()