In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import numpy as np
import allel
import itertools
import os
from subprocess import call
from tqdm import tqdm, trange
from scipy.stats import chi2_contingency

import statsmodels.api as sm
import statsmodels.formula.api as smf

#allows multiple outputs: all, last, last_expr(default), none, last_expr_or_assign
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr_or_assign" 

#### Step 1: make contig list

In [None]:
#First make contig list
# Header-less, comma-separated file; columns 0 and 1 are named ID and Length.
contig_list = (
    pd.read_table('/data3/arshad_PNAS_data/For_Tae/polistes_contig.txt', sep=',', header=None)
    .rename(columns={0: "ID", 1: "Length"})
)
contig_list.head(5)

In [None]:
# Keep only the text that follows 'ID=' in the ID column.
contig_list['ID'] = contig_list['ID'].str.split('ID=').str.get(1)
# Length: take the text after the first '=', cut it at the first '>', then cast to int.
length_text = contig_list['Length'].str.split('=').str.get(1)
contig_list['Length'] = length_text.str.split('>').str.get(0).astype(int)
contig_list.head(3)
contig_list.dtypes

In [None]:
contig_list['Length'].sort_values(ascending=False);

In [None]:
#remove anything smaller than single window
# NOTE(review): the name says "10k" but the cutoff applied here is 100000.
longer_than_window = contig_list['Length'] > 100000
contig_plus10k = contig_list.loc[longer_than_window]
contig_plus10k['Length'].sort_values();

In [None]:
#Output the File
#Do it once! only once!
#contig_plus10k['ID'].to_csv('/home/taeyoon/VcfFiles/LdByWindow/pol_contig_list.txt', header=False, index=False)

#### Step 2: Chop by 100k window

In [None]:
#this step uses tabix
#this first requires turning vcf to vcf.gz
# bgzip your.vcf
# tabix -p vcf your.vcf.gz <- this makes index
# tabix your.vcf.gz chr1:10,000,000-20,000,000

#modify chopbywindow.script.txt (is in VcfFiles/LdByWindows)
#this chops them into 100k windows and deposits the output into the set directory

#for window of LD test in chopped files, going with 60 for safe measure.
#No MAF cut as per original and purpose
#polistes has to run in geno-r2 (NOT hap-r2)


#### Step 3: Calculating Mean-Median R2 per Window

In [None]:
#take all per-window LD files and collect one [window, mean R^2, median R^2] row per file
ld_r2 = []

path_folder = '/data3/TaeFile/Pol_HapLd/'

# List the directory once so the progress bar total is the number of files.
# (len(path_folder) would be the length of the path string.)
# Sorting makes the row order reproducible; os.listdir order is arbitrary.
ld_files = sorted(os.listdir(path_folder))

for file in tqdm(ld_files, total=len(ld_files)):
    # Window ID is the part of the file name before '_headered'
    window = file.split('_headered')[0]
    # os.path.join copes with the trailing slash already present in path_folder
    ld_table = pd.read_csv(os.path.join(path_folder, file), sep='\t')
    mean_r2 = ld_table['R^2'].mean()
    median_r2 = ld_table['R^2'].median()
    ld_r2.append([window, mean_r2, median_r2])

#destination of the final file is in home directory = windowed_LD.csv

In [None]:
df = pd.DataFrame(ld_r2, columns = ['ID', 'r2_mean', 'r2_median'])
display(df.head())

#### Step 4: Making the Master Dataframe

In [None]:
#AA caller, based on frequency.
#common, high frequency seen as reference/ancestral
#rarer, low frequency seen as alternative/derived
def AA_caller(frequency, reference, alternative):
    """Pick one of two alleles according to which side of 0.5 `frequency` falls on.

    Returns `reference` when frequency < 0.5, `alternative` when
    frequency > 0.5, and NaN otherwise (exactly 0.5, or a NaN frequency).
    """
    if frequency < 0.5:
        return reference
    if frequency > 0.5:
        return alternative
    # Neither comparison held: no call can be made.
    return np.nan

In [None]:
#Mutation direction function "strength_classifier"
strong_bases= ['G', 'C']
weak_bases= ['A', 'T']

def strength_classifier(ancestor, derived):
    """Classify a base change by bond strength.

    Returns 'SW' for strong (G/C) -> weak (A/T), 'WS' for weak -> strong,
    and 'NN' for every other pair, including values that are not in either list.
    """
    if ancestor in strong_bases:
        return 'SW' if derived in weak_bases else 'NN'
    if ancestor in weak_bases:
        return 'WS' if derived in strong_bases else 'NN'
    return 'NN'

In [None]:
#Define frequency of the Derived State, the mutation
#if ALT = Derived, keep the original AF, which describes the frequency of the ALT
#if REF = Derived, use 1 - AF

def mutation_frequency(Derived, ALT, AF):
    """Return the frequency of the derived allele.

    AF is the frequency of ALT. When Derived is ALT the value is AF itself;
    otherwise it is 1 - AF.

    The former `else: return 'Error'` branch could never run, because
    `Derived == ALT` and `Derived != ALT` together cover every case. It was
    removed so the function only ever returns a number.
    """
    if Derived == ALT: #derived is ALT, which AF is associated with
        return AF
    # Derived is not ALT, so its frequency is the complement of AF.
    # A missing Derived (NaN) compares unequal to ALT and also lands here.
    return (1-AF)

In [None]:
#split_bar = Trimmed_table(Trimmed_table['Alt_Freq' == 0.1], Trimmed_table['Strength' == 'SW'])
#Trimmed_table['Strength'].value_counts().plot(kind='bar')
#Ancestry based on allele frequency

def barcoder(strength, frequency):
    """Label a site by mutation direction and frequency bin.

    Rare   : frequency <= 0.1 (inclusive, to be generalizable for different data).
    Common : 0.25 <= frequency <= 0.5.

    Returns 'SW-Rare', 'SW-Common', 'WS-Rare' or 'WS-Common'. Returns the
    string 'NaN' when strength is neither 'SW' nor 'WS', or when the
    frequency falls in neither bin.
    """
    if strength in ('SW', 'WS'):
        if frequency <= 0.1:
            return f'{strength}-Rare'
        if 0.25 <= frequency <= 0.5:
            return f'{strength}-Common'
    return 'NaN'

In [None]:
#execute everything
chopped_polistes = []

path_folder_2 = '/data3/TaeFile/Pol_HeaderedVcf'

# Positional column index -> name for the header-less table read below.
# Columns 9-18 are renamed as well but are not used in this cell.
vcf_column_names = {
    0:"SCAF",
    1:"POS",
    2:"Id",
    3:"REF",
    4:"ALT",
    5:"quality",
    6:"filter",
    7:"INFO",
    8:"header",
    9:"1", 10:"10", 11:"11", 12:"2b", 13:"3", 14:"4", 15:"5", 16:"6", 17:"7", 18:"8"}

column_picks= ["SCAF", "POS", "REF", "ALT", "INFO"]

# List the directory once so the progress bar total is the number of files.
# (len(path_folder_2) would be the length of the path string.)
# Sorting makes the row order reproducible; os.listdir order is arbitrary.
vcf_files = sorted(os.listdir(path_folder_2))

for file in tqdm(vcf_files, total=len(vcf_files)):
    window = file.split('_headered')[0]

    # Process the file ('#' lines are skipped as comments)
    df_basket = pd.read_table(f'{path_folder_2}/{file}', sep ='\t', header=None, comment='#')
    df_basket = df_basket.rename(columns=vcf_column_names)

    # .copy() gives an independent frame, so the column assignments below
    # never write into a slice of df_basket (SettingWithCopyWarning).
    df_basket_picks = df_basket[column_picks].copy()

    # Get Allele Frequency: the text between 'AF=' and the next ';' in INFO
    df_basket_picks['AF'] = df_basket_picks['INFO'].str.split('AF=').str.get(1).str.split(';').str.get(0).astype(float)
    df_basket_picks = df_basket_picks.drop(columns=['INFO'])

    #Drop Allele Frequency of 0 and 1
    not_fixed = (df_basket_picks['AF'] != 1.0) & (df_basket_picks['AF'] != 0)
    df_basket_picks = df_basket_picks[not_fixed].copy()

    # Zipped comprehensions replace row-wise DataFrame.apply: same values per
    # row, and a window with no rows left still yields plain (empty) columns.
    ref_col = df_basket_picks['REF']
    alt_col = df_basket_picks['ALT']
    af_col = df_basket_picks['AF']

    #AA base calling (Derived is the same call with REF and ALT swapped)
    df_basket_picks["AA"] = [AA_caller(af, ref, alt)
                             for af, ref, alt in zip(af_col, ref_col, alt_col)]
    df_basket_picks["Derived"] = [AA_caller(af, alt, ref)
                                  for af, ref, alt in zip(af_col, ref_col, alt_col)]

    #Mutation direction
    # NOTE(review): direction is classified from REF -> ALT, not AA -> Derived,
    # although strength_classifier's parameters are (ancestor, derived).
    # Confirm this is intended; switching changes every SW/WS count.
    df_basket_picks['Dirct'] = [strength_classifier(ref, alt)
                                for ref, alt in zip(ref_col, alt_col)]

    #Mutation Frequency, feed the variables in order of Derived, ALT, AF
    df_basket_picks['MF'] = [mutation_frequency(derived, alt, af)
                             for derived, alt, af in zip(df_basket_picks['Derived'], alt_col, af_col)]

    df_basket_picks['Barcode'] = [barcoder(direction, mf)
                                  for direction, mf in zip(df_basket_picks['Dirct'], df_basket_picks['MF'])]
    # All rows are kept (including 'NN' and sites outside the Rare/Common bins);
    # the alias is kept so that downstream names stay the same.
    Barcoded_Wasp = df_basket_picks

    # dr.kent's stats: per-window counts by direction and by barcode
    SW_Total_freq = (Barcoded_Wasp['Dirct'].values == 'SW').sum()
    WS_Total_freq = (Barcoded_Wasp['Dirct'].values == 'WS').sum()
    NN_Total_freq = (Barcoded_Wasp['Dirct'].values == 'NN').sum()
    SNP_Total = SW_Total_freq + WS_Total_freq + NN_Total_freq

    SW_Rare_freq = (Barcoded_Wasp['Barcode'].values == 'SW-Rare').sum()
    WS_Rare_freq = (Barcoded_Wasp['Barcode'].values == 'WS-Rare').sum()

    SW_Common_freq = (Barcoded_Wasp['Barcode'].values == 'SW-Common').sum()
    WS_Common_freq = (Barcoded_Wasp['Barcode'].values == 'WS-Common').sum()

    chopped_polistes.append([window, SW_Total_freq, WS_Total_freq, NN_Total_freq, SNP_Total, SW_Rare_freq, WS_Rare_freq,
                            SW_Common_freq, WS_Common_freq])

In [None]:
final_file = pd.DataFrame(chopped_polistes)
final_file.head()

In [None]:
len(final_file)

In [None]:
final_file.rename(columns={
        0:"ID", 
        1:"SW_Total", 2:"WS_Total", 3:"NN_Total", 4: "SNP_Total",
        5:"SW_Rare", 6:"WS_Rare", 7:"SW_Common", 8:"WS_Common",}, inplace=True)

#SW_Total_freq, WS_Total_freq, NN_Total_freq, SNP_Total, SW_Rare_freq, WS_Rare_freq, SW_Common_freq, WS_Common_freq

In [None]:
final_file.head(5)

In [None]:
#Start Merging R2 dataframe with the 'final file'
merged_Polistes = df.merge(final_file, on='ID')
Chopped_Polistes = merged_Polistes.dropna()
Chopped_Polistes.head(5)

In [None]:
len(Chopped_Polistes)

In [None]:
test = final_file[final_file['ID'] == 'PdomSCFr1.2-0173_100000'] #checking if it paired up correctly
test.head()

### Output

In [None]:
#Run only once!
Chopped_Polistes.to_csv('/home/taeyoon/VcfFiles/LdByWindow/PolistesSFiles/Chopped_Polistes.csv', index=False)

In [None]:
Polistes_df = pd.read_csv('/home/taeyoon/VcfFiles/LdByWindow/PolistesSFiles/Chopped_Polistes.csv')
Polistes_df.head(8)

#### Merge with GC content per Window and Adjust Total Value

In [None]:
Polistes_GC = pd.read_csv('/home/taeyoon/GCContent/PdomGC_ready.csv', sep='\t')
Polistes_GC.head()

In [None]:
#merge!
Polistes_GC_df = pd.merge(Polistes_df, Polistes_GC, how='left', on=['ID'])
Polistes_GC_df.head()
len(Polistes_GC_df)

In [None]:
#Adjusted Total values
Polistes_GC_df['SW_T_Adjusted'] = Polistes_GC_df['SW_Total']/Polistes_GC_df['GC_Content']
Polistes_GC_df['WS_T_Adjusted'] = Polistes_GC_df['WS_Total']/(1 - Polistes_GC_df['GC_Content'])

In [None]:
#Lambda, which is SW/WS
Polistes_GC_df['Lambda'] = Polistes_GC_df['SW_T_Adjusted'] / Polistes_GC_df['WS_T_Adjusted']

In [None]:
Polistes_GC_df.head()
len(Polistes_GC_df)

In [None]:
#concise, easier viewing
Polistes_view = Polistes_GC_df.drop(columns=[
    'SW_Total','WS_Total','NN_Total','SNP_Total','SW_Rare','WS_Rare','SW_Common','WS_Common'])
Polistes_view.head()

In [None]:
#mean of lambda
Polistes_view['Lambda'].mean()

In [None]:
#GC content unadjusted
(Polistes_GC_df['SW_Total']/Polistes_GC_df['WS_Total']).mean()

### S1: Looking at Total numbers

In [None]:
#https://www.ncbi.nlm.nih.gov/genome/?term=polistes+dominula%5Borgn%5D
#Polistes dominula (european paper wasp)
#GC% 31.5% (AT will be 68.5%), compare to B.imp 37.9%, it's a little lower.
#this is the global value; the analysis instead uses the GC% value computed per window

In [None]:
#name the final table to work with x, drop possible NaNs
x = Polistes_GC_df.dropna()
len(x) #pre-dropna is 2096, post treatment is 2095, one Nan dropped.

In [None]:
#plot WS and SW together (GC%-adjusted totals against mean R^2)
plt.figure(figsize=(12,6))
plt.ylim(0,3000)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
#WS is blue
WST_adj = sns.regplot(x=x['r2_mean'], y=x['WS_T_Adjusted'], marker="+", scatter_kws={'alpha':0.5}, label='WS')

#SW is orange
sns.regplot(x=x['r2_mean'], y=x['SW_T_Adjusted'], marker="+", scatter_kws={'alpha':0.5}, label='SW')

plt.ylabel('GC% adjusted Mutation Counts')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['WS_T_Adjusted']) #WS is blue

In [None]:
scipy.stats.linregress(x['r2_mean'], x['SW_T_Adjusted']) #SW is orange

In [None]:
#Z-test for coefficients (slopes)
def Z_score(slope1, std_error1, slope2, std_error2):
    """Z statistic for the difference between two slopes.

    Computes (slope1 - slope2) / sqrt(std_error1**2 + std_error2**2).
    """
    combined_error = (std_error1 ** 2 + std_error2 ** 2) ** (1/2)
    return (slope1 - slope2) / combined_error

In [None]:
#Z_score for SW and WS, adjusted numbers
Z_score(-487.4643627632817, 50.747555732926905, -1691.774761691781, 194.24362089937904)
#result is 5.998658410170709
#two tailed p-value is 1.99e-9, reject null, observed difference is valid.

#### Unadjusted

In [None]:
#unadjusted values
#plot WS and SW together (raw totals against mean R^2)
plt.figure(figsize=(12,6))
plt.ylim(0,1000)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
#WS is blue
WST_adj = sns.regplot(x=x['r2_mean'], y=x['WS_Total'], marker="+", scatter_kws={'alpha':0.5}, label='WS')

#SW is orange
sns.regplot(x=x['r2_mean'], y=x['SW_Total'], marker="+", scatter_kws={'alpha':0.5}, label='SW')

plt.ylabel('Mutation Counts')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['WS_Total']) #WS is blue

In [None]:
scipy.stats.linregress(x['r2_mean'], x['SW_Total']) #SW is orange

In [None]:
#Z_score, input order in slope1, std_error1, slope2, std_error2
Z_score(-304.3494499417676, 36.39604121169277, -571.1745603683817, 48.10463877925413)
#score 4.423356026798727
#p-value: 0.000009718, reject null

### Lambda, Odds Ratio

In [None]:
#SW/WS, adjusted (Lambda against mean R^2)
plt.figure(figsize=(12,6))
plt.ylim(0,8)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
WST_adj = sns.regplot(x=x['r2_mean'], y=x['Lambda'], marker="+", scatter_kws={'alpha':0.5}, label='Lambda')

plt.ylabel('Lambda')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['Lambda']) 

#### Unadjusted

In [None]:
#unadjusted SW/WS (ratio of raw totals against mean R^2)
plt.figure(figsize=(12,6))
plt.ylim(0,3)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
WST_adj = sns.regplot(x=x['r2_mean'], y=x['SW_Total']/x['WS_Total'], marker="+", scatter_kws={'alpha':0.5}, label='Lambda, raw')

plt.ylabel('Lambda')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['SW_Total']/x['WS_Total']) 

### S2: The 10%, Rares (Note, unadjusted first)

In [None]:
#Plot rare occurring mutations in both directions (SW and WS)
plt.figure(figsize=(12,6))
plt.ylim(0,550)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
#WS is blue
WST_adj = sns.regplot(x=x['r2_mean'], y=x['WS_Rare'], marker="+", scatter_kws={'alpha':0.5}, label='WS')

#SW is orange
sns.regplot(x=x['r2_mean'], y=x['SW_Rare'], marker="+", scatter_kws={'alpha':0.5}, label='SW')

plt.ylabel('Rare Mutation Counts')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['WS_Rare']) #WS is blue

In [None]:
scipy.stats.linregress(x['r2_mean'], x['SW_Rare']) #SW is orange

In [None]:
#Z_score, input order in slope1, std_error1, slope2, std_error2
Z_score(-156.3121713769304, 17.77337030619035, -369.5995672084842, 27.379952324000346)
#Z = 6.533973394189548
#p-value = 6.405e-11, reject null

#### Adjusted (Note, the order switches)

In [None]:
#adjusted for GC%: WS counts divided by (1 - GC_Content), SW counts divided by GC_Content
plt.figure(figsize=(12,6))
plt.ylim(0,1800)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
#WS is blue
WST_adj = sns.regplot(x=x['r2_mean'], y=x['WS_Rare']/(1 - x['GC_Content']), marker="+", scatter_kws={'alpha':0.5}, label='WS')

#SW is orange
sns.regplot(x=x['r2_mean'], y=x['SW_Rare']/x['GC_Content'], marker="+", scatter_kws={'alpha':0.5}, label='SW')

plt.ylabel('GC% Adjusted Rare Mutation Counts')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['WS_Rare']/(1 - x['GC_Content'])) #WS is blue

In [None]:
scipy.stats.linregress(x['r2_mean'], x['SW_Rare']/x['GC_Content']) #SW is orange

In [None]:
#Z_score, input order in slope1, std_error1, slope2, std_error2
Z_score(-238.08174907285255, 24.06764227985072, -1084.6426887599052, 105.00682585428217)
#Z = 7.858195678676494
#p-value = 3.897e-15, reject null

### S3: The 50%, Common (Note, unadjusted first)

In [None]:
#Common, both directions
plt.figure(figsize=(12,6))
plt.ylim(0,200)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
#WS is blue
WST_adj = sns.regplot(x=x['r2_mean'], y=x['WS_Common'], marker="+", scatter_kws={'alpha':0.5}, label='WS')

#SW is orange
sns.regplot(x=x['r2_mean'], y=x['SW_Common'], marker="+", scatter_kws={'alpha':0.5}, label='SW')

plt.ylabel('Common Mutation Counts')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['WS_Common']) #WS is blue

In [None]:
scipy.stats.linregress(x['r2_mean'], x['SW_Common']) #SW is orange

In [None]:
#Z_score, input order in slope1, std_error1, slope2, std_error2
Z_score(-89.42915500513821, 12.095709120036611, -104.4788070065201, 13.089381012355068)
#Z = 0.8444232459755118
#p-value = 0.3984, fail to reject null

#### Adjusted for GC content

In [None]:
#adjusted for GC%: WS counts divided by (1 - GC_Content), SW counts divided by GC_Content
plt.figure(figsize=(12,6))
plt.ylim(0,650)
plt.xlim(0,1)

# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
#WS is blue
WST_adj = sns.regplot(x=x['r2_mean'], y=x['WS_Common']/(1 - x['GC_Content']), marker="+", scatter_kws={'alpha':0.5}, label='WS')

#SW is orange
sns.regplot(x=x['r2_mean'], y=x['SW_Common']/x['GC_Content'], marker="+", scatter_kws={'alpha':0.5}, label='SW')

plt.ylabel('GC% Adjusted Common Mutation Counts')
plt.xlabel('R^2 Mean')
plt.legend(loc='upper right', prop={'size': 20}, markerscale=2)

In [None]:
scipy.stats.linregress(x['r2_mean'], x['WS_Common']/(1 - x['GC_Content'])) #WS is blue

In [None]:
scipy.stats.linregress(x['r2_mean'], x['SW_Common']/x['GC_Content']) #SW is orange

In [None]:
#Z_score, input order in slope1, std_error1, slope2, std_error2
Z_score(-134.4124150583511, 16.37546777805637, -308.6510263900377, 52.4629274192974)
#Z = 3.1703260148763035
#p-value = 0.001523, reject null

### Two window comparison, Odds Ratio

In [None]:
#selecting for datapoints within the window of r2_mean 0.2-0.3 and 0.4-0.6
# Series.between includes both endpoints by default. The boolean form
# inclusive=True was deprecated and then removed in pandas 2.0, so it is
# left out here; the selection is the same.
polistes_slice_1 = x[x['r2_mean'].between(0.2, 0.3)];
polistes_slice_2 = x[x['r2_mean'].between(0.4, 0.6)];

In [None]:
polistes_slice_1.head()

In [None]:
polistes_slice_2.head()

In [None]:
#categorizing data for Odds ratio
#first for first window, denoted slice 1
#NOTE(review): despite the "_adj" suffix, these are plain sums of the
#WS_Rare / WS_Common / SW_Rare / SW_Common count columns; no GC% adjustment
#is applied in this cell.
sliced1_WS_R_adj = polistes_slice_1['WS_Rare'].sum();
sliced1_WS_C_adj = polistes_slice_1['WS_Common'].sum();

sliced1_SW_R_adj = polistes_slice_1['SW_Rare'].sum();
sliced1_SW_C_adj = polistes_slice_1['SW_Common'].sum();

In [None]:
#OR slice 1, WS/SW is:
(sliced1_WS_C_adj/sliced1_WS_R_adj)/(sliced1_SW_C_adj/sliced1_SW_R_adj)
#1.6270959290370026

In [None]:
#Chi square, slice 1
chi_slice1 = np.array ([[sliced1_SW_R_adj, sliced1_SW_C_adj], [sliced1_WS_R_adj, sliced1_WS_C_adj]]) 
#array setup: SW-Rare,Common and WS-Rare,Common
chi2_contingency(chi_slice1)
#this returns chi-square, p, degrees of freedom, and expected values in array
#(12168.411007836867, 0.0, 1, array([[421459.3104242, 185673.6895758], [266726.6895758, 117506.3104242]]))

In [None]:
#categorizing data for Odds ratio
#second window, denoted slice 2
#NOTE(review): despite the "_adj" suffix, these are plain sums of the
#WS_Rare / WS_Common / SW_Rare / SW_Common count columns; no GC% adjustment
#is applied in this cell.
sliced2_WS_R_adj = polistes_slice_2['WS_Rare'].sum();
sliced2_WS_C_adj = polistes_slice_2['WS_Common'].sum();

sliced2_SW_R_adj = polistes_slice_2['SW_Rare'].sum();
sliced2_SW_C_adj = polistes_slice_2['SW_Common'].sum();

In [None]:
#OR slice 2, WS/SW is:
(sliced2_WS_C_adj/sliced2_WS_R_adj)/(sliced2_SW_C_adj/sliced2_SW_R_adj)
#1.5470135292224483

In [None]:
#Chi square, slice 2
chi_slice2 = np.array ([[sliced2_SW_R_adj, sliced2_SW_C_adj], [sliced2_WS_R_adj, sliced2_WS_C_adj]]) 
#array setup: SW-Rare,Common and WS-Rare,Common
chi2_contingency(chi_slice2)
#this returns chi-square, p, degrees of freedom, and expected values in array
#(2527.0749701551485, 0.0, 1, array([[102267.93236487,  48062.06763513],[ 67535.06763513,  31738.93236487]]))