### Load data from Excel files

In [None]:
import pandas as pd

# Locations of the three supplementary tables: the full table plus its
# "only_complete" and "only_present" variants.
file_path = "path/to/SuppTable1.xlsx"
file_path_c = "path/to/SuppTable1_only_complete.xlsx"
file_path_p = "path/to/SuppTable1_only_present.xlsx"

# Load every workbook into its own DataFrame, in the same order as the paths
df, df_c, df_p = (
    pd.read_excel(workbook)
    for workbook in (file_path, file_path_c, file_path_p)
)

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

def preprocess_dataframe(df, metadata_cols=20):
    """Split a table into metadata and features, then standardize the features.

    The first ``metadata_cols`` columns are treated as metadata and the
    remaining columns as features. Feature columns whose name contains
    'T6SS' or 'tss' (case-insensitive) are removed, columns containing any
    NA are then dropped, and what is left is standardized with
    ``StandardScaler``. A short summary of the shapes is printed.

    Parameters:
        df (DataFrame): Input table with the metadata columns first.
        metadata_cols (int): Number of leading columns treated as metadata.

    Returns:
        tuple: ``(metadata, features_filtered, scaled_features, labels)``.
            ``features_filtered`` is the feature table after the name
            filter but *before* NA-containing columns are dropped, so it
            may have more columns than ``scaled_features`` (the output of
            ``StandardScaler.fit_transform`` on the NA-free columns).
            ``labels`` is the 'Source.curated' metadata column, or None
            when that column is not among the metadata columns.
    """
    # Split into metadata and features
    metadata = df.iloc[:, :metadata_cols]
    features = df.iloc[:, metadata_cols:]

    # Optional label column; falls back to None instead of raising when
    # 'Source.curated' is absent from the metadata columns.
    labels = metadata.get('Source.curated', None)

    # Remove columns containing 'T6SS' or 'tss'.
    # Column labels are not guaranteed to be strings (e.g. numeric header
    # cells), and the .str accessor fails or yields NaN on such labels, so
    # match against their string form instead.
    column_names = features.columns.astype(str)
    is_t6ss = column_names.str.contains('T6SS|tss', case=False)
    features_filtered = features.loc[:, ~is_t6ss]

    removed_cols = features.shape[1] - features_filtered.shape[1]
    print(f"Removed {removed_cols} columns containing 'T6SS' or 'tss'")

    # Drop NA-containing columns before scaling
    features_no_na = features_filtered.dropna(axis=1)
    print(f"Original features shape: {features.shape}")
    print(f"Filtered (no T6SS/tss) shape: {features_filtered.shape}")
    print(f"After dropping NA columns: {features_no_na.shape}")

    # Standardize the features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features_no_na)

    return metadata, features_filtered, scaled_features, labels

# Preprocess the full, "present" and "complete" tables (in that order) and
# unpack each 4-tuple result into its own set of names.
(
    (metadata_df, features_df, scaled_df, labels_df),
    (metadata_p, features_p, scaled_p, labels_p),
    (metadata_c, features_c, scaled_c, labels_c),
) = (preprocess_dataframe(table) for table in (df, df_p, df_c))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
output_dir = 'C:/Users/Bart/Kristina_T6SS'

# Project the standardized full dataset onto its first two principal
# components once; the same projection is reused for every figure below.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_df)

# One entry per figure:
#   (metadata column used for colouring,
#    plot title, or None for an untitled plot,
#    output file name,
#    whether the legend entries are sorted)
plot_specs = [
    ('Cmultiple', "PCA of T6SS Data (Complete)", "pca_t6ss_alldata_c_multiple.jpeg", True),
    ('Pmultiple', "PCA of T6SS Data (Present)", "pca_t6ss_alldata_p_multiple.jpeg", True),
    ('Source.Niche', None, "pca_t6ss_alldata_niche_multiple.jpeg", False),
]

for label_col, title, file_name, sort_hue in plot_specs:
    # Convert PCA results into a DataFrame
    pca_df = pd.DataFrame(principal_components, columns=["PC1", "PC2"])
    # Attach the labels by position. Assigning the Series itself would align
    # on the index and silently produce NaN labels whenever metadata_df does
    # not carry a default 0..n-1 index.
    pca_df[label_col] = metadata_df[label_col].values
    # hue_order=None lets seaborn pick the order itself
    order = sorted(pca_df[label_col].dropna().unique()) if sort_hue else None

    # Plot PCA
    plt.figure(figsize=(11, 8))
    sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue=label_col, hue_order=order, palette='Set2', s=1)
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.legend(loc="upper right", markerscale=5, frameon=False)
    if title is not None:
        plt.title(title)
    plt.savefig(f"{output_dir}/{file_name}", dpi=900, bbox_inches='tight')
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import pandas as pd

def _palette_for(levels, colors):
    """Return exactly one colour per hue level, cycling through ``colors`` if needed."""
    return [colors[i % len(colors)] for i in range(len(levels))]


def _save_pca_scatter(pca_df, hue_col, hue_levels, palette, title, out_path):
    """Draw one PC1/PC2 scatterplot coloured by ``hue_col``, save it, then show it."""
    plt.figure(figsize=(11, 8))
    sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue=hue_col, hue_order=hue_levels, palette=palette, s=2)
    plt.title(title)
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.legend(loc="upper right", markerscale=5, frameon=False)
    plt.savefig(out_path, dpi=900, bbox_inches='tight')
    plt.show()


def plot_pca(features, metadata, label='c', output_dir='C:/Users/Bart/Kristina_T6SS'):
    """
    Run a 2-component PCA and save three scatterplots of the projection.

    Only rows whose '<LABEL>min1' metadata value is not NA are used. The
    same projection is drawn three times, coloured by '<LABEL>multiple',
    'Source.Niche' and 'Source.curated'; each figure is written to
    ``output_dir`` as a JPEG and then shown.

    Parameters:
        features (ndarray): Scaled feature matrix, one row per metadata row.
        metadata (DataFrame): Corresponding metadata DataFrame.
        label (str): Dataset label, e.g. 'c', 'p'. It is upper-cased to
            build the '<LABEL>min1' / '<LABEL>multiple' column names and
            used as given in the output file names.
        output_dir (str): Directory for saving plots.

    Raises:
        ValueError: If the '<LABEL>min1' or '<LABEL>multiple' column is
            missing from ``metadata``.

    Note:
        Calls ``sns.set_palette``, which changes seaborn's default palette
        for the rest of the session.
    """
    # Dynamic column names based on label
    prefix = label.upper()
    min1_col = f"{prefix}min1"
    multiple_col = f"{prefix}multiple"

    # Check existence
    if min1_col not in metadata.columns or multiple_col not in metadata.columns:
        raise ValueError(f"Expected columns '{min1_col}' and/or '{multiple_col}' not found in metadata")

    # The title used to say "Complete" for every dataset, which mislabelled
    # the 'p' figures. Name the dataset from its label instead; labels
    # without a known name get a title with the prefix only.
    dataset_names = {'C': 'Complete', 'P': 'Present'}
    if prefix in dataset_names:
        title = f"PCA: T6SS {dataset_names[prefix]} ({prefix})"
    else:
        title = f"PCA: T6SS ({prefix})"

    # Filter for rows where <label>min1 is not NaN. A plain boolean array is
    # used so that both objects are filtered by position.
    valid_mask = metadata[min1_col].notna().to_numpy()
    features_filtered = features[valid_mask]
    meta_filtered = metadata.loc[valid_mask]

    # PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(features_filtered)
    pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])

    custom_palette = ['#66c2a5', '#fc8d62', '#ffd92f', '#e78ac3', '#a6d854',
                      '#8da0cb', '#e5c494', '#b3b3b3', '#c7a9f4', '#f4a582', '#a1d99b']

    # Set the custom palette in Seaborn
    # NOTE: this is a global change that outlives the call.
    sns.set_palette(custom_palette)

    # Plot 1: colored by <label>multiple (named 'Set2' palette)
    pca_df[multiple_col] = meta_filtered[multiple_col].values
    order = sorted(pca_df[multiple_col].dropna().unique())
    _save_pca_scatter(pca_df, multiple_col, order, 'Set2', title,
                      f"{output_dir}/pca_t6ss_{label}_multiple.jpeg")

    # Plot 2: colored by Source.Niche; Plot 3: colored by Source.curated.
    # Both use the custom colours. Passing the full 11-colour list makes
    # seaborn warn about, or (depending on the version) reject, a palette
    # whose length differs from the number of hue levels, so hand over
    # exactly one colour per level.
    for source_col, file_tag in (('Source.Niche', 'sourceniche'),
                                 ('Source.curated', 'sourcecurated')):
        pca_df[source_col] = meta_filtered[source_col].values
        order = sorted(pca_df[source_col].dropna().unique())
        _save_pca_scatter(pca_df, source_col, order, _palette_for(order, custom_palette), title,
                          f"{output_dir}/pca_t6ss_{label}_{file_tag}.jpeg")

# Draw the PCA figures for the "present" table first, then the "complete"
# table. plot_pca needs the matching '<LABEL>min1' / '<LABEL>multiple'
# metadata columns (e.g. 'Cmin1' and 'Cmultiple' are assumed in metadata_c).
for _label, _scaled, _meta in (('p', scaled_p, metadata_p), ('c', scaled_c, metadata_c)):
    plot_pca(_scaled, _meta, label=_label)

