AIMP Expression in Primary vs. Recurrent Tumors

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

# CGGA exports (one CSV per gene), keyed by gene symbol
gene_files = {
    "AIMP1": "/Users/humairanoor/Downloads/CGGA AIMP1 - Visualization Tools for Glioma Datasets.csv",
    "AIMP2": "/Users/humairanoor/Downloads/CGGA AIMP2 - Visualization Tools for Glioma Datasets.csv",
    "AIMP3": "/Users/humairanoor/Downloads/CGGA AIMP3 - Visualization Tools for Glioma Datasets.csv"
}

# Read every table and tag its rows with the gene symbol.
# Each CSV is assumed to provide the columns "Recurrence" and "mRNA".
data_list = [
    pd.read_csv(csv_path).assign(Gene=gene_symbol)
    for gene_symbol, csv_path in gene_files.items()
]

# Stack the per-gene tables into one long-format dataframe
data = pd.concat(data_list, ignore_index=True)

# Canvas and theme for the grouped box/strip figure
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
hue_order = ["Primary", "Recurrent"]

# Arguments shared by both layers so boxes and points are grouped identically
shared_plot_kwargs = dict(x="Gene", y="mRNA", hue="Recurrence", data=data,
                          dodge=True, hue_order=hue_order)

# Box layer first, then the individual observations jittered on top of it
ax = sns.boxplot(width=0.6, palette="Set2", **shared_plot_kwargs)
sns.stripplot(jitter=True, alpha=0.6, linewidth=0.5, palette="dark",
              **shared_plot_kwargs)

# Perform a two-sided Mann–Whitney U test per gene (Primary vs. Recurrent).
# `pairs` keeps (gene, raw p-value) tuples; `raw_pvalues` holds the same
# p-values in the same order for the multiple-testing correction below.
pairs = []
raw_pvalues = []
for gene in gene_files:
    gene_rows = data[data["Gene"] == gene]
    # Drop missing expression values so NaNs cannot distort or invalidate
    # the rank-based test (and, through it, the adjusted p-values).
    primary = gene_rows.loc[gene_rows["Recurrence"] == "Primary", "mRNA"].dropna()
    recurrent = gene_rows.loc[gene_rows["Recurrence"] == "Recurrent", "mRNA"].dropna()

    stat, p = mannwhitneyu(primary, recurrent, alternative="two-sided")
    pairs.append((gene, p))
    raw_pvalues.append(p)

# Adjust p-values for multiple testing (method "fdr_bh");
# adj_pvals is in the same order as `pairs` / `raw_pvalues`.
_, adj_pvals, _, _ = multipletests(raw_pvalues, method="fdr_bh")

# Translate a p-value into the usual star notation
def significance_label(p):
    """Return '***', '**' or '*' when p is below 0.001, 0.01 or 0.05; else 'ns'."""
    # Cutoffs are checked from strictest to loosest; comparisons are strict (<)
    for cutoff, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p < cutoff:
            return stars
    return 'ns'

# Add significance markers to the plot, one label per gene, all at the same
# height just above the largest observed value.
y_min = data["mRNA"].min()
y_max = data["mRNA"].max()
y_offset = 0.1 * y_max

# Assumes the categorical x-axis is ordered like gene_files (the order in
# which the per-gene tables were concatenated). Built once, not per iteration.
gene_positions = {gene: idx for idx, gene in enumerate(gene_files)}
for (gene, _), adj_p in zip(pairs, adj_pvals):
    plt.text(gene_positions[gene], y_max + y_offset, significance_label(adj_p),
             ha='center', fontsize=12, fontweight='bold', color="black")

# Final plot adjustments
# Keep the 0 baseline for non-negative data, but extend the axis downward when
# values are negative (possible on a log2 scale) so that nothing is clipped.
y_lower = y_min - abs(y_offset) if y_min < 0 else 0
plt.ylim(y_lower, y_max + 2 * y_offset)

# The box and strip layers can each contribute a legend handle for the same
# hue level; keep only the first handle per label so each tumor type is
# listed once.
legend_entries = {}
for handle, label in zip(*plt.gca().get_legend_handles_labels()):
    legend_entries.setdefault(label, handle)
plt.legend(list(legend_entries.values()), list(legend_entries.keys()),
           title="Tumor Type", loc="upper left", bbox_to_anchor=(1.05, 1))

plt.title("AIMPs expression in CGGA Primary vs. Recurrent GBM")
plt.xlabel("Gene")
plt.ylabel("mRNA Expression (log2)")
plt.tight_layout()
plt.show()


AIMP Expression in Low- vs. High-Grade Tumors

In [None]:
# Per-gene CSV files with tumor grade and expression values, keyed by gene symbol
csv_files = {
    "AIMP1": "/Users/humairanoor/Documents/AIMP/CGGA_AIMP1_Expression_Grades.csv",
    "AIMP2": "/Users/humairanoor/Documents/AIMP/CGGA_AIMP2_Expression_Grades.csv",
    "AIMP3": "/Users/humairanoor/Documents/AIMP/CGGA_AIMP3_Expression_Grades.csv"
}

# Read each table, label its rows with the gene symbol, and stack everything.
# Every CSV is assumed to provide the columns "Grade" and "Expression".
data_list = [
    pd.read_csv(csv_path).assign(Gene=gene_symbol)
    for gene_symbol, csv_path in csv_files.items()
]
data = pd.concat(data_list, ignore_index=True)

# Collapse WHO grades into two groups: II -> "Low Grade", III/IV -> "High Grade".
# Grade values that are not keys of this mapping are left unchanged by replace().
grade_to_group = {
    "WHO II": "Low Grade",
    "WHO III": "High Grade",
    "WHO IV": "High Grade"
}
data["Grade Group"] = data["Grade"].replace(grade_to_group)

# Two-sided Mann–Whitney U test per gene (Low Grade vs. High Grade).
# mw_test_results maps gene symbol -> raw p-value.
# NOTE(review): these p-values are not corrected for multiple testing, whereas
# the primary-vs-recurrent analysis in this notebook applies "fdr_bh" —
# confirm that the difference is intentional.
mw_test_results = {}
for gene in csv_files:
    gene_rows = data[data["Gene"] == gene]
    # Drop missing expression values so NaNs cannot distort or invalidate
    # the rank-based test.
    low_grade = gene_rows.loc[gene_rows["Grade Group"] == "Low Grade", "Expression"].dropna()
    high_grade = gene_rows.loc[gene_rows["Grade Group"] == "High Grade", "Expression"].dropna()

    stat, p_value = mannwhitneyu(low_grade, high_grade, alternative="two-sided")
    mw_test_results[gene] = p_value

# Star notation for p-values
def significance_label(p):
    """Return '***', '**' or '*' when p is below 0.001, 0.01 or 0.05; else 'ns'."""
    # Ordered strictest first; the first cutoff that p is strictly below wins
    star_levels = ((0.001, '***'), (0.01, '**'), (0.05, '*'))
    return next((stars for cutoff, stars in star_levels if p < cutoff), 'ns')

# Canvas and theme for the grade comparison figure
plt.figure(figsize=(9, 6))
sns.set(style="whitegrid")
hue_order = ["Low Grade", "High Grade"]

# Arguments shared by the box layer and the point layer
shared_grade_kwargs = dict(x="Gene", y="Expression", hue="Grade Group",
                           data=data, hue_order=hue_order)

# Boxes first, then the individual observations jittered on top
sns.boxplot(palette="Set2", **shared_grade_kwargs)
sns.stripplot(dodge=True, jitter=True, alpha=0.6, linewidth=0.5,
              palette="dark", **shared_grade_kwargs)

# Vertical placement of the annotations: a fixed margin above the largest value
y_min = data["Expression"].min()
y_max = data["Expression"].max()
y_offset = 0.05 * y_max

# Add significance labels to the plot, one per gene.
# Assumes the categorical x-axis is ordered like csv_files (the order in which
# the per-gene tables were concatenated).
for i, gene in enumerate(csv_files):
    p = mw_test_results[gene]
    plt.text(i, y_max + y_offset, significance_label(p),
             ha='center', fontsize=14, fontweight='bold', color="black")

# Adjust legend and axis
# The box and strip layers can each contribute a legend handle for the same
# hue level; keep only the first handle per label so each grade group is
# listed once.
legend_entries = {}
for handle, label in zip(*plt.gca().get_legend_handles_labels()):
    legend_entries.setdefault(label, handle)
plt.legend(list(legend_entries.values()), list(legend_entries.keys()),
           title="Tumor Grade", loc="upper left", bbox_to_anchor=(1.05, 1))
plt.xlabel("Gene")
plt.ylabel("Expression Level")
plt.title("AIMP Expression by Tumor Grade (Mann–Whitney U Test)")

# Keep the 0 baseline for non-negative data, but extend the axis downward when
# values are negative so that no observation is clipped.
y_lower = y_min - abs(y_offset) if y_min < 0 else 0
plt.ylim(y_lower, y_max + 2 * y_offset)

# Reserve the right 15% of the figure for the legend placed outside the axes
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

# Print p-values (raw, i.e. without multiple-testing adjustment)
print("Gene\tRaw p-value\tSignificance")
for gene, p in mw_test_results.items():
    print(f"{gene}\t{p:.4g}\t\t{significance_label(p)}")
