# 🔬 Simulated Bioinformatics Pipeline

This notebook simulates a complete gene expression analysis pipeline for a hypothetical disease model using randomly generated gene expression data.

---

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.stats import ttest_ind
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")


In [None]:
# Read the simulated expression matrix from disk; the first CSV column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer="simulated_gene_expression.csv",
    index_col=0,
)
# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
# Split the sample columns into groups by substring match on the column name.
# The two checks are independent, so a name containing both substrings lands in both lists.
control_cols = []
disease_cols = []
for sample_name in df.columns:
    if "Control" in sample_name:
        control_cols.append(sample_name)
    if "Disease" in sample_name:
        disease_cols.append(sample_name)

print(f"Control samples: {len(control_cols)}, Disease samples: {len(disease_cols)}")


In [None]:
# Per-gene differential expression between the Control and Disease sample groups.
#
# The t-test is run once over the whole matrices along axis=1 (one test per row)
# instead of in a Python loop over df.index. Besides being faster, this stays correct
# when the row index contains duplicate labels, where df.loc[gene, cols] would return
# several rows at once and no longer yield a single p-value per gene.
control_expr = df[control_cols]
disease_expr = df[disease_cols]

_, p_values = ttest_ind(control_expr.to_numpy(), disease_expr.to_numpy(), axis=1)

# log2 of the ratio of group means (Disease / Control), one value per row.
# A zero or negative mean yields inf/NaN here; errstate silences the numpy warnings
# locally so the cell does not rely on the global warnings filter.
# NOTE(review): if the input values are already log-transformed, a ratio of means is
# not a meaningful fold change; confirm the scale of the input data.
with np.errstate(divide="ignore", invalid="ignore"):
    log_fc = np.log2(disease_expr.mean(axis=1) / control_expr.mean(axis=1)).to_numpy()

# Assemble the results table, flag rows with p < 0.05, and order by ascending p-value.
# NOTE(review): "significant" is thresholded on raw p-values; no multiple-testing
# correction is applied, so expect false positives when many genes are tested.
results = (
    pd.DataFrame({
        "Gene": df.index,
        "log2_FC": log_fc,
        "p_value": p_values,
    })
    .assign(significant=lambda frame: frame["p_value"] < 0.05)
    .sort_values("p_value")
)
results.head()


In [None]:
# Volcano plot: effect size (log2 fold change) against -log10(p-value),
# with points coloured by the "significant" flag.
neg_log10_p = -np.log10(results["p_value"])
significance_colors = {True: "red", False: "gray"}

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=results,
    x="log2_FC",
    y=neg_log10_p,
    hue="significant",
    palette=significance_colors,
    ax=ax,
)
# Dashed horizontal guide at the p = 0.05 threshold.
ax.axhline(-np.log10(0.05), linestyle="--", color="blue")
ax.set_title("Volcano Plot")
ax.set_xlabel("Log2 Fold Change")
ax.set_ylabel("-Log10 p-value")
plt.show()


In [None]:
# PCA: project the samples (rows of df.T) onto the first two principal components.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(df.T)
pca_df = pd.DataFrame(pca_data, columns=["PC1", "PC2"])

# Rows of pca_data follow df.columns order, so label each sample from its own column
# name. Building the labels as ["Control"] * n + ["Disease"] * m would mislabel samples
# whenever the columns are not laid out as "all controls, then all disease", and would
# fail with a length mismatch if any column belongs to neither group.
pca_df["group"] = [
    "Control" if sample in control_cols
    else "Disease" if sample in disease_cols
    else "Unassigned"
    for sample in df.columns
]

sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="group")
plt.title("PCA of Samples")
plt.show()


In [None]:
# t-SNE: 2-D embedding of the samples (rows of df.T), seeded for reproducibility.
# scikit-learn requires perplexity to be smaller than the number of samples, so the
# value of 5 is capped for very small datasets; with more than 5 samples it is unchanged.
n_samples = df.shape[1]
tsne = TSNE(n_components=2, perplexity=min(5, n_samples - 1), random_state=42)
tsne_data = tsne.fit_transform(df.T)
tsne_df = pd.DataFrame(tsne_data, columns=["Dim1", "Dim2"])

# Rows of tsne_data follow df.columns order, so label each sample from its own column
# name. Building the labels as ["Control"] * n + ["Disease"] * m would mislabel samples
# whenever the columns are not laid out as "all controls, then all disease", and would
# fail with a length mismatch if any column belongs to neither group.
tsne_df["group"] = [
    "Control" if sample in control_cols
    else "Disease" if sample in disease_cols
    else "Unassigned"
    for sample in df.columns
]

sns.scatterplot(data=tsne_df, x="Dim1", y="Dim2", hue="group")
plt.title("t-SNE of Samples")
plt.show()


In [None]:
# Top differentially expressed genes: the 30 rows with the smallest p-values.
top_genes = results.sort_values("p_value").head(30)["Gene"]

# Clustered heatmap of those genes; z_score=0 standardises each row (gene) before plotting.
cluster_grid = sns.clustermap(
    df.loc[top_genes],
    cmap="vlag",
    col_cluster=True,
    z_score=0,
    figsize=(12, 10),
)
# clustermap builds its own figure with several axes (heatmap, dendrograms, colour bar).
# plt.title() only titles whichever axes is current, so set a figure-level title instead;
# y > 1 keeps it clear of the column dendrogram.
cluster_grid.ax_heatmap.figure.suptitle(
    "Heatmap of Top Differentially Expressed Genes", y=1.02
)
plt.show()


In [None]:
# Mock pathway annotation: every gene is assigned one pathway label at random.
# The labels are placeholders with no biological meaning.
pathways = [
    "Apoptosis",
    "Cell Cycle",
    "Immune Response",
    "Signal Transduction",
    "Metabolism",
]
np.random.seed(0)  # fixed seed so the random assignment is the same on every run
results["Pathway"] = np.random.choice(pathways, size=len(results))

# Count how many genes flagged as significant fall into each pathway.
significant_rows = results.loc[results["significant"]]
pathway_counts = significant_rows["Pathway"].value_counts()

# Bar chart of the per-pathway counts.
ax = sns.barplot(x=pathway_counts.index, y=pathway_counts.values)
ax.set_title("Enriched Pathways (Simulated)")
ax.set_ylabel("Count of Significant Genes")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Write the results table to CSV without the row index, then confirm on stdout.
results.to_csv(
    path_or_buf="results_differential_expression.csv",
    index=False,
)
print("Results saved to results_differential_expression.csv")
