# Análisis de expresión génica diferencial (RNA-Seq)
Este notebook realiza un análisis completo de expresión génica diferencial basado en un experimento de RNA-Seq para detectar genes sobreexpresados y realizar anotación funcional con Gene Ontology (GO).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from gprofiler import GProfiler

In [None]:
# Load the RNA-Seq expression table from the CSV file in the working directory
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="RNA-Seq-expression-Norilsk2019.csv")
df.head(5)

In [None]:
# Map the original CSV headers to the short column names used by the later
# cells. Note that 'pvalue' holds the *adjusted* p-value after this rename.
column_aliases = {
    'log_2 fold change': 'log2FoldChange',
    'Adjusted p-value': 'pvalue',
    'Gene': 'gene_id',
}

# Rename in place so `df` stays the same object; labels that are not present
# in the table are left untouched.
df.rename(columns=column_aliases, inplace=True)
df.columns

In [None]:
# Thresholds: a gene is flagged when its log2 fold change is above
# `logfc_cutoff` and its (adjusted) p-value is below `pval_cutoff`.
logfc_cutoff = 1
pval_cutoff = 0.05

# -log10 transform of the p-values; exact zeros are swapped for 1e-300 first
# so the logarithm stays finite.
nonzero_pvalues = df['pvalue'].replace(0, 1e-300)
df['-log10(pval)'] = -np.log10(nonzero_pvalues)

# Flag overexpressed genes (only the positive fold-change side is considered).
above_logfc = df['log2FoldChange'] > logfc_cutoff
below_pval = df['pvalue'] < pval_cutoff
df['significant'] = above_logfc & below_pval

# Volcano plot: flagged genes in red, everything else in grey.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='log2FoldChange',
    y='-log10(pval)',
    hue='significant',
    palette={True: 'red', False: 'grey'},
    alpha=0.6,
)

# Dashed guides marking the two thresholds on the plot.
plt.axhline(-np.log10(pval_cutoff), color='blue', linestyle='--')
plt.axvline(logfc_cutoff, color='green', linestyle='--')

# Labels, legend and layout.
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.title('Volcano Plot - Genes diferencialmente expresados')
plt.legend(title='Significativo')
plt.tight_layout()
plt.show()

In [None]:
# Keep only the overexpressed genes: log2 fold change above the cutoff and
# (adjusted) p-value below the cutoff, then report how many were found.
fold_change_ok = df['log2FoldChange'] > logfc_cutoff
pvalue_ok = df['pvalue'] < pval_cutoff
overexpressed_genes = df.loc[fold_change_ok & pvalue_ok]
print(f"Genes sobreexpresados detectados: {len(overexpressed_genes)}")

In [None]:
# Functional enrichment of the overexpressed genes with g:Profiler.
#
# The query is cleaned before it is sent: rows without a gene identifier are
# dropped (a NaN is not a valid identifier and would end up as a float inside
# the request payload) and repeated identifiers are collapsed, keeping the
# first occurrence so the original order is preserved.
gene_query = (
    overexpressed_genes['gene_id']
    .dropna()
    .drop_duplicates()
    .tolist()
)

gp = GProfiler(return_dataframe=True)
# NOTE(review): the organism is fixed to human ('hsapiens') — confirm that this
# matches the species of the sequenced samples.
# NOTE(review): no `sources` filter is passed, so the service is not restricted
# to Gene Ontology here — confirm whether a GO-only query is intended.
result = gp.profile(organism='hsapiens', query=gene_query)

# Preview the term identifier, term name and p-value of the first results.
result[['native', 'name', 'p_value']].head()