# 📘 RNA-seq + Red Génica
Análisis completo: Volcano plot, GO, redes génicas simuladas y reales con STRINGdb

In [None]:

import pandas as pd
import numpy as np

# Read the RNA-seq results CSV and give the two columns used below shorter names.
column_aliases = {
    'log_2 fold change': 'log2FC',
    'Adjusted p-value': 'padj'
}
df = pd.read_csv('/mnt/data/RNA-Seq-expression-Norilsk2019.csv').rename(columns=column_aliases)

# Classify each row. A row keeps the default label unless it passes the
# adjusted p-value cut-off AND its log2 fold change is beyond +/-1.
# Comparisons against NaN are False, so rows with missing values stay at the default.
passes_padj = df['padj'] < 0.05
df['Significance'] = 'No significativo'
df.loc[(df['log2FC'] > 1) & passes_padj, 'Significance'] = 'Sobre-expresado'
df.loc[(df['log2FC'] < -1) & passes_padj, 'Significance'] = 'Sub-expresado'
df.head()


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Volcano plot: log2 fold change (x) against -log10 adjusted p-value (y),
# coloured by the 'Significance' label.
#
# An adjusted p-value of exactly 0 maps to +inf under -log10 and cannot be
# drawn at a finite position, which would hide the most significant genes.
# Zeros are therefore floored to the smallest positive padj in the table
# (or to the smallest positive float if no positive value exists).
# NaN values are left untouched.
padj_values = df['padj']
positive_padj = padj_values[padj_values > 0]
padj_floor = positive_padj.min() if not positive_padj.empty else np.finfo(float).tiny
neg_log_padj = -np.log10(padj_values.clip(lower=padj_floor))

plt.figure(figsize=(10,6))
sns.scatterplot(data=df, x='log2FC', y=neg_log_padj, hue='Significance',
                palette={'Sobre-expresado':'red', 'Sub-expresado':'blue', 'No significativo':'gray'}, alpha=0.7)

# Dashed guides at the same thresholds used to build the 'Significance' label.
plt.axvline(x=1, color='black', linestyle='--')
plt.axvline(x=-1, color='black', linestyle='--')
plt.axhline(y=-np.log10(0.05), color='black', linestyle='--')
plt.title("Volcano Plot")
plt.xlabel("log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
plt.legend()
plt.show()


In [None]:

!pip install -q mygene

import mygene
mg = mygene.MyGeneInfo()

genes = df[df['Significance'] == 'Sobre-expresado']['Gene'].dropna().unique().tolist()
annotations = mg.querymany(genes, scopes='ensembl.gene', fields='go', species='human', as_dataframe=True)
go_terms = annotations[['go.BP', 'go.MF', 'go.CC']].dropna(how='all')
go_terms.head()


In [None]:

# Simulated expression values: one row per gene in `genes`, one column per
# sample (S1..S10), drawn from a normal distribution with mean 0 and std 1.
# The global NumPy seed is fixed so the draw is repeatable.
np.random.seed(42)
n_samples = 10
samples = ["S" + str(k) for k in range(1, n_samples + 1)]
simulated_values = np.random.normal(loc=0, scale=1, size=(len(genes), n_samples))
expr_matrix = pd.DataFrame(simulated_values, index=genes, columns=samples)
expr_matrix.head()


In [None]:

import networkx as nx

# Gene-by-gene correlation across samples (DataFrame.corr defaults to Pearson).
corr_matrix = expr_matrix.T.corr()
threshold = 0.7

# Build an undirected graph with one edge per gene pair whose correlation
# exceeds the threshold. Genes with no such partner are not added as nodes.
G1 = nx.Graph()
corr_values = corr_matrix.to_numpy()
gene_labels = corr_matrix.index.to_list()

# The correlation matrix is symmetric, so scanning the strict upper triangle
# (k=1 skips the diagonal) visits every unordered pair exactly once and avoids
# per-element .loc lookups. NaN correlations compare False and are skipped.
rows, cols = np.nonzero(np.triu(corr_values > threshold, k=1))
for r, c in zip(rows, cols):
    G1.add_edge(gene_labels[r], gene_labels[c], weight=float(corr_values[r, c]))


In [None]:

import requests

# Query the STRING network endpoint for the first 100 genes.
# Identifiers are separated by an encoded carriage return (%0d); species 9606 is the taxon passed to STRING.
gene_list = genes[:100]
string_ids = "%0d".join(gene_list)
url = f"https://string-db.org/api/tsv/network?identifiers={string_ids}&species=9606"

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of saving an error body that would later
# fail to parse as a network table.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Keep a copy of the raw response on disk; write it as UTF-8 so it round-trips
# with pandas' default read encoding on every platform.
with open("string_network.tsv", "w", encoding="utf-8") as f:
    f.write(response.text)

# Build the interaction graph, storing STRING's combined score on each edge.
string_df = pd.read_csv("string_network.tsv", sep='\t')
G2 = nx.from_pandas_edgelist(string_df, 'preferredName_A', 'preferredName_B', edge_attr='combined_score')


In [None]:

def red_metrics(G):
    """Summarise the topology of an undirected network.

    If the graph is disconnected, every metric is computed on its largest
    connected component only, because diameter and average shortest path
    length are undefined across components.

    Parameters
    ----------
    G : networkx.Graph
        Undirected graph to summarise.

    Returns
    -------
    dict
        Keys "Nodos", "Aristas", "Grado promedio", "Clustering", "Diámetro"
        and "Longitud promedio". For a graph with no nodes the counts are 0
        and the remaining metrics are NaN.
    """
    # A graph without nodes has no connectivity to test (nx.is_connected
    # raises on it), so report an explicit "empty" summary instead.
    if G.number_of_nodes() == 0:
        return {
            "Nodos": 0,
            "Aristas": 0,
            "Grado promedio": np.nan,
            "Clustering": np.nan,
            "Diámetro": np.nan,
            "Longitud promedio": np.nan
        }

    if not nx.is_connected(G):
        # Restrict to the largest connected component (copied so it is an independent graph).
        G = G.subgraph(max(nx.connected_components(G), key=len)).copy()

    degrees = [d for _, d in G.degree()]
    return {
        "Nodos": G.number_of_nodes(),
        "Aristas": G.number_of_edges(),
        "Grado promedio": np.mean(degrees),
        "Clustering": nx.average_clustering(G),
        "Diámetro": nx.diameter(G),
        "Longitud promedio": nx.average_shortest_path_length(G)
    }

# Summarise both networks with red_metrics and show them side by side,
# one row per network.
metrics1 = red_metrics(G1)
metrics2 = red_metrics(G2)

row_labels = ['Coexpresión Pearson', 'STRINGdb']
pd.DataFrame(data=[metrics1, metrics2], index=row_labels)


In [None]:

# Longest shortest path (geodesic) in the STRING network: pick the source node
# whose farthest reachable node is most distant, then the farthest node from it.
if G2.number_of_nodes() == 0:
    # No interactions were loaded, so there is no path to report.
    path = []
else:
    geo_nodes = max(nx.all_pairs_shortest_path_length(G2), key=lambda x: max(x[1].values()))
    start = geo_nodes[0]
    end = max(geo_nodes[1], key=geo_nodes[1].get)
    path = nx.shortest_path(G2, start, end)
print("Geodésica STRINGdb:")
print(path)

# GO annotations for the genes on the path, in path order.
# reindex() is used instead of .loc so that a node name that is not a label in
# `annotations` yields a NaN row rather than raising KeyError; reindex needs a
# unique index, so only the first row is kept for any duplicated label.
# NOTE(review): G2 nodes are STRING preferred names while `annotations` is
# indexed by the identifiers queried in mygene - confirm both use the same
# namespace, otherwise every row here will be NaN.
unique_annotations = annotations[~annotations.index.duplicated(keep='first')]
geo_annots = unique_annotations.reindex(index=path, columns=['go.BP', 'go.MF', 'go.CC'])
geo_annots
