In [None]:
import pandas as pd
import networkx as nx

In [None]:
# Load the space-separated protein interaction table (presumably a STRING
# "protein links detailed" export; 10090 looks like the organism's taxonomy id,
# Mus musculus — TODO confirm).
string_df_10090_dt = pd.read_csv("organismos_string/10090_protein_detailed.txt", sep=" ")
string_df_10090_dt

In [None]:
# Keep only interactions backed by experimental evidence or curated databases,
# i.e. rows where at least one of the two score columns is positive.
df_curated = string_df_10090_dt[
    string_df_10090_dt[['experimental', 'database']].gt(0).any(axis=1)
]
df_curated

## Funções dos grafos

In [None]:
# Interaction table used by the graph-building cells below: the curated subset.
# (An earlier variant used `string_df_4932`, which is not defined in the visible
# cells of this notebook.)

protein_interaction_df = df_curated

In [None]:
# Assign an integer node id to every protein that appears in either column of
# the interaction table.
#
# The identifiers are sorted before numbering: iterating a raw `set` of strings
# yields an order that changes between kernel sessions (string hashing is
# randomized per process), which would make the ids — and every file derived
# from them — irreproducible. Identifiers are assumed to be non-null strings.

protein_map = {
    protein: node_id
    for node_id, protein in enumerate(
        sorted(
            set(protein_interaction_df["protein1"]).union(protein_interaction_df["protein2"])
        )
    )
}


print(len(protein_map))

In [None]:
# Inspect the protein -> node-id mapping
protein_map

In [None]:
# Number of distinct proteins (same value as printed when the mapping was built)
len(protein_map)

In [None]:
# Map protein identifiers to integer node ids so the graph can be built

def mapProtein(protein_interaction_df, protein_map):
    """Return a frame with `protein1`/`protein2` translated through `protein_map`.

    Only those two columns are kept, and the row index of
    `protein_interaction_df` is preserved. Identifiers that are missing from
    `protein_map` come out as NaN (the behaviour of `Series.map` with a dict).
    """
    return pd.DataFrame({
        column: protein_interaction_df[column].map(protein_map)
        for column in ('protein1', 'protein2')
    })

In [None]:
# Apply the mapping to the interaction table and display the result
protein_interation_masked = mapProtein(protein_interaction_df, protein_map)
protein_interation_masked

In [None]:
def generateGraph(protein_interation_masked):
    """Build a NetworkX graph from the mapped edge list.

    Each row of `protein_interation_masked` contributes one edge between the
    node ids found in its `protein1` and `protein2` columns.
    """
    edge_columns = {"source": "protein1", "target": "protein2"}
    return nx.from_pandas_edgelist(protein_interation_masked, **edge_columns)

In [None]:
def generateDF(protein_interaction_graph):
    """Export the graph's edges as a pandas edge-list DataFrame."""
    edge_list_df = nx.to_pandas_edgelist(protein_interaction_graph)
    return edge_list_df

In [None]:
# Build the PPI graph and convert it back to an edge-list frame for inspection

graph = generateGraph(protein_interation_masked)
df_graph = generateDF(graph)
df_graph

In [None]:
# Centrality measures
import time

# Start of the wall-clock timer. It is read again in the clustering cell, so the
# measured time also covers the centrality cells in between (and any idle time
# between running them).
seconds_ini = time.time()

# Degree centrality
degree = nx.degree_centrality(graph)

In [None]:
# Eigenvector centrality (computed with the library's default settings)

eigenvector = nx.eigenvector_centrality(graph)

In [None]:
# Betweenness centrality, approximated from a sample of k source nodes
# (k was chosen with the size of this network in mind).
#
# k is clamped to the node count because asking for more sample nodes than the
# graph has raises an error, and the sampler is seeded so the estimate is the
# same on every run.

betweenness = nx.betweenness_centrality(
    graph,
    k=min(380, graph.number_of_nodes()),
    seed=42,
)


In [None]:
# Closeness centrality, computed one node at a time so progress can be reported

closeness = {}

# Iterate over the graph's own nodes rather than range(len(protein_map)), so the
# loop cannot drift out of sync with the graph it is measuring.
for position, node in enumerate(graph.nodes, start=1):
    closeness[node] = nx.closeness_centrality(graph, u=node)
    # Report progress occasionally instead of printing one line per node.
    if position % 1000 == 0:
        print(position, "nodes processed")

In [None]:
# Inspect the betweenness values
betweenness

In [None]:
# Clustering coefficient
clustering = nx.clustering(graph)

seconds_fini = time.time()

# The difference of two timestamps is an elapsed duration, not a time since epoch.
print("Elapsed seconds =", seconds_fini - seconds_ini)

In [None]:
# Assemble the per-protein feature table
from collections import OrderedDict

# Every metric is a dict keyed by node id, so the columns are aligned on that
# key instead of on list position: a node missing from one metric then shows up
# as NaN in that column rather than silently shifting all following values onto
# the wrong proteins.
feature_columns = {
    # Inverse of protein_map: node id -> protein identifier.
    "Protein_key": {node_id: protein for protein, node_id in protein_map.items()},
    "DegreeCentrality": degree,
    "EigenvectorCentrality": eigenvector,
    "BetweennessCentrality": betweenness,
    "ClosenessCentrality": closeness,
    "Clustering": clustering,
}

# Rows end up ordered by node id with a fresh 0..n-1 index.
protein_features = (
    pd.DataFrame({name: pd.Series(values) for name, values in feature_columns.items()})
    .sort_index()
    .reset_index(drop=True)
)
                           

In [None]:
# Inspect the assembled feature table
protein_features

In [None]:
# Save the unlabelled feature table (intermediate file, reloaded by the labelling step)
protein_features.to_csv('data_org/proteins_features_musculus_tmp.csv', index=False)

In [None]:
# Label the essential proteins

# Essential-gene table; the cells below use its Code_Organism and Locus columns
dataset_essential = pd.read_csv('essential_genes.csv')

# Reload the feature table from the intermediate CSV written earlier
protein_features = pd.read_csv('data_org/proteins_features_musculus_tmp.csv')

# Preview the rows for organism code DEG2005
dataset_essential[dataset_essential['Code_Organism'] == 'DEG2005']

In [None]:
# Inspect the full essential-gene table
dataset_essential

In [None]:
# Loci listed under organism code DEG2005 in the essential-gene table
sac_proteins = list(dataset_essential[dataset_essential['Code_Organism'] == 'DEG2005']['Locus'])

# Vectorised membership test: 1 when the protein key is among those loci, else 0.
# (Replaces a row-wise apply that re-scanned the whole list for every protein.)
protein_features['is_essential'] = protein_features['Protein_key'].isin(sac_proteins).astype(int)

In [None]:
# Inspect the table with the new is_essential column
protein_features

In [None]:
# Class balance: how many proteins are labelled 1 versus 0
protein_features['is_essential'].value_counts()

In [None]:
# Save the labelled feature table
protein_features.to_csv('data_org/proteins_features_musculus.csv', index=False)

In [None]:
# Draw the whole PPI graph

# Source: https://networkx.org/nx-guides/content/exploratory_notebooks/facebook_notebook.html

from random import randint
import matplotlib.pyplot as plt

# Fixed seed: the spring layout starts from random positions, so without it the
# picture changes on every run.
pos = nx.spring_layout(graph, seed=42)


plt.figure(figsize=(15,10))
plt.axis('off')
nx.draw_networkx(graph, pos=pos, node_size=12, with_labels=False, width=0.15)

### Visualização de dados

In [None]:
# Load a labelled feature table for plotting.
# NOTE(review): this reads the "droso" file, not the musculus file written
# earlier in this notebook — confirm the file exists and is the intended dataset.
df = pd.read_csv("data_org/proteins_features_droso.csv")
df

In [None]:
# Rows labelled as essential (is_essential == 1)
df_essential = df[df['is_essential'] == 1]
df_essential

In [None]:
# Rows labelled as non-essential (is_essential == 0)
df_non_essential = df[df['is_essential'] == 0]
df_non_essential

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Default size for the figure created below: 16 x 12 inches.
plt.rcParams['figure.figsize'] = (16,12)

fig = plt.figure(1)

sns.set_theme(style="whitegrid", palette="dark")

# Figure-level title. Calling plt.title() before any subplot exists attaches the
# title to an implicit full-figure axes, which is then either discarded or left
# as a stray axes behind the panels once the subplots are added.
fig.suptitle('Drosophila melanogaster Data')

# One panel per feature: (subplot position, column, extra histplot arguments).
panels = [
    (321, "DegreeCentrality", {}),
    (322, "EigenvectorCentrality", {}),
    (323, "BetweennessCentrality", {"bins": 100}),
    (324, "ClosenessCentrality", {}),
    (325, "Clustering", {}),
]

for position, column, extra_kwargs in panels:
    ax = fig.add_subplot(position)
    # common_norm=False normalises each is_essential group separately, so the
    # two distributions are comparable despite different group sizes.
    sns.histplot(data=df, x=column, hue="is_essential",
                 stat="probability", common_norm=False, kde=True,
                 ax=ax, **extra_kwargs)

fig.subplots_adjust(top=0.95, bottom=0.05, left=0.10, right=0.95, hspace=0.4,
                    wspace=0.25)

fig.savefig("results_drosophila.jpg")
plt.show()
