In [1]:
import pandas as pd 
df=pd.read_csv("filtered_dataset3.csv", encoding='latin1')
df.dropna(inplace=True)
df = df[['PREDICATE', 'SUBJECT_NAME', 'OBJECT_NAME']]
filtre = ['PREVENTS']
df = df[df['PREDICATE'].isin(filtre) | (df['OBJECT_NAME'] == 'breast cancer prevention')].drop_duplicates()
print(df)

       PREDICATE                 SUBJECT_NAME  \
111     PREVENTS                    Melatonin   
125     PREVENTS        Therapeutic procedure   
137     PREVENTS     Gonadal Steroid Hormones   
164     PREVENTS         Estrogen Antagonists   
242     PREVENTS                     Vaccines   
...          ...                          ...   
251940  PREVENTS  estrogens, conjugated (USP)   
252089  PREVENTS        Acupuncture procedure   
252534  PREVENTS           Beta-Cryptoxanthin   
252554  PREVENTS             cyclophosphamide   
252873  PREVENTS        phosphatidylinositols   

                                   OBJECT_NAME  
111               Malignant neoplasm of breast  
125               Malignant neoplasm of breast  
137               Malignant neoplasm of breast  
164               Malignant neoplasm of breast  
242               Malignant neoplasm of breast  
...                                        ...  
251940            Malignant neoplasm of breast  
252089            M

In [2]:
import random
import networkx as nx
import pandas as pd

G = nx.Graph()
for _, row in df.iterrows():
    G.add_edge(row['SUBJECT_NAME'], row['OBJECT_NAME'], relation=row['PREDICATE'])

def random_walk(graph, start_node, num_steps):
    current_node = start_node
    visited_nodes = {current_node: 1}
    
    for _ in range(num_steps):
        neighbors = list(graph.neighbors(current_node))
        if not neighbors:
            break
        next_node = random.choice(neighbors)
        if next_node in visited_nodes:
            visited_nodes[next_node] += 1
        else:
            visited_nodes[next_node] = 1
        current_node = next_node
    
    return visited_nodes

def calculate_factuality_scores(df, num_steps=1000):
    scores = {}
    
    subject_nodes = df['SUBJECT_NAME'].unique()
    
    for subject in subject_nodes:
        visited = random_walk(G, subject, num_steps)
        
        for _, row in df.iterrows():
            subject = row['SUBJECT_NAME']
            obj = row['OBJECT_NAME']
            total_visits = sum(visited.values())
            score = visited.get(obj, 0) / total_visits if total_visits > 0 else 0
            scores[(subject, obj)] = score
    
    return scores

factuality_scores = calculate_factuality_scores(df)


df['Factuality_Score'] = df.apply(lambda row: factuality_scores.get((row['SUBJECT_NAME'], row['OBJECT_NAME']), 0), axis=1)

output_file_path = 'score.csv' 
df.to_csv(output_file_path, index=False)

print(f"Les scores de factualité ont été ajoutés et sauvegardés dans '{output_file_path}'.")

Les scores de factualité ont été ajoutés et sauvegardés dans 'score.csv'.


In [3]:
import pandas as pd

file_path = 'score.csv'
df = pd.read_csv(file_path)

female_breast_cancer_types = [
    "Malignant neoplasm of breast", "Breast Carcinoma", "Breast cancer metastatic", "Triple Negative Breast Neoplasms",
    "Breast cancer invasive NOS", "Noninfiltrating Intraductal Carcinoma", "Secondary malignant neoplasm of breast",
    "Carcinoma breast stage IV", "Inflammatory Breast Carcinoma", "estrogen receptor-positive breast cancer",
    "HER2-positive carcinoma of breast", "Carcinoma, Ductal, Breast", "Sporadic Breast Carcinoma", "Invasive carcinoma of breast",
    "Female Breast Carcinoma", "Familial cancer of breast", "Contralateral breast cancer", "Ductal Breast Carcinoma",
    "Breast Cancer, Familial", "Basal-Like Breast Carcinoma", "Early-Stage Breast Carcinoma", "contralateral breast cancer",
    "malignant neoplasm of breast staging", "estrogen receptor-negative breast cancer", "Luminal B Breast Carcinoma",
    "Hereditary Breast and Ovarian Cancer Syndrome", "Carcinoma in situ of breast", "Invasive Ductal Breast Carcinoma",
    "bilateral breast cancer", "Locally advanced breast cancer", "Malignant neoplasm of female breast", "HER2-negative breast cancer",
    "Breast cancer stage II", "Triple-Negative Breast Carcinoma", "Breast cancer stage III", "Lobular carcinoma in situ of breast",
    "Advanced breast cancer diagnosis", "Stage 0 Breast Carcinoma", "Cancer en cuirasse", "Stage IV Inflammatory Breast Carcinoma",
    "Right-Sided Breast Neoplasms", "hereditary breast/ovarian cancer - BRCA1", "Stage 0 Breast Cancer AJCC v6 and v7",
    "Unilateral Breast Carcinoma", "Breast Mucinous Carcinoma", "cellular diagnosis, breast cancer", "Carcinoma breast stage I",
    "Papillary carcinoma of the breast"
]

male_breast_cancer_types = ["Carcinoma of Male Breast"]

recurrent_breast_cancer_types = ["Breast cancer recurrent", "Recurrent Breast Carcinoma"]
def classify_cancer_type(row):
    if row['OBJECT_NAME'] in female_breast_cancer_types:
        return "Female Breast Cancer"
    elif row['OBJECT_NAME'] in male_breast_cancer_types:
        return "Male Breast Cancer"
    elif row['OBJECT_NAME'] in recurrent_breast_cancer_types:
        return "Recurrent Breast Cancer"
    else:
        return "Other"
df['Cancer_Type'] = df.apply(classify_cancer_type, axis=1)
print(df.head())
df.to_csv('score.csv', index=False)

  PREDICATE              SUBJECT_NAME                   OBJECT_NAME  \
0  PREVENTS                 Melatonin  Malignant neoplasm of breast   
1  PREVENTS     Therapeutic procedure  Malignant neoplasm of breast   
2  PREVENTS  Gonadal Steroid Hormones  Malignant neoplasm of breast   
3  PREVENTS      Estrogen Antagonists  Malignant neoplasm of breast   
4  PREVENTS                  Vaccines  Malignant neoplasm of breast   

   Factuality_Score           Cancer_Type  
0          0.297702  Female Breast Cancer  
1          0.297702  Female Breast Cancer  
2          0.297702  Female Breast Cancer  
3          0.297702  Female Breast Cancer  
4          0.297702  Female Breast Cancer  


In [4]:
import pandas as pd

# Load the DataFrame
df = pd.read_csv('score.csv', encoding='latin1')

# Define the categories and keywords
categories = {
    'Lifestyle Factors': ['diet', 'exercise', 'lifestyle', 'activity', 'smoking', 'alcohol'],
    'Medications': ['drug', 'pharmaceutical', 'medication', 'aspirin', 'statin', 'metformin', 'chemo', 'antibiotic'],
    'Medical Procedures': ['surgery', 'treatment', 'procedure', 'therapy', 'transplant', 'biopsy', 'mastectomy', 'chemotherapy', 'radiotherapy'],
    'Behavioral Interventions': ['counseling', 'therapy', 'behavioral', 'psychotherapy'],
    'Supplements': ['vitamin', 'mineral', 'supplement', 'herb', 'nutraceutical', 'probiotic', 'omega-3', 'calcium', 'iron'],
    'Vaccines': ['vaccine', 'vaccination', 'immunization'],
    'Environmental Factors': ['exposure', 'pollution', 'radiation', 'chemical', 'asbestos', 'pesticide'],
    'Genetic Factors': ['brca1', 'brca2', 'genetic', 'mutation', 'hereditary'],
    'Hormonal Treatments': ['hormone', 'estrogen', 'progesterone', 'androgen', 'tamoxifen'],
    'Alternative Therapies': ['acupuncture', 'homeopathy', 'chiropractic', 'naturopathy', 'yoga', 'meditation'],
    'Dietary Interventions': ['diet', 'nutrition', 'caloric', 'ketogenic', 'plant-based', 'fiber'],
    'Physical Activity': ['exercise', 'workout', 'fitness', 'aerobic', 'anaerobic'],
    'Screening and Monitoring': ['screening', 'monitoring', 'mammogram', 'biopsy', 'self-exam'],
    'Public Health Measures': ['quarantine', 'isolation', 'social distancing', 'sanitization', 'lockdown'],
    'Personal Protective Equipment': ['mask', 'glove', 'face shield', 'gown']
}

def classify_subject(subject_name):
    subject_name_lower = subject_name.lower()
    for category, keywords in categories.items():
        if any(keyword in subject_name_lower for keyword in keywords):
            return category
    return 'Other'  # Default category if no keyword matches

# Apply the classification
df['Category'] = df['SUBJECT_NAME'].apply(classify_subject)

# Display the results
print(df)

# Save the results to a new CSV file
output_file = 'score.csv'
df.to_csv(output_file, index=False)


     PREDICATE                 SUBJECT_NAME  \
0     PREVENTS                    Melatonin   
1     PREVENTS        Therapeutic procedure   
2     PREVENTS     Gonadal Steroid Hormones   
3     PREVENTS         Estrogen Antagonists   
4     PREVENTS                     Vaccines   
...        ...                          ...   
1172  PREVENTS  estrogens, conjugated (USP)   
1173  PREVENTS        Acupuncture procedure   
1174  PREVENTS           Beta-Cryptoxanthin   
1175  PREVENTS             cyclophosphamide   
1176  PREVENTS        phosphatidylinositols   

                                 OBJECT_NAME  Factuality_Score  \
0               Malignant neoplasm of breast          0.297702   
1               Malignant neoplasm of breast          0.297702   
2               Malignant neoplasm of breast          0.297702   
3               Malignant neoplasm of breast          0.297702   
4               Malignant neoplasm of breast          0.297702   
...                                    

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the DataFrame
df = pd.read_csv("score.csv", encoding='latin1')

# Combine SUBJECT_NAME and OBJECT_NAME into a single text feature
df['text'] = df['SUBJECT_NAME'] + " " + df['OBJECT_NAME']

# Define features and labels
X = df['text']
y = df['Category']

# Separate data into rows with known labels and rows with 'Other'
known_labels_df = df[df['Category'] != 'Other']
unknown_labels_df = df[df['Category'] == 'Other']

# Split known labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    known_labels_df['text'], known_labels_df['Category'], test_size=0.3, random_state=42
)

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Predict categories for rows labeled as 'Other'
X_unknown_tfidf = vectorizer.transform(unknown_labels_df['text'])
unknown_labels_df['Category'] = model.predict(X_unknown_tfidf)

# Combine the DataFrames
df = pd.concat([known_labels_df, unknown_labels_df], axis=0)

# Save the results to a new CSV file
df.to_csv("score.csv", index=False)


Accuracy: 0.7191011235955056


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_labels_df['Category'] = model.predict(X_unknown_tfidf)


In [3]:
import pandas as pd
df = pd.read_csv("score.csv", encoding='latin1')
df = df.drop(columns=['text'])

# Enregistrer le DataFrame modifié dans un nouveau fichier CSV
df.to_csv('score.csv', index=False)

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv' 
df = pd.read_csv(file_path)



# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour créer et sauvegarder les graphes
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Male Breast Cancer')]
    
    # Créer le graphe pour la catégorie
    G_category = nx.Graph()
    for _, row in df_category.iterrows():
        G_category.add_edge(row['SUBJECT_NAME'], row['Cancer_Type'], relation=row['Factuality_Score'])
    
    # Visualiser le graphe
    plt.figure(figsize=(11, 5))
    pos = nx.spring_layout(G_category, seed=42)
    
    # Dessiner les nœuds et les arêtes
    nx.draw(G_category, pos, with_labels=True, node_color='lightblue', node_size=900, edge_color='gray', font_size=10, font_weight='bold', alpha=0.9)
    
    # Dessiner les étiquettes des arêtes
    edge_labels = nx.get_edge_attributes(G_category, 'relation')
    nx.draw_networkx_edge_labels(G_category, pos, edge_labels=edge_labels)
    
    
    # Enregistrer l'image du graphe
    file_name = f'assets/graph/prevents_male/male_cancer_graph_{category.replace(" ", "_")}.png'
    plt.savefig(file_name, dpi=110, bbox_inches='tight')
    plt.close()  # Fermer la figure pour libérer la mémoire


In [2]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv' 
df = pd.read_csv(file_path)

# Créer le répertoire 'static' s'il n'existe pas
if not os.path.exists('static'):
    os.makedirs('static')

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour créer et sauvegarder les graphes
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Female Breast Cancer')]
    
    # Créer le graphe pour la catégorie
    G_category = nx.Graph()
    for _, row in df_category.iterrows():
        G_category.add_edge(row['SUBJECT_NAME'], row['Cancer_Type'], relation=row['Factuality_Score'])
    
    # Visualiser le graphe
    plt.figure(figsize=(11, 5))
    pos = nx.spring_layout(G_category, seed=42)
    
    # Dessiner les nœuds et les arêtes
    nx.draw(G_category, pos, with_labels=True, node_color='lightblue', node_size=900, edge_color='gray', font_size=10, font_weight='bold', alpha=0.9)
    
    # Dessiner les étiquettes des arêtes
    edge_labels = nx.get_edge_attributes(G_category, 'relation')
    nx.draw_networkx_edge_labels(G_category, pos, edge_labels=edge_labels)
    

    
    # Enregistrer l'image du graphe
    file_name = f'assets/graph/prevents_female/female_cancer_graph_{category.replace(" ", "_")}.png'
    plt.savefig(file_name, dpi=110, bbox_inches='tight')
    plt.close()  

In [3]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv' 
df = pd.read_csv(file_path)

# Créer le répertoire 'static' s'il n'existe pas
if not os.path.exists('static'):
    os.makedirs('static')

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour créer et sauvegarder les graphes
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Recurrent Breast Cancer')]
    
    # Créer le graphe pour la catégorie
    G_category = nx.Graph()
    for _, row in df_category.iterrows():
        G_category.add_edge(row['SUBJECT_NAME'], row['Cancer_Type'], relation=row['Factuality_Score'])
    
    # Visualiser le graphe
    plt.figure(figsize=(1, 5))
    pos = nx.spring_layout(G_category, seed=42)
    
    # Dessiner les nœuds et les arêtes
    nx.draw(G_category, pos, with_labels=True, node_color='lightblue', node_size=900, edge_color='gray', font_size=10, font_weight='bold', alpha=0.9)
    
    # Dessiner les étiquettes des arêtes
    edge_labels = nx.get_edge_attributes(G_category, 'relation')
    nx.draw_networkx_edge_labels(G_category, pos, edge_labels=edge_labels)
    

    
    # Enregistrer l'image du graphe
    file_name = f'assets/graph/prevents_reccu/reccu_cancer_graph_{category.replace(" ", "_")}.png'
    plt.savefig(file_name, dpi=110, bbox_inches='tight')
    plt.close()  

In [2]:
from pyvis.network import Network
import pandas as pd
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv'
df = pd.read_csv(file_path)

# Créer le répertoire 'graphs' s'il n'existe pas
output_dir = 'assets/graph/prevent_female'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour créer et sauvegarder les graphes
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Female Breast Cancer')]
    
    # Créer le graphe pour la catégorie
    net = Network(notebook=True)
    for _, row in df_category.iterrows():
        # Formater la factualité avec 5 décimales
        formatted_score = f"{row['Factuality_Score']:.7f}"
        
        # Ajouter les nœuds
        net.add_node(row['SUBJECT_NAME'], title=row['SUBJECT_NAME'], size=15, color='lightblue')
        net.add_node(row['Cancer_Type'], title=row['Cancer_Type'], size=15, color='lightblue')
        
        # Ajouter les arêtes avec labels
        net.add_edge(
            row['SUBJECT_NAME'], 
            row['Cancer_Type'], 
            title=f"Factuality Score: {formatted_score}", 
            label=formatted_score,  
            color='gray'
        )
    
    # Sauvegarder le graphe dans le répertoire 'graphs'
    file_path = os.path.join(output_dir, f'{category}_female_breast_cancer_graph.html')
    net.show(file_path)


assets/graph/prevent_female\Medical Procedures_female_breast_cancer_graph.html
assets/graph/prevent_female\Hormonal Treatments_female_breast_cancer_graph.html
assets/graph/prevent_female\Vaccines_female_breast_cancer_graph.html
assets/graph/prevent_female\Medications_female_breast_cancer_graph.html
assets/graph/prevent_female\Lifestyle Factors_female_breast_cancer_graph.html
assets/graph/prevent_female\Supplements_female_breast_cancer_graph.html
assets/graph/prevent_female\Screening and Monitoring_female_breast_cancer_graph.html
assets/graph/prevent_female\Environmental Factors_female_breast_cancer_graph.html
assets/graph/prevent_female\Genetic Factors_female_breast_cancer_graph.html
assets/graph/prevent_female\Dietary Interventions_female_breast_cancer_graph.html


In [3]:
from pyvis.network import Network
import pandas as pd
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv'
df = pd.read_csv(file_path)

# Créer le répertoire 'graphs' s'il n'existe pas
output_dir = 'assets/graph/prevent_male'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour créer et sauvegarder les graphes
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Male Breast Cancer')]
    
    # Créer le graphe pour la catégorie
    net = Network(notebook=True)
    for _, row in df_category.iterrows():
        # Formater la factualité avec 5 décimales
        formatted_score = f"{row['Factuality_Score']:.7f}"
        
        # Ajouter les nœuds
        net.add_node(row['SUBJECT_NAME'], title=row['SUBJECT_NAME'], size=15, color='lightblue')
        net.add_node(row['Cancer_Type'], title=row['Cancer_Type'], size=15, color='lightblue')
        
        # Ajouter les arêtes avec labels
        net.add_edge(
            row['SUBJECT_NAME'], 
            row['Cancer_Type'], 
            title=f"Factuality Score: {formatted_score}", 
            label=formatted_score,  
            color='gray'
        )
    
    # Sauvegarder le graphe dans le répertoire 'graphs'
    file_path = os.path.join(output_dir, f'{category}_male_breast_cancer_graph.html')
    net.show(file_path)


assets/graph/prevent_male\Medical Procedures_male_breast_cancer_graph.html
assets/graph/prevent_male\Hormonal Treatments_male_breast_cancer_graph.html
assets/graph/prevent_male\Vaccines_male_breast_cancer_graph.html
assets/graph/prevent_male\Medications_male_breast_cancer_graph.html
assets/graph/prevent_male\Lifestyle Factors_male_breast_cancer_graph.html
assets/graph/prevent_male\Supplements_male_breast_cancer_graph.html
assets/graph/prevent_male\Screening and Monitoring_male_breast_cancer_graph.html
assets/graph/prevent_male\Environmental Factors_male_breast_cancer_graph.html
assets/graph/prevent_male\Genetic Factors_male_breast_cancer_graph.html
assets/graph/prevent_male\Dietary Interventions_male_breast_cancer_graph.html


In [4]:
from pyvis.network import Network
import pandas as pd
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv'
df = pd.read_csv(file_path)

# Créer le répertoire 'graphs' s'il n'existe pas
output_dir = 'assets/graph/reccu_prevent'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour créer et sauvegarder les graphes
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Recurrent Breast Cancer')]
    
    # Créer le graphe pour la catégorie
    net = Network(notebook=True)
    for _, row in df_category.iterrows():
        # Formater la factualité avec 5 décimales
        formatted_score = f"{row['Factuality_Score']:.7f}"
        
        # Ajouter les nœuds
        net.add_node(row['SUBJECT_NAME'], title=row['SUBJECT_NAME'], size=15, color='lightblue')
        net.add_node(row['Cancer_Type'], title=row['Cancer_Type'], size=15, color='lightblue')
        
        # Ajouter les arêtes avec labels
        net.add_edge(
            row['SUBJECT_NAME'], 
            row['Cancer_Type'], 
            title=f"Factuality Score: {formatted_score}", 
            label=formatted_score,  
            color='gray'
        )
    
    # Sauvegarder le graphe dans le répertoire 'graphs'
    file_path = os.path.join(output_dir, f'{category}_reccu_breast_cancer_graph.html')
    net.show(file_path)


assets/graph/reccu_prevent\Medical Procedures_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Hormonal Treatments_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Vaccines_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Medications_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Lifestyle Factors_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Supplements_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Screening and Monitoring_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Environmental Factors_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Genetic Factors_reccu_breast_cancer_graph.html
assets/graph/reccu_prevent\Dietary Interventions_reccu_breast_cancer_graph.html


In [1]:
import pandas as pd
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv'
df = pd.read_csv(file_path)

# Créer le répertoire 'assets/tables' s'il n'existe pas
output_dir = 'assets/tables/reccu_prevents'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour extraire les nœuds et sauvegarder dans des fichiers CSV
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Recurrent Breast Cancer')]
    
    # Préparer une liste pour stocker les informations des nœuds
    nodes = []

    # Ajouter les nœuds
    for _, row in df_category.iterrows():
        # Formater la factualité avec 7 décimales
        formatted_score = f"{row['Factuality_Score']:.7f}"
        
        # Ajouter le nœud pour SUBJECT_NAME
        nodes.append({
            'title': row['SUBJECT_NAME'],
            'factuality_score': formatted_score
        })
        
        # Ajouter le nœud pour Cancer_Type (si ce n'est pas déjà ajouté)
        if not any(node['title'] == row['Cancer_Type'] for node in nodes):
            nodes.append({
                'title': row['Cancer_Type'],
                'factuality_score': formatted_score
            })
    
    # Convertir la liste des nœuds en DataFrame, en retirant les doublons
    nodes_df = pd.DataFrame(nodes).drop_duplicates()
    
    # Sauvegarder les données des nœuds dans un fichier CSV spécifique à chaque catégorie
    nodes_csv_path = os.path.join(output_dir, f'nodes_with_factuality_scores_{category}.csv')
    nodes_df.to_csv(nodes_csv_path, index=False)


In [2]:
import pandas as pd
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv'
df = pd.read_csv(file_path)

# Créer le répertoire 'assets/tables' s'il n'existe pas
output_dir = 'assets/tables/female_prevents'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour extraire les nœuds et sauvegarder dans des fichiers CSV
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Female Breast Cancer')]
    
    # Préparer une liste pour stocker les informations des nœuds
    nodes = []

    # Ajouter les nœuds
    for _, row in df_category.iterrows():
        # Formater la factualité avec 7 décimales
        formatted_score = f"{row['Factuality_Score']:.7f}"
        
        # Ajouter le nœud pour SUBJECT_NAME
        nodes.append({
            'title': row['SUBJECT_NAME'],
            'factuality_score': formatted_score
        })
        
        # Ajouter le nœud pour Cancer_Type (si ce n'est pas déjà ajouté)
        if not any(node['title'] == row['Cancer_Type'] for node in nodes):
            nodes.append({
                'title': row['Cancer_Type'],
                'factuality_score': formatted_score
            })
    
    # Convertir la liste des nœuds en DataFrame, en retirant les doublons
    nodes_df = pd.DataFrame(nodes).drop_duplicates()
    
    # Sauvegarder les données des nœuds dans un fichier CSV spécifique à chaque catégorie
    nodes_csv_path = os.path.join(output_dir, f'nodes_with_factuality_scores_{category}.csv')
    nodes_df.to_csv(nodes_csv_path, index=False)


In [3]:
import pandas as pd
import os

# Charger le fichier CSV dans un DataFrame
file_path = 'score.csv'
df = pd.read_csv(file_path)

# Créer le répertoire 'assets/tables' s'il n'existe pas
output_dir = 'assets/tables/male_prevents'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Obtenir les catégories uniques
categories = df['Category'].unique()

# Itérer sur chaque catégorie pour extraire les nœuds et sauvegarder dans des fichiers CSV
for category in categories:
    # Filtrer les données pour la catégorie actuelle et le Cancer_Type spécifique
    df_category = df[(df['Category'] == category) & (df['Cancer_Type'] == 'Male Breast Cancer')]
    
    # Préparer une liste pour stocker les informations des nœuds
    nodes = []

    # Ajouter les nœuds
    for _, row in df_category.iterrows():
        # Formater la factualité avec 7 décimales
        formatted_score = f"{row['Factuality_Score']:.7f}"
        
        # Ajouter le nœud pour SUBJECT_NAME
        nodes.append({
            'title': row['SUBJECT_NAME'],
            'factuality_score': formatted_score
        })
        
        # Ajouter le nœud pour Cancer_Type (si ce n'est pas déjà ajouté)
        if not any(node['title'] == row['Cancer_Type'] for node in nodes):
            nodes.append({
                'title': row['Cancer_Type'],
                'factuality_score': formatted_score
            })
    
    # Convertir la liste des nœuds en DataFrame, en retirant les doublons
    nodes_df = pd.DataFrame(nodes).drop_duplicates()
    
    # Sauvegarder les données des nœuds dans un fichier CSV spécifique à chaque catégorie
    nodes_csv_path = os.path.join(output_dir, f'nodes_with_factuality_scores_{category}.csv')
    nodes_df.to_csv(nodes_csv_path, index=False)
