In [254]:
import pandas as pd
import plotly.express as px
import re

In [255]:
# Directory that holds the risk-factor CSV tables loaded below (relative path).
risk_factor = "../data/Kaggle/target_tables/8_risk_factors"

In [256]:
# Load the "Age" risk-factor table from the risk_factor directory.
df_Age = pd.read_csv(risk_factor + "/Age.csv")

In [257]:
# Load the "Diabetes" risk-factor table from the risk_factor directory.
df_Diabetes = pd.read_csv(risk_factor + "/Diabetes.csv")

In [258]:
# Load the "Overweight or obese" risk-factor table from the risk_factor directory.
df_overweight = pd.read_csv(risk_factor + "/Overweight or obese.csv")

In [259]:
# Display the column labels exactly as read from Age.csv. The preparation loop
# further down calls rename_columns, which replaces the spaces with underscores
# in place, so this output reflects the labels *before* that step.
df_Age.columns

Index(['Unnamed: 0', 'Date', 'Study', 'Study Link', 'Journal', 'Severe',
       'Severe lower bound', 'Severe upper bound', 'Severe p-value',
       'Severe Significant', 'Severe Adjusted', 'Severe Calculated',
       'Fatality', 'Fatality lower bound', 'Fatality upper bound',
       'Fatality p-value', 'Fatality Significant', 'Fatality Adjusted',
       'Fatality Calculated', 'Multivariate adjustment', 'Study Type',
       'Sample Size', 'Study Population', 'Added on', 'Critical only',
       'Discharged vs. death?'],
      dtype='object')

In [260]:
# Map a display name to each loaded table. The values are the same DataFrame
# objects as df_Age / df_Diabetes / df_overweight (no copy is made), so the
# in-place preparation done in the loop further down also changes those frames.
datasets = {
    'Age Data': df_Age,
    'Diabetes Data': df_Diabetes,
    'Overweight Data': df_overweight
}

In [None]:
import plotly.graph_objects as go
def severe_Fatality_plot_districution(dataset, name):
    """Show two overlaid-histogram figures for one risk-factor table.

    The first figure uses the ``Severe_number`` / ``Severe_p-value`` columns,
    the second the ``Fatality_number`` / ``Fatality_p-value`` columns. For each
    figure, rows missing either of its two columns are dropped first. Every
    figure also carries a dashed vertical line at x = 0.05 and a text label
    next to it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the four columns named above.
    name : str
        Label inserted into the figure titles.

    Returns
    -------
    None
        The figures are displayed with ``fig.show()``.
    """
    panels = (
        {
            "value_column": "Severe_number",
            "pvalue_column": "Severe_p-value",
            "value_trace": "Sévérité",
            "pvalue_trace": "p-value Sévérité",
            "title": f"Distribution de la Sévérité et de la p-value pour {name}",
        },
        {
            "value_column": "Fatality_number",
            "pvalue_column": "Fatality_p-value",
            "value_trace": "Létalité",
            "pvalue_trace": "p-value Létalité",
            "title": f"Distribution de la Létalité et de la p-value pour {name}",
        },
    )

    for panel in panels:
        value_column = panel["value_column"]
        pvalue_column = panel["pvalue_column"]

        # Keep only the rows where both the value and its p-value are present.
        complete_rows = dataset.dropna(subset=[value_column, pvalue_column]).copy()

        figure = go.Figure()

        # Histogram of the extracted numeric values (default colour).
        value_histogram = go.Histogram(
            x=complete_rows[value_column],
            name=panel["value_trace"],
            opacity=0.6
        )
        figure.add_trace(value_histogram)

        # Histogram of the p-values, drawn in red on the same axes.
        pvalue_histogram = go.Histogram(
            x=complete_rows[pvalue_column],
            name=panel["pvalue_trace"],
            opacity=0.6,
            marker_color="red"
        )
        figure.add_trace(pvalue_histogram)

        # Vertical dashed line at p = 0.05 spanning the full plot height
        # (y is expressed in "paper" coordinates, 0 to 1).
        figure.add_shape(
            type="line",
            x0=0.05, x1=0.05,
            y0=0, y1=1,
            yref="paper",
            line=dict(color="black", width=2, dash="dash")
        )

        # Text label placed just beside the threshold line.
        figure.add_annotation(
            x=0.17,
            y=0.9,
            yref="paper",
            text="Seuil de signification (p = 0.05)",
            showarrow=False,
            font=dict(color="black", size=12)
        )

        figure.update_layout(
            title=panel["title"],
            xaxis_title="Valeurs",
            yaxis_title="Fréquence",
            barmode="overlay",
        )

        figure.show()

In [None]:

def sample_size_plot_distribution(dataset,name):
    """Display a green box plot of the ``Sample_Size`` column.

    Rows whose ``Sample_Size`` is missing are dropped before plotting.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing a ``Sample_Size`` column.
    name : str
        Label inserted into the figure title.

    Returns
    -------
    None
        The figure is displayed with ``show()``.
    """
    rows_with_size = dataset.dropna(subset=["Sample_Size"]).copy()

    box_figure = px.box(
        rows_with_size,
        y="Sample_Size",
        title=f"Distribution de la Taille de l'Échantillon pour {name}",
        color_discrete_sequence=["green"],
    )
    box_figure.show()


In [263]:
def rename_columns(dataset):
    """Replace every space in the column labels of ``dataset`` with an underscore.

    The frame is modified in place (its ``columns`` attribute is reassigned);
    nothing is returned.
    """
    underscored = []
    for label in dataset.columns:
        underscored.append(label.replace(" ", "_"))
    dataset.columns = underscored

In [None]:
def proportion_presence(dataset,name):
    """Display pie charts of missing vs. present values for two columns.

    One pie chart is shown for the ``Severe`` column and then one for the
    ``Fatality`` column; each has a "NaN" slice and a "Non NaN" slice.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing ``Severe`` and ``Fatality`` columns.
    name : str
        Label inserted into the chart titles.

    Returns
    -------
    None
        The charts are displayed with ``show()``.
    """
    charts = (
        ("Severe", f"Proportion de NaN dans Severe pour {name}"),
        ("Fatality", f"Proportion de NaN dans Fatality pour {name}"),
    )

    for column, chart_title in charts:
        missing_total = dataset[column].isna().sum()
        present_total = dataset[column].notna().sum()

        counts = pd.DataFrame({
            "Type": ["NaN", "Non NaN"],
            "Count": [missing_total, present_total]
        })

        pie = px.pie(counts, names="Type", values="Count", title=chart_title)
        pie.show()

In [None]:
def proportion_significant(dataset,name):
    """Display pie charts of significant vs. non-significant results.

    One chart is shown for ``Severe_Significant`` and then one for
    ``Fatality_Significant``. A row counts as "significant" when the cell is
    exactly ``"Significant"``; it counts as "non_significant" when the cell
    holds any other non-missing value (missing cells are not counted, because
    ``count()`` skips them).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the two ``*_Significant`` columns.
    name : str
        Label inserted into the chart titles.

    Returns
    -------
    None
        The charts are displayed with ``show()``.
    """
    slice_colors = {"significant": "royalblue", "non_significant": "lightgray"}
    charts = (
        ("Severe_Significant",
         f"Proportion de Significant et Non-Significant pour {name}"),
        ("Fatality_Significant",
         f"Proportion de Significant et Non-Significant Léthale pour {name}"),
    )

    for column, chart_title in charts:
        flags = dataset[column]
        significant_total = flags[flags == "Significant"].count()
        other_total = flags[flags != "Significant"].count()

        counts = pd.DataFrame({
            "Type": ["significant", "non_significant"],
            "Count": [significant_total, other_total]
        })

        pie = px.pie(
            counts,
            names="Type",
            values="Count",
            title=chart_title,
            color="Type",
            color_discrete_map=slice_colors,
        )
        pie.show()



In [None]:
def extract_sample_studies(dataset):
    """Parse the free-text ``Sample_Size`` column into numeric ``Sample_Size`` and ``Studies``.

    For a text cell, thousands separators (",") are removed and all runs of
    digits are collected:

    * exactly two numbers -> the first is stored in ``Studies`` and the second
      in ``Sample_Size``;
    * exactly one number  -> it is stored in ``Sample_Size`` and ``Studies`` is 0;
    * anything else       -> ``Sample_Size`` is missing and ``Studies`` is 0.

    The function is safe to run more than once on the same frame: a cell that
    is already a (non-missing) number is kept as the sample size, together with
    the row's existing ``Studies`` value when that column is present. Without
    this, re-executing the preparation cell would erase every parsed value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with a ``Sample_Size`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with ``Sample_Size`` replaced and
        ``Studies`` added (or refreshed).
    """
    def extract_values(sample, known_studies):
        # Text cell: pull the digit groups out of the string.
        if isinstance(sample, str):
            numbers = re.findall(r"\d+", sample.replace(",", ""))
            if len(numbers) == 2:
                return int(numbers[1]), int(numbers[0])
            if len(numbers) == 1:
                return int(numbers[0]), 0
            return None, 0

        # Already-parsed cell (second run): keep the number and its study count.
        already_numeric = (
            pd.api.types.is_number(sample)
            and not isinstance(sample, bool)
            and not pd.isna(sample)
        )
        if already_numeric:
            studies = 0 if pd.isna(known_studies) else known_studies
            return sample, studies

        return None, 0

    raw_sizes = dataset["Sample_Size"].tolist()
    if "Studies" in dataset.columns:
        known_studies = dataset["Studies"].tolist()
    else:
        known_studies = [0] * len(raw_sizes)

    parsed = [
        extract_values(sample, studies)
        for sample, studies in zip(raw_sizes, known_studies)
    ]

    # Build each column once instead of creating one pd.Series per row;
    # reusing dataset.index keeps the values aligned with their rows.
    dataset["Sample_Size"] = pd.Series([size for size, _ in parsed], index=dataset.index)
    dataset["Studies"] = pd.Series([studies for _, studies in parsed], index=dataset.index)

    return dataset

In [267]:
def _first_number(text):
    """Return the first integer or decimal number found in ``text`` as a float, or None."""
    found = re.search(r"\d+(\.\d+)?", text)
    return float(found.group()) if found else None


# Prepare each table in place, then render all of its diagnostic figures.
for name, dataset in datasets.items():
    rename_columns(dataset)
    dataset = extract_sample_studies(dataset)

    # Derive a numeric column from the free-text effect columns; cells are cast
    # to str first, so a missing cell yields no match and therefore None.
    for text_column, number_column in (
        ("Severe", "Severe_number"),
        ("Fatality", "Fatality_number"),
    ):
        dataset[number_column] = dataset[text_column].astype(str).apply(_first_number)

    severe_Fatality_plot_districution(dataset,name)
    sample_size_plot_distribution(dataset,name)
    proportion_presence(dataset,name)
    proportion_significant(dataset,name)

# Rebind the module-level names to the prepared tables held in the dict.
df_Age = datasets["Age Data"]
df_Diabetes = datasets["Diabetes Data"]
df_overweight = datasets["Overweight Data"]

10


22


26


36


24


13
