# Student Depression Analysis

### Set up

In [1]:
import plotly.express as px
import pandas as pd
# Load the survey data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/student_depression_dataset.csv")

# Palette colors and template shared by every figure below
COL_OK = "#1E88E5"  # blue → baseline / non-risk condition
COL_RISK = "#D81B60"  # magenta → risk
TEMPLATE = "presentation"  # template name passed to plotly express: large, clean fonts


def bump_fonts(fig, base=20):
    """Enlarge the fonts used for a figure's title, axes and legend.

    Parameters
    ----------
    fig
        Figure-like object exposing ``update_layout``, ``update_xaxes`` and
        ``update_yaxes``.
    base : int, default 20
        Font size for general text, the legend and the axis titles. The figure
        title uses ``base + 2`` and the tick labels ``base - 2``.

    Returns
    -------
    The ``fig`` object that was passed in, so calls can be chained.
    """
    title_size = base + 2
    tick_size = base - 2

    fig.update_layout(
        font={"size": base},
        title_font={"size": title_size},
        legend_font={"size": base},
    )
    # Both axes get the same treatment: x first, then y.
    for update_axis in (fig.update_xaxes, fig.update_yaxes):
        update_axis(title_font={"size": base}, tickfont={"size": tick_size})
    return fig


### Graph A – Age distribution

#### Most of the sample is young: roughly three-quarters of respondents are between 18 and 30 years old, with a median age of 25.

In [2]:

# GRAPH A – Age distribution (histogram)
fig_age = px.histogram(
    df,
    x="Age",
    nbins=20,
    color_discrete_sequence=[COL_OK],
    template=TEMPLATE,
    title="<b>Distribuzione dell'età: 75 % tra 18 e 30 anni</b>",
    labels={"Age": "Età (anni)"},
)
# px.histogram generates the "count" axis title itself and does not look it up
# in `labels`, so a {"count": ...} entry has no effect; the y-axis title is set
# explicitly here so the Italian label actually appears on the chart.
fig_age.update_layout(bargap=0.05, yaxis_title_text="Frequenza")
bump_fonts(fig_age)
fig_age.show()

### Graph B – CGPA distribution

#### Grades cluster tightly around a median CGPA of about 7 (on a 0–10 scale); extreme low or high scores are rare.

In [3]:

# GRAPH B – CGPA distribution (box plot)
# `points` accepts "outliers", "all", "suspectedoutliers" or False.
fig_cgpa = px.box(
    df,
    y="CGPA",
    points="outliers",
    notched=False,
    labels={"CGPA": "CGPA"},
    title="<b>Distribuzione CGPA: mediana ≈ 7</b>",
    template=TEMPLATE,
    color_discrete_sequence=[COL_OK],
).update_traces(boxmean=True)

# bump_fonts returns the figure it was given, so it can be shown directly.
bump_fonts(fig_cgpa).show()

### Graph C – Work/Study hours by degree

#### Across the seven most common degrees, students dedicate a similar amount of time to study/work (≈ 8 hours per day)—no programme stands out for heavier or lighter workload.

In [4]:
# GRAPH C – Work/Study hours by degree (top 7)
# Normalise the column names (strip surrounding whitespace) before selecting by name.
df.columns = df.columns.str.strip()

# The seven most frequent degrees, most common first, and the rows belonging to them.
top_deg = df["Degree"].value_counts().index[:7].tolist()
sub = df.loc[df["Degree"].isin(top_deg)]

fig_degree_hours = px.box(
    sub,
    x="Degree",
    y="Work/Study Hours",
    points="outliers",
    category_orders={"Degree": top_deg},  # keep the frequency order on the x-axis
    labels={"Work/Study Hours": "Ore di studio/lavoro (h/giorno)"},
    title="<b>Ore di studio/lavoro simili tra i corsi più diffusi</b>",
    template=TEMPLATE,
    color_discrete_sequence=[COL_RISK],
).update_traces(boxmean=True)

bump_fonts(fig_degree_hours)
fig_degree_hours.update_layout(xaxis_tickangle=-35)
fig_degree_hours.show()

### Graph D – Study satisfaction by degree

#### Regardless of degree, average study satisfaction hovers around 3 out of 5, indicating moderate satisfaction with little variation between programmes.


In [5]:
# GRAPH D – Average study satisfaction by degree
# Relies on `sub` (rows of the most common degrees) built in the Graph C cell.
mean_sat = (
    sub.groupby("Degree")["Study Satisfaction"]
    .mean()
    .round(2)
    .reset_index()
    .sort_values("Degree")
)

fig_sat = px.bar(
    mean_sat,
    x="Degree",
    y="Study Satisfaction",
    text_auto=".2f",
    labels={"Study Satisfaction": "Soddisfazione media (1‒5)"},
    title="<b>Soddisfazione media ≈ 3/5 per tutti i corsi</b>",
    template=TEMPLATE,
    color_discrete_sequence=["#FFC107"],
)
# Fix the y-axis to 0–5 so the bars are read against that whole range.
fig_sat.update_layout(xaxis_tickangle=-35, yaxis_range=[0, 5])
bump_fonts(fig_sat).show()

### Graph E – Suicidal thoughts → depression

#### Students who report suicidal thoughts show about four times the prevalence of depression (≈ 79 % vs 21 %), highlighting a strong association between the two.


In [6]:
# GRAPH E – Suicidal thoughts vs depression (%)
suic_col = "Have you ever had suicidal thoughts ?"  # column name exactly as used in the dataset

# One row per (suicidal thoughts, depression) combination with its head count.
suic = df.groupby([suic_col, "Depression"]).size().reset_index(name="count")
suic["DepressionLabel"] = suic["Depression"].map({0: "No depression", 1: "Depressed"})
# Share of each depression outcome within its suicidal-thoughts group.
suic["percent"] = (
    suic["count"] / suic.groupby(suic_col)["count"].transform("sum") * 100
).round(1)

fig_suic = px.bar(
    suic,
    x=suic_col,
    y="percent",
    color="DepressionLabel",
    # Put "Depressed" at the bottom of each stack, matching the other stacked
    # prevalence charts (Graphs F and H), so the risk segment is read from the baseline.
    category_orders={"DepressionLabel": ["Depressed", "No depression"]},
    color_discrete_map={"No depression": COL_OK, "Depressed": COL_RISK},
    template=TEMPLATE,
    text="percent",
    barmode="stack",
    title="<b>Pensieri suicidi → 4× prevalenza di depressione</b>",
    labels={
        suic_col: "Pensieri suicidari",
        "percent": "Percentuale (%)",
    },
)
fig_suic.update_traces(texttemplate="%{text:.1f}%")
bump_fonts(fig_suic).update_layout(legend_title_text="", yaxis_range=[0, 100])
fig_suic.show()

### Graph F – Financial stress → depression

#### Depression prevalence climbs steadily with financial stress, rising from roughly 32 % at stress level 1 to over 70 % at level 5.

In [15]:
# GRAPH F – Financial stress vs depression (%)
# Head count for every (financial stress, depression) combination.
fin = df.groupby(["Financial Stress", "Depression"]).size().reset_index(name="count")
fin["DepressionLabel"] = fin["Depression"].map({0: "No depression", 1: "Depressed"})
# Share of each depression outcome within its financial-stress level.
fin["percent"] = (
    fin["count"] / fin.groupby("Financial Stress")["count"].transform("sum") * 100
).round(1)

fig_fin = px.bar(
    fin,
    x="Financial Stress",
    y="percent",
    color="DepressionLabel",
    text="percent",
    barmode="stack",
    # Force "Depressed" to be the first level (bottom of the stack) with
    # "No depression" stacked on top.
    category_orders={"DepressionLabel": ["Depressed", "No depression"]},
    color_discrete_map={"No depression": COL_OK, "Depressed": COL_RISK},
    labels={
        "Financial Stress": "Stress finanziario (1‒5)",
        "percent": "Percentuale (%)",
    },
    title="<b>Lo stress finanziario aumenta la possibilità di depressione</b>",
    template=TEMPLATE,
).update_traces(texttemplate="%{text:.1f}%")

bump_fonts(fig_fin)
fig_fin.update_layout(legend_title_text="", yaxis_range=[0, 100])
fig_fin.show()

### Graph G – Work/Study-hours vs academic pressure

#### Higher academic pressure is associated with longer study hours


In [25]:
# GRAPH G – Work/Study hours by academic pressure (box plot)
# Only rows whose academic pressure lies in the 1–5 range (inclusive) are plotted.
fig_hours_box = px.box(
    df.loc[lambda frame: frame["Academic Pressure"].between(1, 5)],
    x="Academic Pressure",
    y="Work/Study Hours",
    points="outliers",
    category_orders={"Academic Pressure": [1, 2, 3, 4, 5]},
    labels={"Academic Pressure": "Pressure (1‒5)", "Work/Study Hours": "Hours per day"},
    title="<b>Higher academic pressure is associated with longer study hours</b>",
    template=TEMPLATE,
    color_discrete_sequence=[COL_RISK],
).update_traces(boxmean=True)

bump_fonts(fig_hours_box)
# Half a unit of padding on each side of the 1–5 range.
fig_hours_box.update_layout(xaxis_range=[0.5, 5.5])
fig_hours_box.show()

### Graph H – Sleep duration vs depression

#### A clear monotonic pattern: the less people sleep, the higher the prevalence of depression. Seven-to-eight-hour sleepers show the lowest rate (< 30 %), while the “< 5 hours” group exceeds 70 %.


In [None]:
import pandas as pd
import plotly.express as px

# Raw "Sleep Duration" value (stored with literal single quotes) → display label.
# Rows whose sleep duration is not one of these keys are excluded by the filter below.
mapping = {
    "'Less than 5 hours'": "Less than 5 hours",
    "'5-6 hours'": "5-6 hours",
    "'7-8 hours'": "7-8 Hours",
}
df_filtered = (
    df.loc[df["Sleep Duration"].isin(list(mapping))]
    .assign(SleepHoursCat=lambda frame: frame["Sleep Duration"].map(mapping))
)

# Head count per (sleep category, depression), then the share within each category.
sleep = df_filtered.groupby(["SleepHoursCat", "Depression"]).size().reset_index(name="count")
sleep["DepressionLabel"] = sleep["Depression"].map({0: "No", 1: "Yes"})
sleep["percent"] = (
    sleep["count"] / sleep.groupby("SleepHoursCat")["count"].transform("sum") * 100
).round(1)

fig_sleep_dep = px.bar(
    sleep,
    x="SleepHoursCat",
    y="percent",
    color="DepressionLabel",
    text="percent",
    barmode="stack",
    category_orders={
        "SleepHoursCat": ["Less than 5 hours", "5-6 hours", "7-8 Hours"],
        "DepressionLabel": ["Yes", "No"],  # "Yes" (depressed) at the bottom, "No" on top
    },
    color_discrete_map={"No": COL_OK, "Yes": COL_RISK},
    labels={
        "SleepHoursCat": "Sleep duration",
        "percent": "Percent (%)",
    },
    title="<b>Less sleep → higher depression prevalence</b>",
    template=TEMPLATE,
).update_traces(texttemplate="%{text:.1f}%")

bump_fonts(fig_sleep_dep)
fig_sleep_dep.update_layout(legend_title_text="", yaxis_range=[0, 100])
fig_sleep_dep.show()


### Graph I – Depression prevalence by gender

#### Depression prevalence is similar across genders

In [10]:
#  GRAPH I – Depression prevalence by gender (%)
# Head count for every (gender, depression) combination.
gender_tab = df.groupby(["Gender", "Depression"]).size().reset_index(name="count")
gender_tab["DepressionLabel"] = gender_tab["Depression"].map({0: "No", 1: "Yes"})
# Share of each depression outcome within its gender group.
gender_tab["percent"] = (
    gender_tab["count"] / gender_tab.groupby("Gender")["count"].transform("sum") * 100
).round(1)

fig_gender_dep = px.bar(
    gender_tab,
    x="Gender",
    y="percent",
    color="DepressionLabel",
    # "Yes" (depressed) at the bottom of each stack, as in the other stacked
    # prevalence charts (Graphs F and H), so the charts can be compared at a glance.
    category_orders={"DepressionLabel": ["Yes", "No"]},
    color_discrete_map={"No": COL_OK, "Yes": COL_RISK},
    template=TEMPLATE,
    barmode="stack",
    text="percent",
    title="<b>Depression prevalence is similar across genders</b>",
    labels={"percent": "Percent (%)"},
)
fig_gender_dep.update_traces(texttemplate="%{text:.1f}%")
bump_fonts(fig_gender_dep).update_layout(legend_title_text="", yaxis_range=[0, 100])
fig_gender_dep.show()

### Graph J – CGPA vs academic pressure

#### Higher academic pressure slightly lowers median CGPA


In [11]:
#  GRAPH J – CGPA vs academic pressure (box plot)
# NOTE(review): the markdown heading for this graph says pressure "slightly
# lowers" median CGPA, while the chart title says it has no effect — reconcile
# the two wordings once the plot has been inspected.
pressure_box = px.box(
    # Only rows whose academic pressure lies in the 1–5 range (inclusive).
    df[df["Academic Pressure"].between(1, 5)],
    x="Academic Pressure",
    y="CGPA",
    points="outliers",
    template=TEMPLATE,
    color_discrete_sequence=[COL_RISK],
    # Title typo fixed: "effect" → "affect".
    title="<b>Higher academic pressure does not affect median CGPA</b>",
    labels={"Academic Pressure": "Pressure (1-5)", "CGPA": "CGPA"},
)
pressure_box.update_traces(boxmean=True)
bump_fonts(pressure_box).update_layout(xaxis_range=[0.5, 5.5], yaxis_range=[3.9, 11.2])
pressure_box.show()


### Graph K – Depression vs city size

#### Depression is more common in larger cities

In [12]:
#  GRAPH K – Depression vs city size (%)

# Part 1: percentage of students with depression in each city.
city_depression_rate = df.groupby("City")["Depression"].mean().reset_index()
city_depression_rate["Depression"] = city_depression_rate["Depression"] * 100  # as a percentage

fig = px.bar(
    city_depression_rate.sort_values("Depression", ascending=False),
    x="City",
    y="Depression",
    template=TEMPLATE,
    color_discrete_sequence=[COL_RISK],
    title="<b>Depression Rate per City (%)</b>",
    labels={"Depression": "Depression Rate (%)"},
)
# Smaller base size than the other charts: the x-axis carries one tick per city.
bump_fonts(fig, base=14).update_layout(xaxis_tickangle=-45)
fig.show()


# Part 2: the same rate aggregated by city-size class.
#   Piccola (small):  < 3 million inhabitants
#   Media   (medium): 3–5 million
#   Grande  (large):  > 5 million

# Maps each city name to its size class.
city_classification = {
    "Delhi": "Grande",
    "Mumbai": "Grande",
    "Kolkata": "Grande",
    "Bangalore": "Grande",
    "Chennai": "Grande",
    "Hyderabad": "Grande",
    "Ahmedabad": "Grande",
    "Surat": "Grande",
    "Pune": "Grande",
    "Jaipur (Jalpur)": "Media",
    "Lucknow": "Media",
    "Indore": "Media",
    "Kanpur": "Media",
    "Nagpur": "Piccola",
    "Patna": "Piccola",
    "Thane": "Piccola",
    "Bnopar": "Piccola",
    "Visakhapatnam": "Piccola",
    "Vadodara": "Piccola",
    "Rajkot": "Piccola",
    "Faridabad": "Piccola",
    "Ludhiana": "Piccola",
    "Meerut": "Piccola",
    "Kalyan": "Piccola",
    "Vasai-Virar": "Piccola",
    "Nasnik": "Piccola",
    "Jalpur": "Piccola",
    "Varanas": "Piccola",
    "Ghaziabad": "Piccola",
    "Harsha": "Piccola",
    # NOTE(review): several keys above look like misspellings ("Jaipur (Jalpur)",
    # "Bnopar", "Nasnik", "Varanas"). The presumably intended spellings are added
    # with the same class so those rows are not lost if the dataset uses them —
    # verify against df["City"].unique().
    "Jaipur": "Media",
    "Bhopal": "Piccola",
    "Nashik": "Piccola",
    "Varanasi": "Piccola",
}

# Add the 'CityClass' column.
df["CityClass"] = df["City"].map(city_classification)

# Cities missing from the mapping get NaN and are dropped by the groupby
# below; list them so the omission is visible instead of silent.
unmapped_cities = sorted(
    df.loc[df["CityClass"].isna(), "City"].dropna().astype(str).unique()
)
if unmapped_cities:
    print(f"Cities without a size class (excluded from the chart): {unmapped_cities}")

# Head counts per (CityClass, Depression), one row per class and one column per outcome.
city_tab = df.groupby(["CityClass", "Depression"]).size().unstack(fill_value=0)
# Row-wise percentages, reshaped to long form for plotting.
city_pct = (
    city_tab.div(city_tab.sum(axis=1), axis=0)
    .mul(100)
    .reset_index()
    .melt(id_vars="CityClass", var_name="Depression", value_name="percent")
)
city_pct["DepLabel"] = city_pct["Depression"].map({0: "No", 1: "Yes"})

fig_city = px.bar(
    city_pct,
    x="CityClass",
    y="percent",
    color="DepLabel",
    barmode="stack",
    category_orders={
        # Small → large so the x-axis follows city size rather than alphabetical order.
        "CityClass": ["Piccola", "Media", "Grande"],
        # "Yes" (depressed) at the bottom of the stack, as in Graphs F and H.
        "DepLabel": ["Yes", "No"],
    },
    color_discrete_map={"No": COL_OK, "Yes": COL_RISK},
    template=TEMPLATE,
    text="percent",
    labels={"CityClass": "City size", "percent": "Percent (%)"},
    # Title typo fixed: "slighlty" → "slightly".
    title="<b>Depression is slightly more common in larger cities</b>",
)
fig_city.update_traces(texttemplate="%{text:.1f}%")
bump_fonts(fig_city).update_layout(legend_title_text="", yaxis_range=[0, 100])
fig_city.show()

### Graph L – Gender composition
#### Roughly three-fifths of respondents are male, two-fifths female

In [None]:
# GRAPH L – Gender composition (pie chart)
# Respondents per gender, most frequent first; columns renamed explicitly so the
# names do not depend on what value_counts().reset_index() calls them.
gender_counts = df["Gender"].value_counts().reset_index()
gender_counts.columns = ["Gender", "count"]

fig_gender_pie = px.pie(
    gender_counts,
    names="Gender",
    values="count",
    template=TEMPLATE,
    color_discrete_sequence=[COL_OK, COL_RISK],
    # Title typo fixed: "as" → "has".
    title="<b>Dataset has more males</b>",
)
bump_fonts(fig_gender_pie)
# Show each slice's share, with one decimal, inside the slice.
fig_gender_pie.update_traces(textposition="inside", texttemplate="%{percent:.1%}")
fig_gender_pie.show()
