# **Categorizing Dataset** using Symptom Weights

In [3]:
import pandas as pd
import numpy as np

# 1. Load Data
# The weights file is read without a header row, so its two columns are
# named explicitly here.
df = pd.read_csv("datasets/dataset.csv")
df_weights = pd.read_csv(
    "datasets/Symptom_Weights.csv", header=None, names=["Symptom", "Weight"]
)

# 2. Clean & Map Weights
# Strip stray whitespace so symptom names can be matched against dataset cells.
df_weights["Symptom"] = df_weights["Symptom"].str.strip()
weight_map = dict(zip(df_weights["Symptom"], df_weights["Weight"]))

# Clean dataset whitespace.
# Only text columns expose the `.str` accessor; a numeric or entirely empty
# column (parsed as float64) raises AttributeError on it. Such columns hold
# no whitespace to strip, so they are skipped instead of aborting the run.
for col in df.columns:
    try:
        df[col] = df[col].str.strip()
    except AttributeError:
        continue


# 3. Calculate Severity Scores
def _severity_score(symptoms):
    """Return the summed weight of the non-missing symptoms of one row.

    Each symptom is stripped and looked up in ``weight_map``; symptoms that
    are not present in the map contribute 0.
    """
    return sum(
        weight_map.get(str(symptom).strip(), 0)
        for symptom in symptoms
        if pd.notna(symptom)
    )


# Every column after the first is treated as a symptom column. Iterating
# plain tuples avoids building a Series per row as iterrows() does.
severity_scores = [
    _severity_score(symptoms)
    for symptoms in df.iloc[:, 1:].itertuples(index=False, name=None)
]

# 4. Determine Thresholds (33% Mild, 33% Moderate, 33% Severe)
# The split is approximate: rows sharing a score always land in the same
# category, so tied scores near a threshold can make the groups uneven.
t1 = np.percentile(severity_scores, 33)
t2 = np.percentile(severity_scores, 66)

def categorize(score):
    """Return the severity label for a score.

    Scores up to and including ``t1`` are "Mild", scores up to and including
    ``t2`` are "Moderate", and anything else is "Severe".
    """
    # Walk the upper bounds in ascending order; the first one that holds wins.
    for upper_bound, label in ((t1, "Mild"), (t2, "Moderate")):
        if score <= upper_bound:
            return label
    return "Severe"

# 5. Create DataFrame Column
# One label per row, in the same order as the computed scores.
df["Severity"] = list(map(categorize, severity_scores))

# 6. Reorder Columns to put Severity Second
# Keep the first column where it is, place Severity right after it, and let
# the remaining columns follow in their existing order.
other_columns = [name for name in df.columns if name != "Severity"]
cols = other_columns[:1] + ["Severity"] + other_columns[1:]
df = df[cols]

# 7. Save to CSV
# index=False keeps the row index out of the written file.
df.to_csv("datasets/categorized_dataset.csv", index=False)
print("File saved successfully.")

File saved successfully.
