In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import (LogisticRegression, SGDClassifier)
from sklearn.ensemble import (RandomForestClassifier,  GradientBoostingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (classification_report, roc_auc_score, jaccard_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, MultiLabelBinarizer)
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.base import clone

In [None]:
# Read the full VAERS export, then keep a working copy of its first 100,000 rows.
df1 = pd.read_csv(
    '/kaggle/input/vaers1fill/vaers1fill.csv',
    encoding="ISO-8859-1",
)
df = df1.iloc[:100000].copy()
df.columns

In [None]:
# Pool every symptom reported in SYMPTOM1-5 into one flat list.
all_symptoms = []
for col in ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']:
    # Lower-case the names so differently-cased spellings are counted together.
    all_symptoms.extend(df[col].dropna().str.lower().tolist())

# Count symptom frequencies. Counter is already imported in the notebook's
# import cell, so it is not re-imported here.
symptom_counts = Counter(all_symptoms)

# Compute the top 20 once and reuse it for both the printout and the chart.
top_symptoms = symptom_counts.most_common(20)

print("Top 20 triệu chứng phổ biến:")
for symptom, count in top_symptoms:
    print(f"{symptom}: {count} reports")

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))
pd.Series(dict(top_symptoms)).plot(kind='barh', ax=ax)
ax.set_title('Top 20 Symptoms Frequency')
plt.show()

In [None]:
# Number of rows in the full, untruncated dataset.
print(df1.shape[0])

In [None]:
# Start again from a fresh copy of the first 100,000 reports.
df = df1.iloc[:100000].copy()

symptom_cols = [f'SYMPTOM{k}' for k in range(1, 6)]
outcome_cols = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT']

# A report is labelled SEVERE (1) when any outcome column equals 'Y'.
is_severe = (df[outcome_cols] == 'Y').any(axis=1)
df['SEVERE'] = is_severe.astype(int)

# Keep only the reports that list at least one symptom.
has_symptom = df[symptom_cols].notna().any(axis=1)
df_symptoms = df.loc[has_symptom, symptom_cols + ['SEVERE']].copy()

def combine_symptoms(row):
    """Return the non-null symptom values of one report, in column order.

    `row` is indexed with each name in the module-level `symptom_cols`;
    entries for which `pd.notnull` is false are left out.
    """
    reported = []
    for col in symptom_cols:
        value = row[col]
        if pd.notnull(value):
            reported.append(value)
    return reported

# Collect each report's non-null symptoms into a single list per row.
df_symptoms['SYMPTOM_LIST'] = df_symptoms.apply(combine_symptoms, axis=1)

# One binary indicator column per distinct symptom; SEVERE is the target.
mlb = MultiLabelBinarizer()
symptom_lists = df_symptoms['SYMPTOM_LIST']
X = mlb.fit_transform(symptom_lists)
y = df_symptoms['SEVERE'].to_numpy()

# Standardise every indicator column (zero mean, unit variance).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Size and share of the positive (severe) class in y.
severe_mask = (y == 1)
num_label_1 = severe_mask.sum()
percent_label_1 = severe_mask.mean() * 100

print(f"Số lượng nhãn 1 (nghiêm trọng): {num_label_1}")
print(f"Tỷ lệ nhãn 1 (nghiêm trọng): {percent_label_1:.2f}%")

In [None]:

# Split the *unscaled* indicator matrix first, then learn the scaling
# statistics from the training rows only. Splitting the already-standardised
# X_scaled (fit on every row) would let test-row means and variances leak into
# the training features. The row partition is the same as before because it
# depends only on the number of rows, random_state and the stratify labels.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# The raw splits are no longer needed; release the memory they hold.
del X_train_raw, X_test_raw

## Part 1a  
Dự đoán triệu chứng có dẫn đến tình trạng nghiêm trọng hay không, và lấy ra 20 triệu chứng quan trọng nhất ảnh hưởng đến việc dự đoán

In [None]:
# Classifiers compared on the severe / non-severe prediction task.
models = {
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=300, random_state=42),
    'Support Vector Machine' : SGDClassifier(loss='hinge', class_weight='balanced', random_state=42),
    'MLP (2 layers)': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=250, random_state=42)
}


def _linear_svm_importance(fitted):
    """Absolute coefficients of the linear SVM, or None when it exposes no coef_."""
    if hasattr(fitted, 'coef_'):
        return np.abs(fitted.coef_).ravel()
    return None


# model name -> (label used in the printout, function returning one importance
# value per input feature for the fitted model)
importance_readers = {
    'Random Forest': ('Random Forest', lambda fitted: fitted.feature_importances_),
    'Logistic Regression': ('Logistic Regression', lambda fitted: np.abs(fitted.coef_).ravel()),
    'Support Vector Machine': ('SVM', _linear_svm_importance),
    # Sum of absolute first-layer weights attached to each input feature.
    'MLP (2 layers)': ('MLP', lambda fitted: np.sum(np.abs(fitted.coefs_[0]), axis=1)),
}

# Indices of the 20 highest-ranked features, keyed by model name.
feature_importance_dict = {}
section_break = "\n" + "-"*50 + "\n"

for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Use probabilities when available, otherwise the raw decision score for AUC.
    y_pred_proba = (
        model.predict_proba(X_test)[:, 1]
        if hasattr(model, "predict_proba")
        else model.decision_function(X_test)
    )
    print(f"=== {name} ===")
    print(classification_report(y_test, y_pred, digits=4))

    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC-ROC Score: {auc:.4f}")

    # Rank the features and keep the 20 with the largest importance.
    reader = importance_readers.get(name)
    if reader is not None:
        label, read_importance = reader
        importances = read_importance(model)
        if importances is not None:
            top_idx = np.argsort(importances)[::-1][:20]
            feature_importance_dict[name] = top_idx
            print(f"Top 20 Features ({label}):", top_idx)

    print(section_break)

# Pool the per-model top-20 indices and keep the 20 that occur most often.
all_importances = [
    feature_idx
    for ranked in feature_importance_dict.values()
    for feature_idx in ranked
]

feature_counts = Counter(all_importances)

top_20_features = [feature_idx for feature_idx, _ in feature_counts.most_common(20)]

print("Top 20 Features (Total from All Models):", top_20_features)


In [None]:
# Symptom name of each feature column, in the MultiLabelBinarizer's order.
symptom_names = mlb.classes_

# Translate every model's top feature indices into symptom names.
feature_importance_dict_with_names = {
    model_name: [symptom_names[idx] for idx in feature_indices]
    for model_name, feature_indices in feature_importance_dict.items()
}

section_break = "\n" + "-"*50 + "\n"
for model_name, features in feature_importance_dict_with_names.items():
    print(f"Top 20 Features ({model_name}):")
    for rank, feature in enumerate(features, start=1):
        print(f"{rank}. {feature}")
    print(section_break)

# Pool the names from all models, preserving model order and rank order.
all_importances_with_names = [
    symptom
    for features in feature_importance_dict_with_names.values()
    for symptom in features
]

feature_counts_with_names = Counter(all_importances_with_names)

# Keep the 20 symptoms that appear in the most per-model top lists.
top_20_features_with_names = [
    symptom for symptom, _ in feature_counts_with_names.most_common(20)
]

print("Top 20 Most Important Symptoms (Across All Models):")
for rank, feature in enumerate(top_20_features_with_names, start=1):
    print(f"{rank}. {feature}")

## Part 1b
Sử dụng các thuật toán giảm chiều dữ liệu để so sánh độ chính xác khi dự đoán tình trạng nghiêm trọng

In [None]:
def evaluate_models(models, X_tr, X_te, y_tr, y_te):
    """Fit each model on (X_tr, y_tr) and print its test-set metrics.

    For every entry of `models` (name -> estimator) this prints the
    classification report and the AUC-ROC on (X_te, y_te). The AUC uses the
    positive-class probability when the estimator offers `predict_proba`,
    otherwise its `decision_function` score. Estimators are fitted in place.
    """
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_te)
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_te)[:, 1]
        else:
            y_score = model.decision_function(X_te)
        print(f"=== {name} ===")
        print(classification_report(y_te, y_pred, digits=4))
        print(f"AUC-ROC Score: {roc_auc_score(y_te, y_score):.4f}")
        print("\n" + "-"*50 + "\n")


print(f"Số chiều ban đầu của dữ liệu: {X_train.shape[1]}")

# PCA - Principal Component Analysis
# random_state keeps the projection reproducible when a randomized SVD solver
# is selected, matching the fixed seeds used for every other estimator here.
pca = PCA(n_components=5, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Số chiều sau PCA: {pca.n_components_}")


# LDA - Linear Discriminant Analysis
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

print(f"Số chiều sau LDA: {X_train_lda.shape[1]}")


models = {
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=300, random_state=42),
    'Support Vector Machine' : SGDClassifier(loss='hinge', class_weight='balanced', random_state=42),
    'MLP (2 layers)': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=250, random_state=42)
}


print("=== SO SÁNH CÁC MÔ HÌNH VỚI GIẢM CHIỀU DỮ LIỆU ===\n")

# The headers report the actual number of components, so they cannot drift
# from the reducers configured above (the LDA header previously said 2 while
# the projection has a single component).
print(f"== Sử dụng PCA ({pca.n_components_} thành phần) ==")
evaluate_models(models, X_train_pca, X_test_pca, y_train, y_test)

print(f"== Sử dụng LDA ({X_train_lda.shape[1]} thành phần) ==")
evaluate_models(models, X_train_lda, X_test_lda, y_train, y_test)


# Part 2

In [None]:
# Part 2 works on a fresh copy of the first 100,000 reports.
df = df1.head(100000).copy()

# Missing values in these two positional column ranges are replaced with 0.
# NOTE(review): the positions 52:57 and 65:79 depend on the CSV's column
# order — confirm they still select the intended columns.
df.iloc[:, 52:57] = df.iloc[:, 52:57].fillna(0)
df.iloc[:, 65:79] = df.iloc[:, 65:79].fillna(0)

symptom_cols = ['SYMPTOM1','SYMPTOM2','SYMPTOM3','SYMPTOM4','SYMPTOM5']
# Row filtering below does not change the columns, so the positional slice
# can be resolved to names up front.
history_cols = df.columns[65:79].tolist()
numeric_raw = ['AGE_YRS', 'othermeds', 'allergies', 'disable', 'curr_ill']
covariates = ["AGE_YRS", "SEX", "VAX_MANU", "othermeds", "allergies", "disable"] + history_cols

# Drop reports with unknown sex or manufacturer, patients aged 11 or younger
# (or with no age), and rows missing any covariate. The result is
# materialised with .copy() so the column assignment below writes to an
# independent frame rather than to a slice of the previous one (the
# chained-assignment / SettingWithCopy pattern).
keep_rows = ((df['SEX'] != 'U') &
             (df['VAX_MANU'] != 'UNKNOWN MANUFACTURER') &
             (df['AGE_YRS'] > 11))
df = df[keep_rows].dropna(subset=covariates).copy()

# NOTE(review): the scaler is fit on all remaining rows, before the
# train/test split performed later in this cell, so test-row statistics
# influence the scaled features. Consider fitting on the training rows only.
scaler = StandardScaler()
df[numeric_raw] = scaler.fit_transform(df[numeric_raw])

# Targets are the symptoms selected as most important in Part 1a.
top_20_symptoms = top_20_features_with_names

def extract_symptoms(row):
    """Return the report's symptoms that belong to `top_20_symptoms`.

    The values of the `symptom_cols` entries of `row` are converted to
    strings and stripped of surrounding whitespace; a stripped value is kept
    when it is non-empty and present in the module-level `top_20_symptoms`.
    """
    kept = []
    for raw in row[symptom_cols].astype(str).tolist():
        name = raw.strip()
        if name != '' and name in top_20_symptoms:
            kept.append(name)
    return kept

# Multi-label target: one indicator column per top-20 symptom, in that order.
mlb = MultiLabelBinarizer(classes=top_20_symptoms)
report_symptoms = df.apply(extract_symptoms, axis=1)
y = mlb.fit_transform(report_symptoms)

# Features: the covariates, with categorical columns one-hot encoded
# (first level dropped).
X = pd.get_dummies(df[covariates], drop_first=True)

# Hold out 20% of the reports for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Display the feature column names produced by the one-hot encoding.
X.columns

In [None]:
# Base learners of the per-symptom ensemble.
clf1 = LogisticRegression(max_iter=300, class_weight='balanced', random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
clf3 = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=250, random_state=42)
clf4 = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf5 = SGDClassifier(loss='hinge', class_weight='balanced', random_state=42)
clf6 = DecisionTreeClassifier(class_weight='balanced', random_state=42)

base_models = [clf1, clf2, clf3, clf4, clf5, clf6]
trained_models = []

# Each base learner is wrapped so that one binary classifier is fitted per symptom.
for base_clf in base_models:
    clf = MultiOutputClassifier(clone(base_clf))
    clf.fit(X_train, y_train)
    trained_models.append(clf)

# Custom vote that favours the positive class: a symptom is predicted as soon
# as at least MIN_POSITIVE_VOTES models score it at 0.5 or above. This is a
# count of thresholded per-model votes, not an average of probabilities.
MIN_POSITIVE_VOTES = 2
n_labels = y_test.shape[1]
y_pred_custom = np.zeros_like(y_test)

for i in range(n_labels):
    # Positive-class score of every model for the i-th symptom.
    probas = []
    for model in trained_models:
        estimator = model.estimators_[i]
        if hasattr(estimator, "predict_proba"):
            proba = estimator.predict_proba(X_test)[:, 1]
        elif hasattr(estimator, "decision_function"):
            scores = estimator.decision_function(X_test)
            # Logistic squashing written with tanh: equal to
            # 1 / (1 + exp(-scores)) but free of overflow for large |scores|.
            proba = 0.5 * (1.0 + np.tanh(0.5 * scores))
        else:
            raise ValueError(f"Model {estimator} doesn't support predict_proba or decision_function.")
        probas.append(proba)
    probas = np.array(probas)  # shape: (number of models, number of test rows)

    votes = (probas >= 0.5).sum(axis=0)
    y_pred_custom[:, i] = (votes >= MIN_POSITIVE_VOTES).astype(y_pred_custom.dtype)

# Name the rows after the symptoms and silence undefined-metric warnings,
# consistent with the per-symptom reports printed in the next cell.
print(classification_report(y_test, y_pred_custom,
                            target_names=top_20_symptoms, zero_division=0))


In [None]:
def ensemble_positive_score(models, label_idx, X, min_votes=2):
    """Continuous positive-class score of the voting ensemble for one label.

    The ensemble predicts 1 when at least `min_votes` member models score the
    sample at 0.5 or above, which is the same as saying that the
    `min_votes`-th largest member score is >= 0.5. That order statistic is
    therefore returned as the ensemble's score, so thresholding it at 0.5
    reproduces the vote.

    Parameters
    ----------
    models : sequence of fitted multi-output wrappers exposing `estimators_`.
    label_idx : index of the label (symptom) to score.
    X : feature matrix to score.
    min_votes : number of positive votes the ensemble requires (default 2,
        the rule used to build `y_pred_custom`).

    Returns
    -------
    1-D array with one score per row of `X`.
    """
    member_scores = []
    for fitted in models:
        estimator = fitted.estimators_[label_idx]
        if hasattr(estimator, "predict_proba"):
            member_scores.append(estimator.predict_proba(X)[:, 1])
        else:
            # Squash the margin into (0, 1); the tanh form cannot overflow.
            margin = estimator.decision_function(X)
            member_scores.append(0.5 * (1.0 + np.tanh(0.5 * margin)))
    # Sort the members' scores per sample (ascending) and take the
    # min_votes-th largest one.
    ranked = np.sort(np.vstack(member_scores), axis=0)
    return ranked[-min_votes]


print("\n[ENSEMBLE-CUSTOM] Đánh giá theo từng triệu chứng:")
for i, sym in enumerate(top_20_symptoms):
    print(f"\n>>> {sym}:")
    print(classification_report(y_test[:, i], y_pred_custom[:, i], zero_division=0))

# Jaccard score
print("\n[ENSEMBLE-CUSTOM] Jaccard Score (samples average):",
      jaccard_score(y_test, y_pred_custom, average='samples'))

# AUC-ROC of the ensemble itself. This was previously computed from the
# random forest alone (trained_models[1]) while being reported as the
# ensemble's AUC.
print("\n[ENSEMBLE-CUSTOM] AUC-ROC cho từng triệu chứng:")
for i, sym in enumerate(top_20_symptoms):
    # AUC is undefined when the test split has no positive example.
    if np.sum(y_test[:, i]) == 0:
        print(f"{sym}: Bỏ qua (không có nhãn dương trong test)")
        continue
    proba = ensemble_positive_score(trained_models, i, X_test)
    auc = roc_auc_score(y_test[:, i], proba)
    print(f"{sym}: AUC = {auc:.4f}")