In [12]:
import pandas as pd
import random
from itertools import combinations
import csv
import math
import time
# Seed the module-level random generator from the current wall-clock time,
# so that every run of the notebook draws a different augmentation sequence.
random.seed(time.time())


class MedicalDataAugmentation:
    """Generate synthetic first-person symptom sentences for each disease.

    Each disease comes with a comma-separated list of symptoms. Subsets of
    those symptoms are combined with a random sentence template and a random
    connecting word to produce natural-sounding Vietnamese sentences, which
    are then written to a CSV file.
    """

    def __init__(self):
        # Connecting words placed between the first symptom and the rest.
        self.connection_words = ["và", "cùng với", "kèm theo", "đồng thời", "ngoài ra còn"]

        # Sentence templates; "{symptoms}" is replaced by the symptom phrase.
        self.sentence_patterns = [
            "tôi bị {symptoms}",
            "tôi có triệu chứng {symptoms}",
            "tôi đang gặp phải {symptoms}",
            "tôi cảm thấy {symptoms}",
            "tôi xuất hiện {symptoms}",
            "tôi có dấu hiệu {symptoms}",
            "tôi gặp phải {symptoms}",
            "tôi đang có {symptoms}",
            "tôi phát hiện {symptoms}",
        ]

    def load_data(self, file_path):
        """Read the disease table from an Excel file.

        Returns the loaded DataFrame, or None when reading fails (the error
        is printed rather than raised so that callers can test for None).
        """
        try:
            df = pd.read_excel(file_path)
        except Exception as e:
            print(f"❌ Lỗi khi đọc file: {e}")
            return None
        print(f"✅ Đã đọc {len(df)} bệnh từ file: {file_path}")
        return df

    def parse_symptoms(self, symptoms_text):
        """Split a comma-separated symptom string into a list of symptoms.

        Surrounding whitespace is stripped and empty entries are dropped.
        A missing value (None, or the float NaN pandas uses for an empty
        cell) yields an empty list instead of raising.
        """
        if symptoms_text is None:
            return []
        if isinstance(symptoms_text, float) and math.isnan(symptoms_text):
            return []

        stripped = (part.strip() for part in str(symptoms_text).split(','))
        return [part for part in stripped if part]

    def shuffle_symptoms(self, symptom_tuple):
        """Return a new list holding the given symptoms in random order."""
        symptoms = list(symptom_tuple)
        random.shuffle(symptoms)
        return symptoms

    def create_symptom_combinations(self, symptoms, ratio=2/3):
        """Build every combination containing ``ratio`` of the symptoms.

        The combination size is ``ceil(len(symptoms) * ratio)``, but at
        least 1. Returns a list of tuples; an empty input gives an empty
        list, and when the size covers all symptoms the result is a single
        tuple with every symptom.

        Note that all combinations are materialised, so the result grows
        combinatorially with the number of symptoms.
        """
        if not symptoms:
            return []

        total_symptoms = len(symptoms)
        num_to_select = max(1, math.ceil(total_symptoms * ratio))

        # Nothing to choose from: the only combination is the full set.
        if total_symptoms <= num_to_select:
            return [tuple(symptoms)]

        return list(combinations(symptoms, num_to_select))

    def create_natural_sentence(self, symptoms_list):
        """Turn a list of symptoms into one sentence.

        A random template is chosen. With several symptoms, a random
        connecting word follows the first symptom and the remaining ones are
        joined with commas. An empty list gives an empty string.
        """
        if not symptoms_list:
            return ""

        pattern = random.choice(self.sentence_patterns)

        if len(symptoms_list) == 1:
            symptoms_text = symptoms_list[0]
        else:
            connector = random.choice(self.connection_words)
            remaining = ', '.join(symptoms_list[1:])
            symptoms_text = f"{symptoms_list[0]} {connector} {remaining}"

        return pattern.format(symptoms=symptoms_text)

    def augment_disease_data(self, disease_name, symptoms_text, target_samples=20):
        """Create ``target_samples`` augmented rows for a single disease.

        Symptom combinations are drawn uniformly at random without
        repetition; once every combination has been used, the pool is
        refilled and drawing continues. Each row is a dict with the keys
        'disease', 'symptoms' (the generated sentence) and
        'original_symptoms' (the comma-joined symptoms used).

        A disease without any symptoms yields an empty list, since no
        meaningful sentence can be built for it.
        """
        symptoms = self.parse_symptoms(symptoms_text)
        print(f"🦠 Bệnh: {disease_name} - {len(symptoms)} triệu chứng")

        if not symptoms:
            return []

        combinations_list = self.create_symptom_combinations(
            self.shuffle_symptoms(symptoms), ratio=1/4
        )

        samples = []
        pool = []
        for _ in range(target_samples):
            if not pool:
                # Start (or restart) a pass over all combinations in random
                # order; popping from the shuffled copy is a uniform draw
                # without replacement.
                pool = list(combinations_list)
                random.shuffle(pool)

            selected = pool.pop()
            samples.append({
                'disease': disease_name,
                'symptoms': self.create_natural_sentence(list(selected)),
                'original_symptoms': ', '.join(selected),
            })

        return samples

    def process_all_diseases(self, df, output_file='/content/drive/MyDrive/Colab Notebooks/NLP/health_api/data_processed/' + 'augmented_medical_data.csv'):
        """Augment every disease in ``df`` and save the result as CSV.

        The first column of ``df`` is read as the disease name and the
        second as its comma-separated symptoms. All generated rows are
        written to ``output_file`` (UTF-8, with a header) and also returned
        as a list of dicts.
        """
        all_samples = []

        for _, row in df.iterrows():
            disease_name = row.iloc[0]
            symptoms_text = row.iloc[1]

            print(f"\n🔄 Đang xử lý: {disease_name}")
            samples = self.augment_disease_data(disease_name, symptoms_text, target_samples=15)
            all_samples.extend(samples)
            print(f"✅ Đã tạo {len(samples)} câu cho {disease_name}")

        # Write the CSV file
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['disease', 'symptoms', 'original_symptoms'])
            writer.writeheader()
            writer.writerows(all_samples)

        print(f"\n📁 Đã lưu {len(all_samples)} câu vào file: {output_file}")
        return all_samples


In [None]:
from collections import Counter

processor = MedicalDataAugmentation()

# Load the disease/symptom table from the Excel file
df = processor.load_data('data/trieu_chung_theo_benh.xlsx')

if df is not None:
    # Generate the augmented sentences (also written to CSV by the processor)
    samples = processor.process_all_diseases(df)

    # Show how many samples were produced per disease
    print("\n=== THỐNG KÊ DỮ LIỆU ===")
    # Counter keeps first-seen order, so diseases print in processing order.
    disease_counts = Counter(sample['disease'] for sample in samples)

    for disease, count in disease_counts.items():
        print(f"{disease}: {count} samples")

    print(f"\nTổng cộng: {len(samples)} samples")

    # Show the first few generated samples
    print("\n=== MỘT SỐ MẪU DỮ LIỆU ===")
    for i, sample in enumerate(samples[:5], start=1):
        print(f"Sample {i}:")
        print(f"  Bệnh: {sample['disease']}")
        print(f"  Triệu chứng: {sample['symptoms']}")
        print(f"  Triệu chứng gốc: {sample['original_symptoms']}")
        print()

✅ Đã đọc 16 bệnh từ file: /content/drive/MyDrive/Colab Notebooks/NLP/health_api/data/trieu_chung_theo_benh.xlsx

🔄 Đang xử lý: tiêu chảy
🦠 Bệnh: tiêu chảy - 16 triệu chứng
✅ Đã tạo 15 câu cho tiêu chảy

🔄 Đang xử lý: táo bón
🦠 Bệnh: táo bón - 9 triệu chứng
✅ Đã tạo 15 câu cho táo bón

🔄 Đang xử lý: viêm mũi họng
🦠 Bệnh: viêm mũi họng - 13 triệu chứng
✅ Đã tạo 15 câu cho viêm mũi họng

🔄 Đang xử lý: khô mắt
🦠 Bệnh: khô mắt - 6 triệu chứng
✅ Đã tạo 15 câu cho khô mắt

🔄 Đang xử lý: hạ canxi máu
🦠 Bệnh: hạ canxi máu - 8 triệu chứng
✅ Đã tạo 15 câu cho hạ canxi máu

🔄 Đang xử lý: sốt phát ban
🦠 Bệnh: sốt phát ban - 11 triệu chứng
✅ Đã tạo 15 câu cho sốt phát ban

🔄 Đang xử lý: viêm xương
🦠 Bệnh: viêm xương - 17 triệu chứng
✅ Đã tạo 15 câu cho viêm xương

🔄