<a href="https://colab.research.google.com/github/HaqTetsuya/rusdi-prototype-1/blob/main/test_fuzzy_string.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [6]:
# This approach uses a hybrid method: pattern matching + context validation

import re
from rapidfuzz import fuzz

class EntityExtractor:
    """Detect which library facilities a free-text message refers to.

    Detection runs in three stages:

    1. Word-bounded regex search for each entity's *primary* terms.
    2. If fewer than two entities were found that way, every remaining entity
       is scored from its context-term hits plus a fuzzy-match bonus, and is
       accepted when the score reaches the threshold.
    3. A few keyword-driven special cases force or suppress entities.

    Attributes:
        entities: Maps entity name -> {"primary": [...], "context": [...]}.
        patterns: Maps entity name -> {"primary": compiled regex with word
            boundaries, "all": compiled regex over primary + context terms
            WITHOUT word boundaries (so it also matches inside longer words)}.
    """

    # Points added per context/primary substring hit in stage 2.
    _CONTEXT_HIT_SCORE = 10
    # A fuzzy ratio must exceed this value to contribute to the score.
    _FUZZY_MIN_RATIO = 75
    # Minimum stage-2 score for an entity to be accepted.
    _SCORE_THRESHOLD = 15
    # Entities the "sholat"/"parkir" special cases treat as likely false
    # positives when they were only inferred from context.
    _SUPPRESSIBLE = frozenset({"ruang_baca", "ruang_diskusi", "kantin"})

    def __init__(self):
        # Core entities with their primary and alternative representations
        self.entities = {
            "wifi": {
                "primary": ["wifi", "wi-fi", "internet"],
                "context": ["koneksi", "hotspot", "jaringan", "online", "signal", "akses"]
            },
            "ruang_baca": {
                "primary": ["ruang baca", "reading room"],
                "context": ["baca", "membaca", "buku", "literatur", "perpustakaan", "literature"]
            },
            "ruang_diskusi": {
                "primary": ["ruang diskusi", "discussion room", "ruang meeting"],
                "context": ["diskusi", "meeting", "rapat", "diskusikan", "berdiskusi", "pertemuan"]
            },
            "komputer": {
                "primary": ["komputer", "pc", "komputer umum", "komputer publik"],
                "context": ["laptop", "komputer", "pc", "computer", "desktop", "browsing"]
            },
            "kantin": {
                "primary": ["kantin", "cafeteria", "cafetaria", "food court"],
                "context": ["makan", "makanan", "minum", "jajan", "lapar", "kuliner", "santap"]
            },
            "toilet": {
                "primary": ["toilet", "kamar mandi", "wc", "restroom"],
                "context": ["buang air", "kencing", "bak", "cuci tangan", "mandi"]
            },
            "mushola": {
                "primary": ["mushola", "mushalla", "tempat sholat", "ruang ibadah"],
                "context": ["sholat", "salat", "ibadah", "sembahyang", "beribadah", "solat"]
            },
            "parkir": {
                "primary": ["parkir", "parkiran", "tempat parkir", "area parkir"],
                "context": ["mobil", "motor", "kendaraan", "memarkir", "memarkirkan", "parkiran"]
            },
            "printer": {
                "primary": ["printer", "fotokopi", "fotocopy", "scan", "scanner"],
                "context": ["cetak", "print", "mencetak", "dokumen", "kertas", "copy", "printing"]
            }
        }

        # Compile the regexes once so extract_entities() only has to search.
        self.patterns = {}
        for entity, terms in self.entities.items():
            # Primary terms must appear as whole words/phrases.
            primary_pattern = '|'.join(
                r'\b' + re.escape(term) + r'\b' for term in terms["primary"]
            )

            # Primary + context terms, matched as plain substrings.
            all_terms = terms["primary"] + terms["context"]
            all_pattern = '|'.join(re.escape(term) for term in all_terms)

            self.patterns[entity] = {
                "primary": re.compile(primary_pattern, re.IGNORECASE),
                "all": re.compile(all_pattern, re.IGNORECASE),
            }

    def extract_entities(self, text):
        """Return the entity names detected in ``text``.

        Args:
            text: The user message; it is lower-cased before matching.

        Returns:
            A sorted list of entity names (keys of ``self.entities``), so the
            result is deterministic across runs.
        """
        text = text.lower()
        # Whitespace-split tokens; punctuation stays attached (e.g. "parkir?").
        words = text.split()

        # Step 1: direct, word-bounded matches on primary terms.
        explicit = {
            entity
            for entity, patterns in self.patterns.items()
            if patterns["primary"].search(text)
        }
        detected_entities = set(explicit)

        # Step 2: if not enough entities were found, score the others from
        # context hits plus a fuzzy bonus.
        # NOTE(review): the fuzzy bonus is only evaluated after at least one
        # context hit, and each primary term (even multi-word ones) is compared
        # against single tokens -- confirm this is the intended behaviour.
        if len(explicit) < 2:
            for entity, patterns in self.patterns.items():
                if entity in explicit:
                    continue

                context_matches = patterns["all"].findall(text)
                if not context_matches:
                    continue

                score = len(context_matches) * self._CONTEXT_HIT_SCORE

                # Bonus for tokens that look like a misspelled primary term.
                for primary_term in self.entities[entity]["primary"]:
                    max_ratio = max(
                        (fuzz.ratio(primary_term, word) for word in words),
                        default=0,
                    )
                    if max_ratio > self._FUZZY_MIN_RATIO:
                        score += max_ratio / 10

                if score >= self._SCORE_THRESHOLD:
                    detected_entities.add(entity)

        # Only context-inferred entities may be suppressed by the special
        # cases below; entities the user named explicitly are always kept.
        suppressible = self._SUPPRESSIBLE - explicit

        # Special case for prayer-related wording (remove if not needed).
        # "tempat sholat" always contains "sholat", so one check suffices.
        if "sholat" in text:
            detected_entities -= suppressible
            detected_entities.add("mushola")

        # Handle a standalone "pc" token or any word containing "komputer".
        if "pc" in words or "komputer" in text:
            detected_entities.add("komputer")

        # Special case for parking; "memarkirkan" already contains "memarkir".
        if "parkir" in text or "memarkir" in text:
            detected_entities -= suppressible
            detected_entities.add("parkir")

        return sorted(detected_entities)

# One extractor instance is reused for every sample message below.
extractor = EntityExtractor()

# Sample user questions (Indonesian): single and multiple facilities,
# a misspelling ("ruang diskus"), context-only wording and an upper-case "PC".
test_messages = [
    "Apakah ada wifi gratis?",
    "Saya mau ke ruang diskus dan ruang baca",
    "Apakah ada komputer buat umum dan toilet di sini?",
    "Dimana saya bisa fotocopy dan print?",
    "Ada cafetaria dan mushola?",
    "Saya ingin tahu tentang parkir dan internet",
    "Dimana saya bisa scan dokumen?",
    "Apakah ada tempat untuk sholat?",
    "Dimana tempat untuk memarkirkan kendaraan?",
    "Boleh pinjam PC sebentar?",
]

# Echo each message, then the entities found in it, separated by a blank line.
print("TESTING CONTEXT-AWARE ENTITY EXTRACTION:\n")
for msg in test_messages:
    print(f"User: {msg}")
    detected = extractor.extract_entities(msg)
    print(f"Detected Entities: {detected}\n")

TESTING CONTEXT-AWARE ENTITY EXTRACTION:

User: Apakah ada wifi gratis?
Detected Entities: ['wifi']

User: Saya mau ke ruang diskus dan ruang baca
Detected Entities: ['ruang_baca']

User: Apakah ada komputer buat umum dan toilet di sini?
Detected Entities: ['komputer', 'toilet']

User: Dimana saya bisa fotocopy dan print?
Detected Entities: ['printer']

User: Ada cafetaria dan mushola?
Detected Entities: ['kantin', 'mushola']

User: Saya ingin tahu tentang parkir dan internet
Detected Entities: ['parkir', 'wifi']

User: Dimana saya bisa scan dokumen?
Detected Entities: ['printer']

User: Apakah ada tempat untuk sholat?
Detected Entities: []

User: Dimana tempat untuk memarkirkan kendaraan?
Detected Entities: ['parkir']

User: Boleh pinjam PC sebentar?
Detected Entities: ['komputer']



In [7]:
import pandas as pd

# Training utterances about library facilities. Every utterance carries the
# same intent label, so the (text, intent) pairs are generated from the texts.
data_fasilitas = [
    (utterance, "fasilitas_perpustakaan")
    for utterance in (
        "Ada fasilitas wifi di perpustakaan?",
        "Fasilitas apa saja yang tersedia?",
        "Saya mau tahu tentang ruang diskusi.",
        "Apakah ada ruang belajar kelompok?",
        "Perpustakaan punya area membaca nyaman?",
        "Bisa pakai komputer di sana?",
        "Apakah ada tempat print atau fotokopi?",
        "Di perpustakaan ada mushola?",
        "Apakah ada kantin di dekat perpustakaan?",
        "Ada parkiran buat kendaraan?",
        "Apakah tersedia ruang baca pribadi?",
        "Saya ingin tahu apa saja fasilitas di perpustakaan.",
        "Ada ruang untuk belajar kelompok di perpustakaan?",
        "Perpustakaan ada layanan internet gratis?",
        "Bisa mengakses wifi di perpustakaan?",
        "Fasilitas untuk diskusi bareng ada?",
        "Kalau mau scan atau print, bisa di mana?",
        "Di perpustakaan ada toilet?",
        "Apakah disediakan komputer umum?",
        "Perpustakaan punya kantin atau tempat makan?",
        "Apakah tersedia ruang untuk presentasi?",
        "Ada ruang quiet zone untuk belajar?",
        "Fasilitas parkir tersedia?",
        "Bisa sholat di perpustakaan?",
        "Fasilitas mushola ada atau tidak?",
        "Kalau mau cari ruang baca, ada?",
        "Apa saja fasilitas umum di perpustakaan ini?",
        "Saya butuh tempat diskusi, ada?",
        "Apakah ada ruang seminar di perpustakaan?",
        "Ada komputer buat pengunjung?",
    )
]

# Build the two-column DataFrame (text, intent).
df_fasilitas = pd.DataFrame(data=data_fasilitas, columns=["text", "intent"])

# Write the dataset to CSV without the index column.
df_fasilitas.to_csv("intent_fasilitas_perpustakaan.csv", index=False)


In [9]:
import pandas as pd

# Training utterances about general library services (literature search,
# photocopying, reading room, lockers, internet). Every utterance carries the
# same intent label, so the (text, intent) pairs are generated from the texts.
data_fasilitas_umum = [
    (utterance, "fasilitas_perpustakaan")
    for utterance in (
        "Apakah tersedia layanan penelusuran literatur?",
        "Di perpustakaan bisa bantu cari jurnal?",
        "Ada fasilitas untuk penelusuran skripsi?",
        "Bisa bantu saya cari laporan PKL?",
        "Apakah ada layanan penelusuran jurnal ilmiah?",
        "Perpustakaan menyediakan fotokopi koleksi?",
        "Bisa fotokopi buku di perpustakaan?",
        "Dimana saya bisa fotokopi koleksi perpustakaan?",
        "Ada fasilitas fotokopi dokumen?",
        "Bisa fotokopi artikel jurnal di sana?",
        "Apakah ada ruang baca umum di perpustakaan?",
        "Saya mau cari ruang baca, ada?",
        "Dimana lokasi ruang baca di perpustakaan?",
        "Ada ruang baca nyaman untuk mahasiswa?",
        "Bolehkah menggunakan ruang baca di sana?",
        "Apakah ada locker untuk tas?",
        "Di mana tempat simpan tas dan jaket?",
        "Bisa menitipkan barang di perpustakaan?",
        "Ada locker penyimpanan barang pribadi?",
        "Locker tas tersedia di perpustakaan?",
        "Apakah ada akses internet gratis di perpustakaan?",
        "Bisa menggunakan WiFi di perpustakaan?",
        "Ada fasilitas internet untuk pengunjung?",
        "Bagaimana cara akses WiFi di perpustakaan?",
        "Internet tersedia di area perpustakaan?",
        "Fasilitas apa saja di perpustakaan ini?",
        "Apa saja layanan yang bisa digunakan di perpustakaan?",
        "Saya ingin tahu fasilitas umum perpustakaan.",
        "Ada fasilitas pendukung di perpustakaan?",
        "Layanan umum apa saja yang ada di perpustakaan?",
    )
]

# Build the two-column DataFrame (text, intent).
df_fasilitas_umum = pd.DataFrame(data=data_fasilitas_umum, columns=["text", "intent"])

# Write the dataset to CSV without the index column.
df_fasilitas_umum.to_csv("intent_fasilitas_umum.csv", index=False)
