In [1]:
import json
import pandas as pd
from pathlib import Path
import unicodedata

def normalize_text(text):
    """Return a canonical form of ``text`` for comparing names.

    The value is lowercased, decomposed with NFKD, reduced to ASCII (which
    drops the combining accents produced by the decomposition along with any
    other non-ASCII character), and finally has every run of whitespace
    collapsed to a single space with the ends trimmed.

    Args:
        text: Value to normalize. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    decomposed = unicodedata.normalize('NFKD', text.lower())
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    # split()/join trims the ends AND collapses inner whitespace runs, so
    # "Budi  Santoso" and "Budi Santoso" produce the same key. Done last
    # because dropping non-ASCII characters can itself leave stray spaces.
    return " ".join(ascii_only.split())

def load_corpus_data(corpus_path):
    """Read the corpus JSON file and return its parsed content.

    Args:
        corpus_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The parsed JSON value. When opening, parsing, or measuring the data
        fails, the error is printed and an empty list is returned instead.
    """
    try:
        with open(corpus_path, mode='r', encoding='utf-8') as handle:
            documents = json.load(handle)
        total = len(documents)
        print(f"Berhasil load {total} dokumen dari corpus.json")
    except Exception as exc:
        print(f"Error loading corpus data: {exc}")
        return []
    return documents

def load_excel_data(excel_path):
    """Read the Excel workbook into a DataFrame and report its shape.

    Args:
        excel_path: Path handed to ``pd.read_excel``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed rather than raised).
    """
    try:
        frame = pd.read_excel(excel_path)
        row_count = len(frame)
        print(f"Berhasil load data Excel dengan {row_count} baris")
        column_names = list(frame.columns)
        print(f"Kolom yang tersedia: {column_names}")
    except Exception as exc:
        print(f"Error loading Excel data: {exc}")
        return None
    return frame

def create_author_mapping(df):
    """Build a lookup from normalized author name to that row's links.

    Each non-missing ``Authors`` cell is split on ``;`` when it contains one,
    otherwise on ``,``. Every resulting name is normalized with
    ``normalize_text`` and stored as a key; a name that appears in several
    rows keeps the links of the last row seen.

    Args:
        df: DataFrame expected to contain the columns ``Authors``,
            ``Link Detail`` and ``Link PDF``.

    Returns:
        A dict ``{normalized_name: {'Link Detail': str, 'Link PDF': str}}``
        where missing links are empty strings. An empty dict is returned
        (after printing the available columns) as soon as one required
        column is found to be absent.
    """
    # Bail out on the first required column that the sheet lacks.
    for column in ('Authors', 'Link Detail', 'Link PDF'):
        if column in df.columns:
            continue
        print(f"Kolom '{column}' tidak ditemukan di Excel")
        print(f"Kolom yang ada: {list(df.columns)}")
        return {}

    mapping = {}
    for _, record in df.iterrows():
        raw_authors = record['Authors']
        if pd.isna(raw_authors):
            continue

        # A semicolon anywhere in the cell takes precedence over commas.
        authors_text = str(raw_authors)
        separator = ';' if ';' in authors_text else ','

        for name in authors_text.split(separator):
            key = normalize_text(name)
            if not key:
                continue
            detail = record['Link Detail']
            pdf = record['Link PDF']
            mapping[key] = {
                'Link Detail': '' if pd.isna(detail) else str(detail),
                'Link PDF': '' if pd.isna(pdf) else str(pdf),
            }

    print(f"Dibuat mapping untuk {len(mapping)} penulis unik")
    return mapping

def enhance_corpus_data(corpus_data, author_mapping):
    """Attach ``link_detail`` and ``link_pdf`` to every corpus document.

    For a document that has an ``authors`` value, the normalized author text
    is first looked up exactly in ``author_mapping``. If that fails, the first
    mapping key that contains, or is contained in, the normalized text is
    used. Documents are modified in place.

    Args:
        corpus_data: List of document dicts.
        author_mapping: Dict from normalized author name to a dict holding
            ``'Link Detail'`` and ``'Link PDF'``.

    Returns:
        The same ``corpus_data`` list, with both link keys present on every
        document (empty strings when no link could be found).
    """
    enhanced_count = 0

    for doc in corpus_data:
        if 'authors' not in doc:
            # Keep the output schema uniform without clobbering any link a
            # document may already carry.
            doc.setdefault('link_detail', '')
            doc.setdefault('link_pdf', '')
            continue

        doc_authors = doc['authors']
        normalized_doc_authors = normalize_text(doc_authors)

        links = None
        # An empty normalized name must never be matched: "" is a substring of
        # every key, so the partial search would hand this document the links
        # of whichever author happens to come first in the mapping.
        if normalized_doc_authors:
            links = author_mapping.get(normalized_doc_authors)
            if links is None:
                links = next(
                    (
                        candidate
                        for author_key, candidate in author_mapping.items()
                        if author_key and (
                            author_key in normalized_doc_authors
                            or normalized_doc_authors in author_key
                        )
                    ),
                    None,
                )

        if links is not None:
            doc['link_detail'] = links['Link Detail']
            doc['link_pdf'] = links['Link PDF']
            enhanced_count += 1
        else:
            doc['link_detail'] = ''
            doc['link_pdf'] = ''
            print(f"Tidak ditemukan link untuk penulis: {doc_authors}")

    print(f"Berhasil menambahkan link ke {enhanced_count} dari {len(corpus_data)} dokumen")
    return corpus_data

def save_enhanced_data(corpus_data, output_path):
    """Write the enriched corpus to ``output_path`` as indented UTF-8 JSON.

    Args:
        corpus_data: JSON-serializable data to store.
        output_path: Destination file; it is overwritten if it exists.

    Returns:
        ``True`` when the file was written, ``False`` when opening or
        serializing failed (the error is printed rather than raised).
    """
    try:
        with open(output_path, mode='w', encoding='utf-8') as sink:
            # ensure_ascii=False keeps non-ASCII characters readable on disk.
            json.dump(corpus_data, sink, ensure_ascii=False, indent=2)
        print(f"Data berhasil disimpan ke: {output_path}")
    except Exception as exc:
        print(f"Error saving enhanced data: {exc}")
        return False
    return True

def main(base_path=None):
    """Run the enrichment pipeline: corpus JSON + Excel links -> database_TA.json.

    The steps are: load ``corpus.json``, load
    ``clean_dataTA_SearchEngine.xlsx``, build the author-to-links mapping,
    attach the links to the corpus documents, and save the result as
    ``database_TA.json``. Progress and summary statistics are printed. The
    function returns early (with a message) when any input cannot be loaded
    or the mapping comes back empty.

    Args:
        base_path: Directory that holds the two input files and receives the
            output file. When omitted, the original hard-coded processed-data
            directory is used, so ``main()`` behaves as it always did.
    """
    if base_path is None:
        base_path = r"C:\Users\Widnyana\Documents\TUGAS AKHIR\Program TA\skripsi-search-engine\data\processed"
    base_path = Path(base_path)
    corpus_path = base_path / "corpus.json"
    excel_path = base_path / "clean_dataTA_SearchEngine.xlsx"
    output_path = base_path / "database_TA.json"

    banner = "=" * 50
    print(banner)
    print("PROSES ENHANCEMENT DATA CORPUS")
    print(banner)

    # Step 1: load the corpus documents.
    print("\n1. Memuat data corpus...")
    corpus_data = load_corpus_data(corpus_path)
    if not corpus_data:
        print("Tidak ada data corpus yang dimuat. Proses dihentikan.")
        return

    # Step 2: load the Excel sheet that carries the links.
    print("\n2. Memuat data Excel...")
    df = load_excel_data(excel_path)
    if df is None or df.empty:
        print("Tidak ada data Excel yang dimuat. Proses dihentikan.")
        return

    # Step 3: build the author -> links mapping from the sheet.
    print("\n3. Membuat mapping penulis dari Excel...")
    author_mapping = create_author_mapping(df)
    if not author_mapping:
        print("Tidak berhasil membuat mapping penulis. Proses dihentikan.")
        return

    # Step 4: show a handful of mapping entries as a sanity check.
    print("\n4. Contoh mapping penulis yang dibuat:")
    for author, links in list(author_mapping.items())[:5]:
        print(f"  - {author}:")
        print(f"    Link Detail: {links['Link Detail'][:50]}...")
        print(f"    Link PDF: {links['Link PDF'][:50]}...")

    # Step 5: attach the links to the corpus documents.
    print("\n5. Menambahkan link ke corpus data...")
    enhanced_corpus = enhance_corpus_data(corpus_data, author_mapping)

    # Step 6: show the first few enriched documents.
    print("\n6. Contoh dokumen yang sudah ditingkatkan:")
    for number, doc in enumerate(enhanced_corpus[:3], start=1):
        print(f"\nDokumen {number}:")
        print(f"  ID: {doc.get('doc_id', 'N/A')}")
        print(f"  Penulis: {doc.get('authors', 'N/A')}")
        print(f"  Link Detail: {doc.get('link_detail', 'N/A')[:50]}...")
        print(f"  Link PDF: {doc.get('link_pdf', 'N/A')[:50]}...")

    # Step 7: persist the result.
    print("\n7. Menyimpan hasil ke database_TA.json...")
    if not save_enhanced_data(enhanced_corpus, output_path):
        print("\nGagal menyimpan hasil.")
        return

    print("\n" + banner)
    print("PROSES SELESAI!")
    print(f"Hasil disimpan di: {output_path}")
    print(f"Total dokumen: {len(enhanced_corpus)}")
    print(banner)

    # Summary statistics: a link counts only when it is a non-empty value.
    with_detail = sum(1 for doc in enhanced_corpus if doc.get('link_detail'))
    with_pdf = sum(1 for doc in enhanced_corpus if doc.get('link_pdf'))
    with_any = sum(
        1 for doc in enhanced_corpus
        if doc.get('link_detail') or doc.get('link_pdf')
    )
    print("\nSTATISTIK:")
    print(f"  - Dokumen dengan link detail: {with_detail}")
    print(f"  - Dokumen dengan link PDF: {with_pdf}")
    print(f"  - Dokumen dengan minimal satu link: {with_any}")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

PROSES ENHANCEMENT DATA CORPUS

1. Memuat data corpus...
Berhasil load 2754 dokumen dari corpus.json

2. Memuat data Excel...
Berhasil load data Excel dengan 2754 baris
Kolom yang tersedia: ['Title', 'Authors', 'Advisors', 'Keywords', 'Publisher', 'Abstract', 'Issue Date', 'Link Detail', 'Link PDF', 'BAB 1', 'BAB 2', 'BAB 3', 'BAB 4', 'BAB 5']

3. Membuat mapping penulis dari Excel...
Dibuat mapping untuk 2748 penulis unik

4. Contoh mapping penulis yang dibuat:
  - windy fajriah fitri:
    Link Detail: https://repository.uinjkt.ac.id/dspace/handle/1234...
    Link PDF: https://repository.uinjkt.ac.id/dspace/bitstream/1...
  - muhammad taqy pratama putra:
    Link Detail: https://repository.uinjkt.ac.id/dspace/handle/1234...
    Link PDF: https://repository.uinjkt.ac.id/dspace/bitstream/1...
  - rio galeh prayoga:
    Link Detail: https://repository.uinjkt.ac.id/dspace/handle/1234...
    Link PDF: https://repository.uinjkt.ac.id/dspace/bitstream/1...
  - yasmine amalia ismail:
    Link

In [4]:
import pandas as pd
import json
import os
from datetime import datetime

# =========================
# PATHS
# =========================
# Absolute location of the cleaned Excel sheet that is the source of records.
excel_path = r"C:\Users\Widnyana\Documents\TUGAS AKHIR\Program TA\skripsi-search-engine\data\processed\clean_dataTA_SearchEngine.xlsx"
# Relative file name: the JSON is written to the current working directory.
output_json = "database_skripsi.json"

# =========================
# LOAD EXCEL
# =========================
df = pd.read_excel(excel_path)

# =========================
# SAFE CONVERSION TO STRING
# =========================
def safe_str(value):
    """Convert a cell value to text.

    Missing values (as judged by ``pd.isna``) become ``""``; ``pd.Timestamp``
    and ``datetime`` values are rendered as ``YYYY-MM-DD``; everything else
    goes through ``str``.
    """
    if pd.isna(value):
        return ""
    is_date_like = isinstance(value, (pd.Timestamp, datetime))
    return value.strftime("%Y-%m-%d") if is_date_like else str(value)

# =========================
# BUILD JSON STRUCTURE
# =========================
# Columns copied into the nested "fields" object, in output order.
field_order = (
    "Title", "Keywords", "Abstract",
    "BAB 1", "BAB 2", "BAB 3", "BAB 4", "BAB 5",
    "Authors", "Advisors", "Issue Date", "Publisher",
)
# Columns that are only used for the top-level link entries.
link_columns = ("Link Detail", "Link PDF")

documents = []

for idx, row in df.iterrows():
    # Convert every needed cell once; a column absent from the sheet falls
    # back to "" through row.get.
    cell = {
        column: safe_str(row.get(column, ""))
        for column in field_order + link_columns
    }

    doc = {
        "doc_id": f"doc_{idx}",
        "title": cell["Title"],
        "keywords": cell["Keywords"],
        "abstract": cell["Abstract"],
        "authors": cell["Authors"],
        "issue_date": cell["Issue Date"],
        "publisher": cell["Publisher"],
        "fields": {column: cell[column] for column in field_order},
        "link_detail": cell["Link Detail"],
        "link_pdf": cell["Link PDF"],
    }
    documents.append(doc)

# =========================
# SAVE JSON
# =========================
with open(output_json, mode="w", encoding="utf-8") as f:
    json.dump(documents, f, indent=2, ensure_ascii=False)

print(f"Total dokumen: {len(documents)}")
print(f"File berhasil dibuat: {os.path.abspath(output_json)}")


Total dokumen: 2754
File berhasil dibuat: c:\Users\Widnyana\Documents\TUGAS AKHIR\Program TA\skripsi-search-engine\data\processed\database_skripsi.json
