<a href="https://colab.research.google.com/github/Marvel280904/All-Python-Project/blob/main/Checking_Scraping_Korean_School_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import requests
from io import StringIO

def analyze_school_data():
    # URLs dari GitHub Anda
    final_url = "https://raw.githubusercontent.com/Marvel280904/All-Python-Project/refs/heads/main/ALL_korean_schools.csv"
    backup_url = "https://raw.githubusercontent.com/Marvel280904/All-Python-Project/refs/heads/main/backup_schools_1780_records.csv"

    print("üìä MEMUAT DATA SEKOLAH...")
    print("=" * 60)

    try:
        # Load data dari URLs
        print("1. Loading Final Data (ALL_korean_schools.csv)...")
        final_df = pd.read_csv(final_url)

        print("2. Loading Backup Data (backup_schools_1780_records.csv)...")
        backup_df = pd.read_csv(backup_url)

        print("‚úÖ Data berhasil dimuat!")
        print(f"   Final Data: {len(final_df)} records")
        print(f"   Backup Data: {len(backup_df)} records")
        print()

    except Exception as e:
        print(f"‚ùå Error loading data: {e}")
        return

    # ANALISIS 1: CEK DUPLIKAT DI FILE FINAL
    print("üîç ANALISIS 1: CEK DUPLIKAT DI FILE FINAL")
    print("-" * 40)

    # Cek duplikat berdasarkan nama sekolah
    duplicate_names = final_df[final_df.duplicated('school_name', keep=False)]
    duplicate_urls = final_df[final_df.duplicated('source_url', keep=False)]

    print(f"Duplikat berdasarkan Nama Sekolah: {len(duplicate_names)}")
    print(f"Duplikat berdasarkan URL: {len(duplicate_urls)}")

    if len(duplicate_names) > 0:
        print("\n‚ö†Ô∏è  SEKOLAH DUPLIKAT (berdasarkan nama):")
        for idx, row in duplicate_names.iterrows():
            print(f"   - {row['school_name']}")
    else:
        print("‚úÖ Tidak ada duplikat berdasarkan nama sekolah")

    if len(duplicate_urls) > 0:
        print("\n‚ö†Ô∏è  DUPLIKAT URL:")
        for idx, row in duplicate_urls.iterrows():
            print(f"   - {row['source_url']} -> {row['school_name']}")
    else:
        print("‚úÖ Tidak ada duplikat berdasarkan URL")

    print()

    # ANALISIS 2: CEK KELENGKAPAN DATA
    print("üîç ANALISIS 2: CEK KELENGKAPAN DATA")
    print("-" * 40)

    # Bersihkan nama sekolah untuk perbandingan yang lebih akurat
    def clean_school_name(name):
        if pd.isna(name):
            return ""
        return str(name).strip().lower()

    # Apply cleaning
    final_df['clean_name'] = final_df['school_name'].apply(clean_school_name)
    backup_df['clean_name'] = backup_df['school_name'].apply(clean_school_name)

    # Cek sekolah yang ada di backup tapi tidak ada di final
    backup_names = set(backup_df['clean_name'])
    final_names = set(final_df['clean_name'])

    missing_in_final = backup_names - final_names

    print(f"Sekolah di Backup: {len(backup_names)}")
    print(f"Sekolah di Final: {len(final_names)}")
    print(f"Sekolah yang HILANG di Final: {len(missing_in_final)}")
    print()

    if len(missing_in_final) > 0:
        print("‚ùå SEKOLAH YANG ADA DI BACKUP TAPI TIDAK ADA DI FINAL:")
        missing_count = 0
        for school_name in sorted(missing_in_final):
            if school_name:  # Skip empty names
                # Cari data lengkap dari backup
                backup_school = backup_df[backup_df['clean_name'] == school_name].iloc[0]
                print(f"\n{missing_count + 1}. {backup_school['school_name']}")
                print(f"   üìç Alamat: {backup_school.get('address', 'Tidak ada')}")
                print(f"   üîó URL: {backup_school['source_url']}")
                missing_count += 1

                if missing_count >= 20:  # Batasi output
                    remaining = len(missing_in_final) - 20
                    print(f"\n... dan {remaining} sekolah lainnya")
                    break
    else:
        print("‚úÖ SEMUA SEKOLAH DARI BACKUP ADA DI FILE FINAL!")

    print()

    # ANALISIS 3: STATISTIK DETAIL
    print("üìà ANALISIS 3: STATISTIK DETAIL")
    print("-" * 40)

    print(f"Total records Backup: {len(backup_df)}")
    print(f"Total records Final: {len(final_df)}")
    print(f"Perbedaan jumlah: {len(backup_df) - len(final_df)} records")
    print()

    # Cek data quality
    print("Data Quality Final File:")
    print(f"  - Sekolah dengan nama: {final_df['school_name'].notna().sum()}")
    print(f"  - Sekolah dengan alamat: {final_df['address'].notna().sum()}")
    print(f"  - Sekolah tanpa alamat: {final_df['address'].isna().sum()}")
    print()

    # ANALISIS 4: COMPARE UNIQUE VALUES
    print("üîç ANALISIS 4: PERBANDINGAN UNIK")
    print("-" * 40)

    # Hitung unique values berdasarkan kombinasi nama + URL
    backup_unique = backup_df.drop_duplicates(subset=['school_name', 'source_url']).shape[0]
    final_unique = final_df.drop_duplicates(subset=['school_name', 'source_url']).shape[0]

    print(f"Unique schools (nama+URL) di Backup: {backup_unique}")
    print(f"Unique schools (nama+URL) di Final: {final_unique}")

    # Cek jika ada sekolah di final yang tidak ada di backup
    extra_in_final = final_names - backup_names
    if len(extra_in_final) > 0:
        print(f"\n‚ö†Ô∏è  Sekolah di Final tapi TIDAK ada di Backup: {len(extra_in_final)}")
        for school_name in list(extra_in_final)[:5]:  # Tampilkan 5 pertama
            if school_name:
                final_school = final_df[final_df['clean_name'] == school_name].iloc[0]
                print(f"   - {final_school['school_name']}")

    print()

    # ANALISIS 5: SUMMARY
    print("üéØ SUMMARY KESELURUHAN")
    print("-" * 40)

    if len(duplicate_names) == 0 and len(missing_in_final) == 0:
        print("‚úÖ SANGAT BAIK! File final:")
        print("   - Tidak ada duplikat")
        print("   - Semua sekolah dari backup sudah termasuk")
        print("   - Data sudah bersih dan lengkap")
    else:
        if len(duplicate_names) > 0:
            print("‚ùå PERBAIKI: Ada duplikat di file final")
        if len(missing_in_final) > 0:
            print(f"‚ùå PERBAIKI: {len(missing_in_final)} sekolah hilang dari file final")

        print("\nüí° REKOMENDASI:")
        if len(missing_in_final) > 0:
            print("   - Tambahkan sekolah yang hilang dari backup ke file final")
        if len(duplicate_names) > 0:
            print("   - Hapus duplikat dari file final")

def export_missing_schools():
    """Export sekolah yang hilang ke file CSV"""
    print("\nüíæ EKSPOR DATA YANG HILANG...")

    try:
        final_url = "https://raw.githubusercontent.com/Marvel280904/All-Python-Project/refs/heads/main/ALL_korean_schools.csv"
        backup_url = "https://raw.githubusercontent.com/Marvel280904/All-Python-Project/refs/heads/main/backup_schools_1780_records.csv"

        final_df = pd.read_csv(final_url)
        backup_df = pd.read_csv(backup_url)

        # Bersihkan nama
        final_df['clean_name'] = final_df['school_name'].str.strip().str.lower()
        backup_df['clean_name'] = backup_df['school_name'].str.strip().str.lower()

        # Cari yang hilang
        backup_names = set(backup_df['clean_name'])
        final_names = set(final_df['clean_name'])
        missing_in_final = backup_names - final_names

        if len(missing_in_final) > 0:
            # Ambil data lengkap sekolah yang hilang
            missing_schools = backup_df[backup_df['clean_name'].isin(missing_in_final)]

            # Export ke CSV
            missing_schools = missing_schools.drop('clean_name', axis=1)  # Hapus kolom cleaning
            missing_schools.to_csv('missing_schools.csv', index=False, encoding='utf-8-sig')
            print(f"‚úÖ Diexport {len(missing_schools)} sekolah yang hilang ke 'missing_schools.csv'")
        else:
            print("‚úÖ Tidak ada sekolah yang hilang untuk diexport")

    except Exception as e:
        print(f"‚ùå Error exporting: {e}")

# Jalankan analisis
if __name__ == "__main__":
    print("üéì ANALISIS DATA SEKOLAH KOREA")
    print("=" * 60)

    analyze_school_data()

    # Tanya user apakah mau export data yang hilang
    response = input("\nüí° Mau export sekolah yang hilang ke CSV? (y/n): ")
    if response.lower() == 'y':
        export_missing_schools()

    print("\nüéâ ANALISIS SELESAI!")

üéì ANALISIS DATA SEKOLAH KOREA
üìä MEMUAT DATA SEKOLAH...
1. Loading Final Data (ALL_korean_schools.csv)...
2. Loading Backup Data (backup_schools_1780_records.csv)...
‚úÖ Data berhasil dimuat!
   Final Data: 1103 records
   Backup Data: 1780 records

üîç ANALISIS 1: CEK DUPLIKAT DI FILE FINAL
----------------------------------------
Duplikat berdasarkan Nama Sekolah: 37
Duplikat berdasarkan URL: 0

‚ö†Ô∏è  SEKOLAH DUPLIKAT (berdasarkan nama):
   - nan
   - nan
   - Creverse April Incheon Namdong
   - nan
   - Shepherd International Education
   - IEM English Academy
   - nan
   - nan
   - nan
   - nan
   - nan
   - nan
   - nan
   - Creverse CDI Wirye
   - nan
   - Shepherd International Education
   - nan
   - nan
   - nan
   - nan
   - nan
   - nan
   - nan
   - nan
   - IEM English Academy
   - nan
   - nan
   - nan
   - nan
   - nan
   - Creverse CDI Wirye
   - Creverse April Incheon Namdong
   - nan
   - nan
   - nan
   - nan
   - nan
‚úÖ Tidak ada duplikat berdasarkan URL

