# SINTA Web Scraping - ETL dengan Hadoop

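Notebook ini akan:
1. Meng-clone repository dari GitHub
2. Meng-install dependencies Python
3. Meng-install Java, lalu meng-install dan mengonfigurasi Hadoop (mode pseudo-distributed) beserta SSH
4. Menjalankan proses ETL dan memverifikasi hasilnya di HDFS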

---


## Step 1: Clone Repository


In [None]:
# Clone repository
!git clone https://github.com/Herutriana44/sinta_web_scraping.git

# Pindah ke direktori project
import os
os.chdir('sinta_web_scraping')

print("‚úÖ Repository berhasil di-clone")
print(f"üìÅ Current directory: {os.getcwd()}")


## Step 2: Install Dependencies Python


In [None]:
# Install the Python packages needed by the notebook (quiet mode).
# NOTE(review): the package list is spelled out on the command line; this
# command does not read requirements.txt.
# NOTE(review): `bs4` and `beautifulsoup4` are both listed — presumably
# redundant; verify before trimming.
# NOTE(review): the ETL cell passes an http:// URL on port 9870 as hdfs_url;
# confirm that `hdfs3` is the client library sinta_journals_etl actually imports.
!pip install -q selenium webdriver-manager bs4 beautifulsoup4 hdfs3

print("‚úÖ Dependencies Python berhasil diinstall")


## Step 3: Install Java (Required untuk Hadoop)


In [None]:
# Install Java JDK 8 (required untuk Hadoop)
!apt-get update -qq
!apt-get install -y -qq openjdk-8-jdk > /dev/null 2>&1

# Set JAVA_HOME
import os
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['PATH'] = os.environ['JAVA_HOME'] + '/bin:' + os.environ['PATH']

# Verify Java installation
!java -version

print("\n‚úÖ Java berhasil diinstall")


## Step 4: Install dan Setup Hadoop


In [None]:
# Download Hadoop 3.3.6
!wget -q https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz

# Extract Hadoop
!tar -xzf hadoop-3.3.6.tar.gz

# Set environment variables
import os
os.environ['HADOOP_HOME'] = os.path.join(os.getcwd(), 'hadoop-3.3.6')
os.environ['HADOOP_CONF_DIR'] = os.path.join(os.environ['HADOOP_HOME'], 'etc/hadoop')
os.environ['PATH'] = os.path.join(os.environ['HADOOP_HOME'], 'bin') + ':' + \
                     os.path.join(os.environ['HADOOP_HOME'], 'sbin') + ':' + os.environ['PATH']

print("‚úÖ Hadoop berhasil di-download dan di-extract")
print(f"üìÅ HADOOP_HOME: {os.environ['HADOOP_HOME']}")


## Step 5: Konfigurasi Hadoop (Pseudo-Distributed Mode)


In [None]:
import os

# Directory that holds Hadoop's *-site.xml files and hadoop-env.sh
hadoop_conf_dir = os.path.join(os.environ['HADOOP_HOME'], 'etc/hadoop')

# core-site.xml: default filesystem URI
core_site_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>'''

# hdfs-site.xml: replication factor 1 and NameNode/DataNode storage under /tmp.
# ${user.name} is written to the file literally (this is not an f-string).
hdfs_site_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///tmp/hadoop-${user.name}/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///tmp/hadoop-${user.name}/dfs/data</value>
    </property>
</configuration>'''

# mapred-site.xml: MapReduce framework name
mapred_site_xml = '''<?xml version="1.0"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>'''

# yarn-site.xml: NodeManager auxiliary service
yarn_site_xml = '''<?xml version="1.0"?>
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>'''

# Overwrite the four site files, in the same order as before.
site_files = (
    ('core-site.xml', core_site_xml),
    ('hdfs-site.xml', hdfs_site_xml),
    ('mapred-site.xml', mapred_site_xml),
    ('yarn-site.xml', yarn_site_xml),
)
for file_name, xml_body in site_files:
    with open(os.path.join(hadoop_conf_dir, file_name), 'w') as conf_file:
        conf_file.write(xml_body)

# hadoop-env.sh is appended to (mode 'a'), not overwritten, so the stock
# contents of the file are kept and these exports are added at the end.
hadoop_env_sh = (
    f"export JAVA_HOME={os.environ['JAVA_HOME']}\n"
    f"export HADOOP_HOME={os.environ['HADOOP_HOME']}\n"
    f"export HADOOP_CONF_DIR={os.environ['HADOOP_CONF_DIR']}\n"
)
with open(os.path.join(hadoop_conf_dir, 'hadoop-env.sh'), 'a') as env_file:
    env_file.write(hadoop_env_sh)

print("‚úÖ Konfigurasi Hadoop selesai")


## Step 6: Setup SSH (Required untuk Hadoop)


In [None]:
# Install SSH server
!apt-get install -y -qq openssh-server > /dev/null 2>&1

# Setup SSH untuk Hadoop
!ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa > /dev/null 2>&1
!cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
!chmod 0600 ~/.ssh/authorized_keys

# Start SSH service
!service ssh start > /dev/null 2>&1

# Test SSH connection
!ssh-keyscan -H localhost >> ~/.ssh/known_hosts 2>/dev/null

print("‚úÖ SSH berhasil dikonfigurasi")


## Step 7: Format HDFS dan Start Hadoop Services


In [None]:
import os
import subprocess
import time

# Format HDFS (hanya perlu dilakukan sekali, tapi aman untuk diulang)
print("üìù Formatting HDFS...")
format_result = subprocess.run(
    ['hdfs', 'namenode', '-format', '-force', '-nonInteractive'],
    env=os.environ,
    capture_output=True,
    text=True
)

if format_result.returncode == 0:
    print("‚úÖ HDFS berhasil di-format")
else:
    print(f"‚ö†Ô∏è Format HDFS: {format_result.stderr}")

# Start NameNode
print("\nüöÄ Starting NameNode...")
namenode_process = subprocess.Popen(
    ['hdfs', '--daemon', 'start', 'namenode'],
    env=os.environ
)
time.sleep(5)

# Start DataNode
print("üöÄ Starting DataNode...")
datanode_process = subprocess.Popen(
    ['hdfs', '--daemon', 'start', 'datanode'],
    env=os.environ
)
time.sleep(5)

# Check status
print("\nüìä Checking Hadoop services status...")
!jps

print("\n‚úÖ Hadoop services sudah berjalan")
print("üåê NameNode Web UI: http://localhost:9870")
print("üìÅ HDFS Path: hdfs://localhost:9000")


## Step 8: Verifikasi HDFS dan Buat Direktori


In [None]:
import subprocess
import os

# Tunggu beberapa detik untuk memastikan HDFS siap
import time
time.sleep(3)

# Test HDFS dengan membuat direktori
print("üìÅ Membuat direktori di HDFS...")

# Buat direktori untuk data SINTA
result = subprocess.run(
    ['hdfs', 'dfs', '-mkdir', '-p', '/user/sinta/journals'],
    env=os.environ,
    capture_output=True,
    text=True
)

if result.returncode == 0:
    print("‚úÖ Direktori HDFS berhasil dibuat: /user/sinta/journals")
else:
    print(f"‚ö†Ô∏è {result.stderr}")

# List direktori HDFS
print("\nüìÇ Isi direktori HDFS:")
!hdfs dfs -ls -R /user/sinta/

print("\n‚úÖ HDFS siap digunakan!")


## Step 9: Jalankan ETL Process


In [None]:
# The ETL class comes from the cloned repository (module sinta_journals_etl).
from sinta_journals_etl import SINTAJournalsETL

# Constructor settings for an HDFS-enabled run, collected in one place.
etl_settings = {
    'input_folder': "output_journals",
    'output_folder': "output_data",
    'hdfs_enabled': True,
    'hdfs_url': "http://localhost:9870",
    'hdfs_path': "/user/sinta/journals",
    'hdfs_user': None,  # None: let the ETL use its default user
}

print("üîß Menginisialisasi ETL dengan HDFS...")
etl = SINTAJournalsETL(**etl_settings)

# Run the pipeline, requesting 'both' output formats and a copy in HDFS.
print("\nüöÄ Menjalankan proses ETL...")
etl.run(save_to_hdfs=True, output_format='both')

print("\n‚úÖ ETL Process selesai!")


## Step 10: Verifikasi Hasil ETL


In [None]:
import os
import json
from pathlib import Path

# Cek file output lokal
output_dir = Path('output_data')
if output_dir.exists():
    print("üìÅ File output lokal:")
    for file in sorted(output_dir.glob('*')):
        size = file.stat().st_size / 1024  # KB
        print(f"  - {file.name} ({size:.2f} KB)")
    
    # Tampilkan statistik jika ada
    stats_files = list(output_dir.glob('extraction_stats_*.json'))
    if stats_files:
        latest_stats = sorted(stats_files)[-1]
        with open(latest_stats, 'r') as f:
            stats = json.load(f)
        print("\nüìä Statistik Ekstraksi:")
        print(json.dumps(stats['statistics'], indent=2))

# Cek file di HDFS
print("\nüìÅ File di HDFS:")
!hdfs dfs -ls -R /user/sinta/journals/

print("\n‚úÖ Verifikasi selesai!")


## Optional: Stop Hadoop Services


In [None]:
# Optional teardown: stop the HDFS daemons, DataNode first and NameNode second.
import os
import subprocess

print("üõë Menghentikan Hadoop services...")

for daemon_name in ('datanode', 'namenode'):
    # Exit status is not checked; stopping is best effort.
    subprocess.run(['hdfs', '--daemon', 'stop', daemon_name], env=os.environ)

print("‚úÖ Hadoop services dihentikan")


## Alternatif: ETL Tanpa HDFS (Jika Hadoop Bermasalah)

Jika mengalami masalah dengan Hadoop di Colab, jalankan ETL tanpa HDFS:


In [None]:
# Fallback path: run the same ETL with HDFS switched off, for when Hadoop is
# not working.
from sinta_journals_etl import SINTAJournalsETL

# Local-only settings: same input/output folders, HDFS disabled.
local_settings = {
    'input_folder': "output_journals",
    'output_folder': "output_data",
    'hdfs_enabled': False,
}

print("üîß Menginisialisasi ETL tanpa HDFS...")
etl_no_hdfs = SINTAJournalsETL(**local_settings)

# Run the pipeline, requesting 'both' output formats and no HDFS upload.
print("\nüöÄ Menjalankan proses ETL...")
etl_no_hdfs.run(save_to_hdfs=False, output_format='both')

print("\n‚úÖ ETL Process selesai (tanpa HDFS)!")
