# **Summary**

## Instalasi dan import library

Menginstal library yang dibutuhkan untuk proses crawling, pengolahan teks, dan analisis jaringan.

In [None]:
!pip install networkx matplotlib nltk Sastrawi tqdm pandas sklearn requests beautifulsoup4

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


## Import library

In [None]:
!pip install Sastrawi # Install the correct package

import numpy as np
import pandas as pd
import requests
import re
import networkx as nx

from urllib.request import urlopen
from bs4 import BeautifulSoup

from tqdm import tqdm

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer models from the 'punkt_tab'
# resource; without it word_tokenize / sent_tokenize raise LookupError.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.corpus import stopwords
# Import from the correct package and module
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

Collecting Sastrawi
  Using cached Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Crawling artikel

Mengambil konten artikel dari URL menggunakan BeautifulSoup.

In [None]:
def crawl_article(url):
    """Download one article page and extract its title, body, date and category.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys 'Judul', 'Isi', 'Tanggal' and 'Kategori'. Any field
        whose HTML element is missing is filled with a "... tidak ditemukan"
        placeholder string. Returns None when the HTTP request fails.
    """
    try:
        # The timeout keeps the notebook from hanging on an unresponsive host.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Treat HTTP error statuses as failures.
    except requests.RequestException as e:
        print(f"Error fetching article: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Title
    title_element = soup.find('h1', class_='jeg_post_title')
    title = title_element.get_text(strip=True) if title_element else 'Judul tidak ditemukan'

    # Body: every <p> inside the content container, one paragraph per line.
    content_div = soup.find('div', class_='content-inner')
    if content_div:
        paragraphs = content_div.find_all('p')
        content = "\n".join(p.get_text(strip=True) for p in paragraphs)
    else:
        content = 'Isi artikel tidak ditemukan'

    # Date: the container may exist without a link inside it, so check both
    # levels before reading the text.
    date_div = soup.find('div', class_='jeg_meta_date')
    date_link = date_div.find('a') if date_div else None
    date_text = date_link.get_text(strip=True) if date_link else 'Tanggal tidak ditemukan'

    # Category: taken from the link inside the last breadcrumb element.
    category_meta = soup.find('span', class_='breadcrumb_last_link')
    category_link = category_meta.find('a') if category_meta else None
    category = category_link.get_text(strip=True) if category_link else 'Kategori tidak ditemukan'

    return {'Judul': title, 'Isi': content, 'Tanggal': date_text, 'Kategori': category}

# Ask the user which article to summarise.
article_url = input("Masukkan URL artikel: ")

# Crawl that single article; `article` is None when the request failed.
article = crawl_article(article_url)

# Wrap the extracted fields in a one-row DataFrame.
if not article:
    print("Gagal mengambil artikel.")
else:
    df = pd.DataFrame([article])
    print(df)

Masukkan URL artikel: https://mojok.co/liputan/kuliner/perjalanan-34-tahun-roti-bakar-si-bob-di-jalan-kaliurang-sleman/
                                               Judul  \
0  Roti dan Jagung Bakar Si Bob Jalan Kaliurang: ...   

                                                 Isi           Tanggal  \
0  Dari awal bukanya di tahun 1990, warung Roti &...  28 November 2024   

  Kategori  
0  Kuliner  


## Preprocessing teks

Mengubah teks menjadi huruf kecil.

In [None]:
# Lower-casing helper
def clean_lower(text):
    """Return `text` in lower case; values that are not strings pass through unchanged."""
    return text.lower() if isinstance(text, str) else text

# Lower-case the article body and keep the result in its own column.
isi_lower = df['Isi'].apply(clean_lower)
df['lower case'] = isi_lower
df['lower case']

Unnamed: 0,lower case
0,"dari awal bukanya di tahun 1990, warung roti &..."


Membersihkan tanda baca dan angka

In [None]:
# Punctuation / digit stripping helper (commas are removed as well)
def clean_punct(text):
    """Replace everything except lower-case a-z, spaces and full stops with a space.

    Runs of whitespace are then collapsed to one space and the ends trimmed.
    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Digits, listed punctuation, and any other character outside "a-z", " ", ".".
    without_symbols = re.sub(r'[0-9]|[/(){}\[\]\|@,;_]|[^a-z .]+', ' ', text)
    collapsed = re.sub(r'\s+', ' ', without_symbols)
    return collapsed.strip()

# Strip punctuation and digits from the lower-cased text.
isi_no_punct = df['lower case'].apply(clean_punct)
df['tanda baca'] = isi_no_punct
df['tanda baca']

Unnamed: 0,tanda baca
0,dari awal bukanya di tahun warung roti jagung ...


Normalisasi spasi

In [None]:
# Whitespace normalisation helper
def _normalize_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'\s+', ' ', text).strip()

# Normalise the spacing of the punctuation-free text.
isi_spaced = df['tanda baca'].apply(_normalize_whitespace)
df['spasi'] = isi_spaced
df['spasi']

Unnamed: 0,spasi
0,dari awal bukanya di tahun warung roti jagung ...


Tokenisasi kata

In [None]:
# Word tokenisation helper
def tokenize_text(text):
    """Split a string into tokens with NLTK's word_tokenize.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return word_tokenize(text)

# Tokenise the whitespace-normalised text into a list of words per row.
isi_tokens = df['spasi'].apply(tokenize_text)
df['token'] = isi_tokens
df['token']

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


Pembersihan stopwords

In [None]:
# Stopword removal, applied after tokenisation
def clean_stopwords(tokens):
    """Drop the tokens found in NLTK's Indonesian stopword list.

    Only lists are filtered; any other value is returned unchanged.
    """
    if not isinstance(tokens, list):
        return tokens
    indonesian_stopwords = set(stopwords.words('indonesian'))
    return [token for token in tokens if token not in indonesian_stopwords]

# Remove stopwords from every token list.
isi_without_stopwords = df['token'].apply(clean_stopwords)
df['stopwords'] = isi_without_stopwords
df['stopwords']

Gabungkan kembali token menjadi string

In [None]:
# Join the remaining tokens back into one string per row; rows that do not
# hold a token list become an empty string.
processed_text = df['stopwords'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else ''
)
df['processed_text'] = processed_text

# Mapping of row index -> processed text, used by the later cells.
prep_result = df['processed_text'].to_dict()
print(prep_result)

Memecah artikel menjadi kalimat, baik dalam versi teks asli maupun versi yang telah diproses.

In [None]:
# Split the ORIGINAL article into sentences first, then clean each sentence on
# its own. clean_punct removes '?' and '!', so sentence-splitting the cleaned
# full text can yield a different number of sentences than the original, and
# the graph node ids would then no longer point at the right entry in `kalimat`.
kalimat = nltk.sent_tokenize(df['Isi'][0])


def _preprocess_sentence(sentence):
    """Run one sentence through the same cleaning steps used on the full text."""
    cleaned = _normalize_whitespace(clean_punct(clean_lower(sentence)))
    tokens = clean_stopwords(tokenize_text(cleaned))
    return ' '.join(tokens) if isinstance(tokens, list) else ''


# kalimat_preprocessing[i] is always the cleaned form of kalimat[i].
kalimat_preprocessing = [_preprocess_sentence(sentence) for sentence in kalimat]

## TF-IDF dan cosine similarity

Menghitung TF-IDF

In [None]:
# Fit TF-IDF with every preprocessed sentence treated as one document.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(kalimat_preprocessing)
terms = tfidf_vectorizer.get_feature_names_out()

# Dense table: one row per sentence, one column per vocabulary term.
tfidf_preprocessing = pd.DataFrame(tfidf_matrix.toarray(), columns=terms)
tfidf_preprocessing

Menghitung cosine similarity

In [None]:
# Pairwise cosine similarity between all sentence vectors.
cossim_prep = cosine_similarity(tfidf_preprocessing, tfidf_preprocessing)

# Label rows and columns with the sentence position.
sentence_ids = range(len(kalimat_preprocessing))
similarity_matrix = pd.DataFrame(cossim_prep, index=sentence_ids, columns=sentence_ids)

similarity_matrix

## Graf dan analisis centrality

Membangun graf berdasarkan similarity dan memvisualisasikan graf

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Two sentences are linked when their cosine similarity exceeds this value.
SIMILARITY_THRESHOLD = 0.1

# One node per sentence; node id == row/column index in the similarity matrix.
n_sentences = len(cossim_prep)
G_preprocessing = nx.DiGraph()
G_preprocessing.add_nodes_from(range(n_sentences))

# Connect every ordered pair of distinct sentences that is similar enough.
G_preprocessing.add_edges_from(
    (i, j)
    for i in range(n_sentences)
    for j in range(n_sentences)
    if i != j and cossim_prep[i][j] > SIMILARITY_THRESHOLD
)

# A fixed seed makes the force-directed layout identical on every run.
pos = nx.spring_layout(G_preprocessing, k=2, seed=42)
nx.draw_networkx_nodes(G_preprocessing, pos, node_size=500, node_color='b')
nx.draw_networkx_edges(G_preprocessing, pos, edge_color='red', arrows=True)
nx.draw_networkx_labels(G_preprocessing, pos)

plt.show()

Menghitung closeness centrality

In [None]:
# Closeness centrality of every sentence node.
closeness_preprocessing = nx.closeness_centrality(G_preprocessing)

# (node, score) pairs ordered from the most to the least central sentence.
sorted_closeness_preprocessing = sorted(
    closeness_preprocessing.items(),
    key=lambda item: item[1],
    reverse=True,
)

print("Closeness Centrality:")
for node, closeness in sorted_closeness_preprocessing:
    print(f"Node {node}: {closeness:.4f}")

Menampilkan ringkasan berdasarkan centrality

In [None]:
# Build the summary from the three most central sentences.
ringkasan_closeness_preprocessing = ""
print("Tiga Node Tertinggi Closeness Centrality Menggunakan Preprocessing:")
# The loop variable has its own name so that the `closeness_preprocessing`
# dict computed earlier is not overwritten by a single float.
for node, closeness_score in sorted_closeness_preprocessing[:3]:
    # Node ids index into `kalimat`; skip any id without a matching sentence
    # instead of raising IndexError.
    if node >= len(kalimat):
        print(f"Node {node}: kalimat asli tidak ditemukan, dilewati.\n")
        continue
    top_sentence = kalimat[node]
    ringkasan_closeness_preprocessing += top_sentence + " "
    print(f"Node {node}: Closeness Centrality = {closeness_score:.4f}")
    print(f"Kalimat: {top_sentence}\n")