<a href="https://colab.research.google.com/github/MrA-png/preprosessing_dataset-abjad-isyara/blob/main/preprosessing_isyara.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

nltk.data.path.append('/root/nltk_data')

# API Key dan URL Supabase
SUPABASE_URL = "<ENTPOINT>"
SUPABASE_API_KEY = "<API-KEY>"

def fetch_data_from_supabase():
    headers = {
        "apikey": SUPABASE_API_KEY,
        "Authorization": f"Bearer {SUPABASE_API_KEY}",
    }
    response = requests.get(SUPABASE_URL, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Error fetching data: {response.status_code}, {response.text}")

def preprocess_text(text):
    try:
        # Lowercasing
        print(f"Original Text: {text}")
        text = text.lower()
        print(f"Lowercased Text: {text}")
        # Menghapus angka dan simbol
        text = re.sub(r"[^a-z\s]", "", text)
        print(f"Cleaned Text (No Symbols/Numbers): {text}")
        # Tokenisasi
        tokens = word_tokenize(text)
        print(f"Tokenized Text: {tokens}")
        # Menghapus stopwords
        stop_words = set(stopwords.words("indonesian"))
        tokens = [word for word in tokens if word not in stop_words]
        print(f"Text after Stopword Removal: {tokens}")
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        print(f"Lemmatized Text: {tokens}")
        # Gabungkan kembali menjadi teks
        return " ".join(tokens)
    except Exception as e:
        print(f"Terjadi kesalahan saat preprocessing: {e}")
        return ""

# Main program
if __name__ == "__main__":
    try:
        # Ambil data dari Supabase
        data = fetch_data_from_supabase()
        # Convert ke DataFrame
        df = pd.DataFrame(data)
        print("Original Dataset:")
        print(df.head())
        # Pastikan ada kolom teks untuk preprocessing
        if "text" in df.columns:
            # Proses preprocessing
            df["text_clean"] = df["text"].apply(lambda x: preprocess_text(x) if pd.notnull(x) else "")
            # Cetak hasil
            print("Dataset after Preprocessing:")
            print(df.head())
            # Simpan hasil ke file CSV
            df.to_csv("preprocessed_dataset.csv", index=False)
            print("Preprocessed dataset saved to 'preprocessed_dataset.csv'.")
        else:
            print("Kolom 'text' tidak ditemukan dalam dataset.")
    except Exception as e:
        print(f"Terjadi kesalahan: {e}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Original Dataset:
                                     id                        created_at  \
0  9b11781a-7cfe-4d55-87cf-34bd9a65584d  2024-12-27T12:16:49.337944+00:00   
1  58f3a70e-fa66-4a4a-b6de-4e31240c9c3e  2024-12-27T12:16:49.337944+00:00   
2  9f76a3e1-e1a5-4ffb-bb05-59154bec5f22  2024-12-27T12:16:49.337944+00:00   
3  dcb587eb-08ad-40ff-9f98-bf2f301b9807  2024-12-27T12:16:49.337944+00:00   
4  9e155280-3247-4b59-8ace-b4715a7509ac  2024-12-27T12:16:49.337944+00:00   

  text                                       path_gesture  
0    a  https://bbmgbfgcmwippuwnutkk.supabase.co/stora...  
1    b  https://bbmgbfgcmwippuwnutkk.supabase.co/stora...  
2    c  https://bbmgbfgcmwippuwnutkk.supabase.co/stora...  
3    d  https://bbmgbfgcmwippuwnutkk.supabase.co/stora...  
4    e  https://bbmgbfgcmwippuwnutkk.supabase.co/stora...  
Original Text: a
Lowercased Text: a
Cleaned Text (No Symbols/Numbers): a
Terjadi kesalahan saat preprocessing: 
***********************************************