### Import Library yang akan digunakan

In [2]:
import pandas as pd
import re
import string
import nltk
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,ConfusionMatrixDisplay

In [3]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Memuat Data Tweet

In [4]:
link = "https://raw.githubusercontent.com/HaikalAkbar13/dicoding_sentimen_analysis/refs/heads/main/tweets_data.csv"
df = pd.read_csv(link)
df.columns

Index(['Tweet ID', 'Tweet Time Stamp', 'Tweet Text', 'Tweet Hashtag',
       'Tweet Translated'],
      dtype='object')

### Assesing dan Cleaning Data

In [5]:
# Drop kolom yang tidak di butuhkan (Timestamp, Tweet Text, Hashtag)
df = df.drop(columns=['Tweet Time Stamp', 'Tweet Text', 'Tweet Hashtag'])
df.head()

Unnamed: 0,Tweet ID,Tweet Translated
0,1609172529712611329,The Embark Internship has allowed our interns ...
1,1609243867504590850,asked ChatGPT: What jobs will be needed for hu...
2,1609261573184581635,"Thanks to ChatGPT, I just learned a new skill!..."
3,1609043258625216512,"Maybe AI won't be taking our jobs... at least,..."
4,1608679261136912384,"Well, @openai #ChatGPT completely shit the bed..."


In [6]:
# Duplicate Value
print(f"Jumlah Duplicated Value sebelum di drop \n{df.duplicated().value_counts()}")
df = df.drop_duplicates()
print(f"Jumlah Duplicated Value setelah di drop \n{df.duplicated().value_counts()}")

Jumlah Duplicated Value sebelum di drop 
False    14073
True        33
Name: count, dtype: int64
Jumlah Duplicated Value setelah di drop 
False    14073
Name: count, dtype: int64


In [7]:
# Null Values
for column in df.columns:
    print(f"Jumlah Null Values pada kolom {column} : {df[column].isna().value_counts()}")

Jumlah Null Values pada kolom Tweet ID : Tweet ID
False    14073
Name: count, dtype: int64
Jumlah Null Values pada kolom Tweet Translated : Tweet Translated
False    14073
Name: count, dtype: int64


### Pre Processing Data

In [8]:
# Membuat Fungsi untuk Pre processing Data
def preprocessing(text):
    # Inisialisasi
    punctuation_set = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()
    text = text.lower()
    # Strip whitespaces
    text = text.strip()
    # Hapus URL, mentions, hashtag, newline, angka, special characters
    text = re.sub(r"http\S+|www\.\S+|@\w+|#\w+|[^\d\w\S]|\d+|\W|\0|_+|[^\x00-\x7F]+", " ", text)
    # Hapus punctuation
    text = ''.join(char for char in text if char not in punctuation_set)
    # Tokenisasi dan hapus stopwords
    tokens = word_tokenize(text)
    filtered = [word for word in tokens if word not in stop_words]
    # Lemmatization
    text = [lemma.lemmatize(word) for word in filtered]
    text = " ".join(word for word in text)
    return text

In [9]:
df["Text Cleaned"] = df["Tweet Translated"].apply(preprocessing)
df.head(30)

Unnamed: 0,Tweet ID,Tweet Translated,Text Cleaned
0,1609172529712611329,The Embark Internship has allowed our interns ...,embark internship allowed intern grow professi...
1,1609243867504590850,asked ChatGPT: What jobs will be needed for hu...,asked chatgpt job needed human perform artific...
2,1609261573184581635,"Thanks to ChatGPT, I just learned a new skill!...",thanks chatgpt learned new skill
3,1609043258625216512,"Maybe AI won't be taking our jobs... at least,...",maybe ai taking job least yet
4,1608679261136912384,"Well, @openai #ChatGPT completely shit the bed...",well completely shit bed one pharmacologist bi...
5,1608963021824561154,“AI Rewrite: Steve Jobs Stanford Commencement”...,ai rewrite steve job stanford commencement pet...
6,1609304139116302339,🧵 How to replace bullshit jobs with AI 🧵 \n\n2...,replace bullshit job ai year never seen risk o...
7,1609301436428423174,Four Ways #Jobs Will Respond to #Automation \n...,four way respond via
8,1608996363123007489,BGSU program to meet critical workforce needs ...,bgsu program meet critical workforce need adva...
9,1608952318787026947,New books from MIT experts deliver insights on...,new book mit expert deliver insight future rea...


### Labeling Data

In [10]:
# Labeling data menggunakan Model dari Hugging Face
label = ["Positive", "Neutral", "Negative"]
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
cls = pipeline("sentiment-analysis",
                      model=model_name, tokenizer=model_name)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [11]:
def labelling(text):
    model_predict = cls(text)
    return model_predict[0]["label"]

In [12]:
df["Sentiment"] = df["Text Cleaned"].apply(labelling)
df.head()

Unnamed: 0,Tweet ID,Tweet Translated,Text Cleaned,Sentiment
0,1609172529712611329,The Embark Internship has allowed our interns ...,embark internship allowed intern grow professi...,positive
1,1609243867504590850,asked ChatGPT: What jobs will be needed for hu...,asked chatgpt job needed human perform artific...,neutral
2,1609261573184581635,"Thanks to ChatGPT, I just learned a new skill!...",thanks chatgpt learned new skill,positive
3,1609043258625216512,"Maybe AI won't be taking our jobs... at least,...",maybe ai taking job least yet,neutral
4,1608679261136912384,"Well, @openai #ChatGPT completely shit the bed...",well completely shit bed one pharmacologist bi...,negative


### Membangun Model

- Model : SVM
- Metode Ekstraksi Fitur : Bag of Word
- Pembagian Data : 70/30

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

bow = CountVectorizer()

X = bow.fit_transform(df["Text Cleaned"]).toarray()
y = df["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
def model_eval(model, X_test, y_test):
    hasil = model.predict(X_test)
    hasil_skor = {
        "Akurasi" : accuracy_score(y_test, hasil),
        "Presisi" : precision_score(y_test, hasil, average="macro"),
        "Recall" : recall_score(y_test, hasil, average="macro"),
        "F1 Score" : f1_score(y_test, hasil, average="macro")
    }
    return hasil_skor

In [15]:
# Model SVM
svm_cls = SVC()
svm_cls.fit(X_train, y_train)

In [16]:
svm_eval = {
    "Support Vector Machine": model_eval(svm_cls, X_train, y_train)
}

data_eval = pd.DataFrame({"Model":"Suppor Vector Machine", "Akurasi": svm_eval["Akurasi"], "Presisi": svm_eval["Presisi"], "Recall": svm_eval["Recall"], "F1 Score": svm_eval["Akurasi"]})
data_eval

KeyError: 'Akurasi'