In [48]:
import numpy as np
import pandas as pd
import random

import datasets
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from tqdm import tqdm_notebook
import torch

import matplotlib.pyplot as plt
import seaborn as sns

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,classification_report
from sklearn.preprocessing import LabelEncoder

# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing when the model is moved to
# a CUDA device that does not exist.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Load the raw CSV. The file has no header row (header=None), so the three
# columns are named explicitly: "kelas" (class), "title" and "review".
df = pd.read_csv("Dataset Pertama_train.csv", header=None, names=["kelas", "title", "review"])

In [3]:
# Only the class and the review body are used from here on; discard the title.
df = df.drop(columns=['title'])

## Preprocessing

In [4]:
# Lower-case every review with the vectorised string accessor.
df['review'] = df['review'].str.lower()

In [5]:
# Rename to the column names the later cells rely on ('label' and 'text').
df = df.rename(columns={'kelas': 'label', 'review': 'text'})

In [6]:
# Shift the class ids down by one so they start at 0.
# NOTE: not idempotent - re-running this cell on its own shifts the labels again.
df['label'] = df['label'].sub(1)

## Data Splitting

In [7]:
# Stratified 60/40 split first; the 40% hold-out is then halved into
# validation and test, giving 60/20/20 overall with class ratios preserved.
train, val = train_test_split(
    df,
    test_size=0.4,
    shuffle=True,
    stratify=df['label'],
    random_state=42,
)
val, test = train_test_split(
    val,
    test_size=0.5,
    shuffle=True,
    stratify=val['label'],
    random_state=42,
)

In [8]:
# Take a stratified 1% sample of the training data; the other 99% is discarded.
# NOTE: `train` is overwritten from itself, so re-running this cell alone
# samples 1% of the already-sampled frame.
train, _ = train_test_split(
    train,
    train_size=0.01,
    shuffle=True,
    stratify=train['label'],
    random_state=42,
)

In [9]:
# Take a stratified 1% sample of the validation data; the other 99% is discarded.
# NOTE: `val` is overwritten from itself, so re-running this cell alone
# samples 1% of the already-sampled frame.
val, _ = train_test_split(
    val,
    train_size=0.01,
    shuffle=True,
    stratify=val['label'],
    random_state=42,
)

In [10]:
# Report the row count of each split (train / validation / test).
split_summaries = [
    f'Volume data latih: {train.shape[0]} baris\n',
    f'Volume data validasi: {val.shape[0]} baris\n',
    f'Volume data uji: {test.shape[0]} baris\n',
]
for summary_line in split_summaries:
    print(summary_line)

Volume data latih: 21600 baris

Volume data validasi: 7200 baris

Volume data uji: 720000 baris



In [11]:
# Wrap the three pandas splits in a DatasetDict. reset_index() moves each
# frame's row index into a regular 'index' column of the resulting dataset.
data = datasets.DatasetDict({
    split_name: datasets.Dataset.from_pandas(frame.reset_index())
    for split_name, frame in (
        ('train', train),
        ('validation', val),
        ('test', test),
    )
})

In [12]:
# Display the DatasetDict summary (splits, columns and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['index', 'label', 'text'],
        num_rows: 21600
    })
    validation: Dataset({
        features: ['index', 'label', 'text'],
        num_rows: 7200
    })
    test: Dataset({
        features: ['index', 'label', 'text'],
        num_rows: 720000
    })
})

## Tokenize

In [13]:
# Load the tokenizer for the DistilBERT checkpoint; the model cell loads the
# same checkpoint string, so the two must be kept in sync.
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

In [14]:
# Sanity check: tokenize the first training review and show the tokens that
# its input ids map back to (including special tokens such as [CLS]).
sample_text = train['text'].iloc[0]

token = tokenizer(sample_text)
input_ids = token['input_ids']
tokenizer.convert_ids_to_tokens(input_ids)

['[CLS]',
 'i',
 'love',
 'this',
 'cd',
 '.',
 'i',
 'saw',
 'them',
 'in',
 'person',
 'and',
 'it',
 'was',
 'an',
 'awesome',
 'ex',
 '##per',
 '##ien',
 '##c',
 '.',
 'i',
 'was',
 'never',
 'so',
 'happy',
 'to',
 'be',
 'crying',
 'in',
 'my',
 'life',
 '.',
 'when',
 'i',
 'listen',
 'to',
 'this',
 'cd',
 'i',
 'feel',
 'that',
 'i',
 'have',
 'my',
 'dear',
 'departed',
 'irish',
 'catholic',
 'grandmother',
 'on',
 'one',
 'shoulder',
 'and',
 'my',
 'darling',
 'late',
 'mother',
 'on',
 'the',
 'other',
 '.',
 'the',
 'music',
 'tugs',
 'at',
 'your',
 'heart',
 '.',
 'each',
 'tenor',
 'has',
 'his',
 'own',
 'unique',
 'style',
 '.',
 'you',
 'feel',
 'like',
 'you',
 'should',
 'be',
 'in',
 'a',
 'pub',
 ',',
 'drinking',
 'a',
 'beer',
 'when',
 'listening',
 'to',
 'the',
 'home',
 '##y',
 'sound',
 'of',
 'john',
 'mc',
 '##dermott',
 '.',
 'anthony',
 'ke',
 '##arns',
 'breaks',
 'your',
 'heart',
 'when',
 'he',
 'sings',
 'grace',
 '.',
 'ronan',
 'ty',
 '##nan',

In [15]:
def apply_tokenizer(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        batch: Mapping with a ``'text'`` entry. It holds a list of strings
            when called through ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), truncated
        to the tokenizer's maximum length and left unpadded.
    """
    # Padding is left off on purpose. The previous per-example call made
    # padding=True a no-op (a single sequence is always the "longest"), so the
    # stored encodings were already unpadded. The Trainer is given the
    # tokenizer, so padding happens per mini-batch at collation time.
    return tokenizer(batch['text'], padding=False, truncation=True)


# batched=True passes many rows to the tokenizer per call instead of one row
# at a time, which is much faster on the large test split.
data_encoded = data.map(apply_tokenizer, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████| 21600/21600 [00:09<00:00, 2162.14 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 7200/7200 [00:03<00:00, 2158.61 examples/s]
Map: 100%|████████████████████████████████████████████████████████████| 720000/720000 [05:27<00:00, 2199.87 examples/s]


## Modeling

In [29]:
import os

from huggingface_hub import login

# Never hard-code the access token in the notebook (it ends up in version
# control and shared copies). Read it from the HF_TOKEN environment variable;
# when that is unset, token=None lets login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [30]:
# Two-class sequence classifier on top of the DistilBERT checkpoint,
# moved to the selected device.
num_label = 2
model_ckpt = 'distilbert/distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_label,
)
model = model.to(device)

In [31]:
def compute_metrics(pred):
    """Compute macro-F1 and accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with keys ``'f1'`` (macro-averaged F1) and ``'accuracy'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'f1': f1_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

In [33]:
# --- Hyperparameters --------------------------------------------------------
model_name = 'distilbert-base-uncased-sentiment-analysis'
batch_size = 16
num_epochs = 5
learning_rate = 2e-5
weight_decay = 0.01
early_stopping_patience = 2
log_level = 'error'
# Number of optimisation steps between training-loss log entries: the number
# of full batches in the training split.
logging_steps = len(data['train']) // batch_size

# --- Training configuration -------------------------------------------------
# Evaluation and checkpointing both happen once per epoch; the best checkpoint
# is reloaded at the end and the model is pushed to the Hub.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=logging_steps,
    log_level=log_level,
    disable_tqdm=False,
    push_to_hub=True,
)

# --- Trainer ----------------------------------------------------------------
# Training stops early once the monitored metric has failed to improve for
# `early_stopping_patience` consecutive evaluations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_encoded['train'],
    eval_dataset=data_encoded['validation'],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
)

In [34]:
# Display the tokenized DatasetDict to confirm the added columns.
data_encoded

DatasetDict({
    train: Dataset({
        features: ['index', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 21600
    })
    validation: Dataset({
        features: ['index', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 7200
    })
    test: Dataset({
        features: ['index', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 720000
    })
})

In [35]:
# Fine-tune the model; the per-epoch table below comes from compute_metrics.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2638,0.216264,0.924827,0.924861
2,0.1539,0.230931,0.926512,0.926528
3,0.0849,0.303791,0.92021,0.920278


TrainOutput(global_step=4050, training_loss=0.167523843270761, metrics={'train_runtime': 4900.8328, 'train_samples_per_second': 22.037, 'train_steps_per_second': 1.377, 'total_flos': 3383465738046720.0, 'train_loss': 0.167523843270761, 'epoch': 3.0})

In [36]:
# Upload the trained model to the Hugging Face Hub with a commit message.
trainer.push_to_hub(commit_message="done")

CommitInfo(commit_url='https://huggingface.co/hanifimaduddin/distilbert-base-uncased-sentiment-analysis/commit/e70fdfd56831c1ad33199ce81680e388d0485d3e', commit_message='done', commit_description='', oid='e70fdfd56831c1ad33199ce81680e388d0485d3e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hanifimaduddin/distilbert-base-uncased-sentiment-analysis', endpoint='https://huggingface.co', repo_type='model', repo_id='hanifimaduddin/distilbert-base-uncased-sentiment-analysis'), pr_revision=None, pr_num=None)

In [39]:
# Run the trained model over the tokenized test split.
pred = trainer.predict(data_encoded['test'])

In [41]:
# Predicted class id per test row: index of the largest score on the last axis.
pred_labeled = pred.predictions.argmax(-1)

In [44]:
# Attach the predictions to the test frame (row order matches the test split).
# NOTE(review): the column name is misspelled ('predcition'); the
# classification_report cell reads the same spelling, so rename both together.
test['predcition'] = pred_labeled

In [45]:
# Display the test frame with the new prediction column.
test

Unnamed: 0,label,text,predcition
992216,0,"seriously.... can we please get some kind of ""...",0
1382511,0,this book was a great dissapointment. i read a...,0
1793586,0,"i loved this rice cooker, and thought it was a...",0
2484056,1,for those looking for a dvd that will last a l...,1
313689,1,first battery tender i've purchased. used to r...,1
...,...,...,...
2056330,1,please check the color of the shipping box the...,0
2458901,1,i'm ordering my 2nd copy! this is no longer a ...,1
3216830,0,"admittedly, i have not started law school yet,...",0
2404814,0,"well, been waiting...when is the dvd version b...",0


In [49]:
# Per-class precision / recall / F1 on the test split. The report is a
# pre-formatted multi-line string, so print it (a bare expression would show
# the escaped repr) - otherwise the notebook never displays the test metrics.
report = classification_report(test['label'],test['predcition'])
print(report)

In [53]:
# Reload the untouched CSV: `df` was modified earlier (title dropped, text
# lower-cased, labels shifted), and the export below uses the original values.
df_ = pd.read_csv("Dataset Pertama_train.csv", header=None, names=["kelas", "title", "review"])

In [58]:
# Select the original (unprocessed) rows that make up the test split.
# `test.index` holds index *labels* inherited from the loaded frame, so use
# label-based .loc; positional .iloc only matched because the frame has a
# default 0..n-1 index. .copy() makes the result safe to modify.
test_df = df_.loc[test.index].copy()

In [61]:
# Add 1 to undo the earlier label shift, so predictions use the same class
# ids as the original "kelas" column.
test_df['prediksi'] = pred_labeled+1

In [62]:
# Display the original test rows next to their predicted class.
test_df

Unnamed: 0,kelas,title,review,prediksi
992216,1,"Can we please get ""propaganda"" included in the...","Seriously.... Can we please get some kind of ""...",1
1382511,1,...rent a movie instead... minus 5 stars,This book was a great dissapointment. I read a...,1
1793586,1,Stopped working after 2 years,"I loved this rice cooker, and thought it was a...",1
2484056,2,Verbatim Ultra-life DVD's,For those looking for a DVD that will last a l...,2
313689,2,First Tender Purchased,First battery tender I've purchased. Used to r...,2
...,...,...,...,...
2056330,2,Check the color of the box,Please check the color of the shipping box the...,1
2458901,2,"FINALLY, SOMEONE IS TALKING!",I'm ordering my 2nd copy! This is no longer a ...,2
3216830,1,"Good, If you plan on going to Harvard","Admittedly, I have not started law school yet,...",1
2404814,1,Where is he DVD version ?,"Well, been waiting...when is the DVD version b...",1


In [69]:
# Move the original row ids out of the index into an 'index' column so they
# survive the CSV export (which is written with index=False).
# NOTE: not idempotent - re-running this cell adds another index column.
test_df = test_df.reset_index()

In [73]:
# Write the test rows and predictions to CSV, without the positional index.
test_df.to_csv('hasil_prediksi_distilbert.csv',index=False)

In [74]:
# Read the exported file back to check that it was written as expected.
nyoba = pd.read_csv('hasil_prediksi_distilbert.csv')

In [75]:
# Display the re-loaded export for a visual check.
nyoba

Unnamed: 0,index,kelas,title,review,prediksi
0,992216,1,"Can we please get ""propaganda"" included in the...","Seriously.... Can we please get some kind of ""...",1
1,1382511,1,...rent a movie instead... minus 5 stars,This book was a great dissapointment. I read a...,1
2,1793586,1,Stopped working after 2 years,"I loved this rice cooker, and thought it was a...",1
3,2484056,2,Verbatim Ultra-life DVD's,For those looking for a DVD that will last a l...,2
4,313689,2,First Tender Purchased,First battery tender I've purchased. Used to r...,2
...,...,...,...,...,...
719995,2056330,2,Check the color of the box,Please check the color of the shipping box the...,1
719996,2458901,2,"FINALLY, SOMEONE IS TALKING!",I'm ordering my 2nd copy! This is no longer a ...,2
719997,3216830,1,"Good, If you plan on going to Harvard","Admittedly, I have not started law school yet,...",1
719998,2404814,1,Where is he DVD version ?,"Well, been waiting...when is the DVD version b...",1
