<a href="https://colab.research.google.com/github/Muhammadridho100902/google_collab/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Make /content/kaggle.json readable/writable by the owner only
# (presumably the Kaggle API token file - confirm).
! chmod 600 /content/kaggle.json
# Download the dataset archive with the kaggle CLI, pointing its config directory at /content/.
! KAGGLE_CONFIG_DIR=/content/ kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

twitter-entity-sentiment-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
import zipfile
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import pickle

In [3]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive even if extractall() raises.
with zipfile.ZipFile('twitter-entity-sentiment-analysis.zip') as dataset_zip:
  dataset_zip.extractall()

In [4]:
def getOneHotEncode(label):
  """Turn a sentiment label into a 3-element one-hot list.

  The label is lower-cased first so that e.g. "Positive" and "positive"
  are treated the same.

  Returns:
    [1, 0, 0] for positive, [0, 1, 0] for negative and [0, 0, 1] for any
    other label. A new list is returned on every call.
  """
  hot_position = {'positive': 0, 'negative': 1}.get(label.lower(), 2)
  return [int(position == hot_position) for position in range(3)]

In [5]:
# Fetch the NLTK stopword corpus read by remove_stopwords().
nltk.download('stopwords')
# NOTE(review): the returned stopword list is discarded here; this line only
# exercises the corpus and does not affect the cell's result.
stopwords.words('english')
# Fetch the WordNet data (presumably required by WordNetLemmatizer - confirm).
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
def remove_url(text):
  """Delete URL-like substrings from `text`.

  Every run of non-whitespace characters that begins with the lowercase
  letters "http" is removed, which covers both http:// and https:// links.
  Surrounding whitespace is left untouched.
  Example: "http://google.com is a site" -> " is a site".
  """
  url_pattern = re.compile(r'http\S+')
  return url_pattern.sub('', text)

def remove_number(text):
  """Delete every run of digits from `text`.

  Only the digits are removed; surrounding characters (including spaces)
  are kept. Example: "123 I am here" -> " I am here".
  """
  digit_run = re.compile(r'\d+')
  return digit_run.sub('', text)

def remove_stopwords(text):
  """Drop English stopwords from `text`.

  The text is split on whitespace and every word found in NLTK's English
  stopword list is discarded. The comparison is exact (case-sensitive), so
  callers are expected to lower-case the text first.

  Returns:
    The remaining words, each followed by a single space (so a non-empty
    result ends with a trailing space); "" when nothing remains.
  """
  # Build the stopword set once per call. Previously the corpus list was
  # re-read for every single word and searched linearly.
  english_stopwords = set(stopwords.words('english'))
  kept_words = [word for word in text.split() if word not in english_stopwords]
  # Keep the "word + space" output format so results stay unchanged.
  return "".join(word + " " for word in kept_words)

def lematized_text(text):
  """Lemmatize each whitespace-separated word of `text` with WordNetLemmatizer.

  Returns:
    The lemmatized words, each followed by a single space (a non-empty
    result therefore ends with a trailing space).
  """
  lemmatize = WordNetLemmatizer().lemmatize
  return "".join(lemmatize(token) + " " for token in text.split())

def stemmed_text(text):
  """Stem each whitespace-separated word of `text` with PorterStemmer.

  Returns:
    The stemmed words, each followed by a single space (a non-empty
    result therefore ends with a trailing space).
  """
  stem = PorterStemmer().stem
  return "".join(stem(token) + " " for token in text.split())

def preprocessing(text):
  """Run the full text-cleaning pipeline on `text`.

  Steps, in order: lower-case (so "Running", "RUNNING" and "running" are
  treated the same), remove URLs, remove digits, remove English stopwords,
  lemmatize, then stem.

  Returns:
    The cleaned text as produced by the last step (stemmed_text).
  """
  pipeline = (
    remove_url,
    remove_number,
    remove_stopwords,
    lematized_text,
    stemmed_text,
  )
  cleaned = text.lower()
  for step in pipeline:
    cleaned = step(cleaned)
  return cleaned

In [7]:
text = []   # 4th comma-separated field of every line (start of the tweet text)
label = []  # one-hot label built from the 3rd field

# Read the CSV from the working directory - that is where extractall() unpacked
# it - instead of a hard-coded absolute path, and decode it explicitly as UTF-8
# so the result does not depend on the platform's default encoding.
#
# NOTE(review): splitting on ',' cuts every tweet at its first comma and keeps
# the CSV quote characters (visible in the text[7] sample output). csv.reader
# would parse the rows properly, but that changes the text lengths and thus the
# sequence length derived in tokenizing(); update any code that assumes the
# current length at the same time.
# NOTE(review): these raw texts are not passed through preprocessing(), while
# the text given to predict later is - confirm whether that is intended.
with open('twitter_training.csv', 'r', encoding='utf-8') as file:
  for sentence in file:
    values = sentence.split(',')
    text.append(values[3])
    label.append(getOneHotEncode(values[2]))

In [8]:
# Spot-check one sample: its text on the first line, its one-hot label on the next.
print(text[7], label[7], sep='\n')

"So I spent a couple of hours doing something for fun... If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters
[1, 0, 0]


# Tokenizing

Tokenizing adalah proses mengubah teks/kata menjadi token (bilangan bulat) agar dapat diproses oleh model machine learning.

In [9]:
def get_avg(text_list, count_words=False):
  """Return the average length of the strings in `text_list`.

  Args:
    text_list: sequence of strings.
    count_words: when False (default, the original behavior) a text's length
      is its number of characters, len(text). When True, the length is the
      number of whitespace-separated words instead.

  Returns:
    The mean length as a float, or 0.0 for an empty `text_list` (previously
    an empty list raised ZeroDivisionError).
  """
  if len(text_list) == 0:
    return 0.0
  if count_words:
    lengths = (len(text.split()) for text in text_list)
  else:
    lengths = (len(text) for text in text_list)
  # Uses the builtin sum(); the old local variable named `sum` shadowed it.
  return sum(lengths) / len(text_list)

In [10]:
def tokenizing(text_list, vocab_size=10000, tokenizer_path='my_tokenizer.pickle'):
  """Fit a Keras Tokenizer on `text_list` and return padded integer sequences.

  Side effects: prints the chosen sequence length and pickles the fitted
  tokenizer to `tokenizer_path` so it can be reused at prediction time.

  Args:
    text_list: list of strings to fit on and convert.
    vocab_size: value passed as Tokenizer(num_words=...). Words outside the
      kept vocabulary are mapped to the '<OOV>' token instead of being
      dropped. Example with a tiny vocabulary: in "aku adalah budi" the most
      frequent words keep their own index and the rest become <OOV>.
    tokenizer_path: file the fitted tokenizer is pickled to.

  Returns:
    2-D array of shape (len(text_list), max_length) where max_length is
    int(get_avg(text_list)).
  """
  tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
  tokenizer.fit_on_texts(text_list)

  # NOTE: with its default arguments get_avg() averages the number of
  # *characters* per text, so this is a character-based estimate used as the
  # number of tokens per sequence.
  max_length = int(get_avg(text_list))
  print(max_length)

  sequences = tokenizer.texts_to_sequences(text_list)

  # Every text has a different number of tokens; pad_sequences makes them all
  # exactly max_length long:
  #   - padding='pre': shorter sequences get zeros added at the FRONT,
  #     e.g. maxlen 4: [1, 2, 3] -> [0, 1, 2, 3]
  #   - truncating='pre': longer sequences lose tokens from the FRONT,
  #     e.g. maxlen 4: [1, 2, 3, 4, 5] -> [2, 3, 4, 5]
  padded_squence = pad_sequences(sequences, padding='pre', truncating='pre', maxlen=max_length)

  # Export the fitted tokenizer for later reuse.
  with open(tokenizer_path, 'wb') as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)

  return padded_squence

In [11]:
# Tokenize and pad the training texts. tokenizing() also prints the sequence
# length it chose and writes the fitted tokenizer to disk.
datas = tokenizing(text)
# Display the shape of the padded array: (number of texts, sequence length).
datas.shape

83


(74682, 83)

In [12]:
labels = np.array(label) # convert the list of one-hot labels to a NumPy array first

In [13]:
from tensorflow.keras.layers import Embedding

model = tf.keras.Sequential([
    # Token ids -> 100-dimensional vectors. The sequence length is taken from
    # the padded training data (datas.shape[1]) instead of a hard-coded number,
    # so it stays correct if the tokenizer's max_length changes.
    tf.keras.layers.Embedding(10001, output_dim=100, input_length=datas.shape[1]),

    tf.keras.layers.LSTM(100),

    # No input_shape here: this is not the first layer, so its input shape is
    # inferred from the LSTM output (the old (None, 75) value matched nothing).
    tf.keras.layers.Dense(100, activation='relu'),

    # Three outputs, in the order produced by getOneHotEncode:
    # positive, negative, everything else.
    tf.keras.layers.Dense(3, activation='softmax')
])

In [14]:
# Configure training: Adam optimizer, categorical cross-entropy loss for the
# one-hot labels, and accuracy reported under the name 'acc'.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [15]:
# Train for 3 epochs on the padded sequences and their one-hot labels.
# The returned History object is the cell's displayed value.
model.fit(x=datas, y=labels, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7a3a9fadb8e0>

In [16]:
# Persist the trained model to my_model.h5 in the working directory.
# NOTE(review): the cell output shows a message raised from saving_api.save_model -
# presumably the legacy-HDF5 format warning; confirm before changing the format.
model.save('my_model.h5')

  saving_api.save_model(


In [17]:
import tensorflow as tf
import pickle

from tensorflow.keras.models import load_model

In [49]:
# Load the tokenizer from the same relative path tokenizing() wrote it to,
# rather than a hard-coded absolute /content/ path.
# NOTE: pickle.load can execute arbitrary code - only load files this
# notebook produced itself.
with open('my_tokenizer.pickle', 'rb') as file:
  tokenizer = pickle.load(file)

In [None]:
# Preview the first 10 (word, index) pairs instead of dumping the whole vocabulary.
list(tokenizer.word_index.items())[:10]

In [50]:
# Reload the saved model from my_model.h5; this rebinds `model`, replacing the in-memory trained instance.
model = load_model('my_model.h5')

In [51]:
# Fetch the NLTK stopword corpus read by remove_stopwords().
nltk.download('stopwords')
# NOTE(review): the returned stopword list is discarded here; this line only
# exercises the corpus.
stopwords.words('english')
# Fetch the WordNet data (presumably required by WordNetLemmatizer - confirm).
nltk.download('wordnet')

def remove_url(text):
  """Delete URL-like substrings from `text`.

  Every run of non-whitespace characters that begins with the lowercase
  letters "http" is removed, which covers both http:// and https:// links.
  Surrounding whitespace is left untouched.
  Example: "http://google.com is a site" -> " is a site".
  """
  url_pattern = re.compile(r'http\S+')
  return url_pattern.sub('', text)

def remove_number(text):
  """Delete every run of digits from `text`.

  Only the digits are removed; surrounding characters (including spaces)
  are kept. Example: "123 I am here" -> " I am here".
  """
  digit_run = re.compile(r'\d+')
  return digit_run.sub('', text)

def remove_stopwords(text):
  """Drop English stopwords from `text`.

  The text is split on whitespace and every word found in NLTK's English
  stopword list is discarded. The comparison is exact (case-sensitive), so
  callers are expected to lower-case the text first.

  Returns:
    The remaining words, each followed by a single space (so a non-empty
    result ends with a trailing space); "" when nothing remains.
  """
  # Build the stopword set once per call. Previously the corpus list was
  # re-read for every single word and searched linearly.
  english_stopwords = set(stopwords.words('english'))
  kept_words = [word for word in text.split() if word not in english_stopwords]
  # Keep the "word + space" output format so results stay unchanged.
  return "".join(word + " " for word in kept_words)

def lematized_text(text):
  """Lemmatize each whitespace-separated word of `text` with WordNetLemmatizer.

  Returns:
    The lemmatized words, each followed by a single space (a non-empty
    result therefore ends with a trailing space).
  """
  lemmatize = WordNetLemmatizer().lemmatize
  return "".join(lemmatize(token) + " " for token in text.split())

def stemmed_text(text):
  """Stem each whitespace-separated word of `text` with PorterStemmer.

  Returns:
    The stemmed words, each followed by a single space (a non-empty
    result therefore ends with a trailing space).
  """
  stem = PorterStemmer().stem
  return "".join(stem(token) + " " for token in text.split())

def preprocessing(text):
  """Run the full text-cleaning pipeline on `text`.

  Steps, in order: lower-case (so "Running", "RUNNING" and "running" are
  treated the same), remove URLs, remove digits, remove English stopwords,
  lemmatize, then stem.

  Returns:
    The cleaned text as produced by the last step (stemmed_text).
  """
  pipeline = (
    remove_url,
    remove_number,
    remove_stopwords,
    lematized_text,
    stemmed_text,
  )
  cleaned = text.lower()
  for step in pipeline:
    cleaned = step(cleaned)
  return cleaned

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [59]:
# Ask the user for the sentence to classify.
# NOTE(review): this rebinds `text`, which earlier held the list of training texts.
text = input('Masukan text yang mau di predict : ')

Masukan text yang mau di predict : i am very disappointed about this movie, very bad


In [60]:
# Clean the user input with the preprocessing() pipeline (lowercase, URL/digit/stopword removal, lemmatize, stem).
preprocessed_text = preprocessing(text)

In [61]:
# Show the raw input and its cleaned form on consecutive lines.
print(text, preprocessed_text, sep='\n')

i am very disappointed about this movie, very bad
disappoint movie, bad 


In [62]:
# Convert the cleaned text into token ids with the tokenizer loaded from the pickle.
# The text is wrapped in a list, so the result is a list holding one sequence.
sequence = tokenizer.texts_to_sequences([preprocessed_text])
print(sequence)

[[3126, 641, 149]]


In [65]:
# Pad/truncate at the front, as in tokenizing(). The length comes from the
# loaded model's input shape instead of a hard-coded number, so it always
# matches what the model was trained with.
padded_sequences = pad_sequences(sequence, padding = 'pre', truncating= 'pre', maxlen = model.input_shape[1])

In [66]:
# Inspect the padded input: zeros at the front, the token ids at the end.
print(padded_sequences)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0 3126  641  149]]


In [67]:
# Softmax scores for the input, in the label order defined by getOneHotEncode:
# index 0 = positive, index 1 = negative, index 2 = any other label.
model.predict(padded_sequences)



array([[0.02630019, 0.954824  , 0.01887592]], dtype=float32)