In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
# Load dataset
path = r"C:\Users\Tiffany\Downloads\KULIAH\SEMESTER V\DEEP LEARNING\EKSPLORASI\LSTM VS RNN\RNN-vs-LSTM-News-Category-Classification\News_Category_Dataset_v3.json"

df = pd.read_json(path, lines=True)
df.head()

print(df[['headline', 'category']])

                                                 headline   category
0       Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS
1       American Airlines Flyer Charged, Banned For Li...  U.S. NEWS
2       23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY
3       The Funniest Tweets From Parents This Week (Se...  PARENTING
4       Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS
...                                                   ...        ...
209522  RIM CEO Thorsten Heins' 'Significant' Plans Fo...       TECH
209523  Maria Sharapova Stunned By Victoria Azarenka I...     SPORTS
209524  Giants Over Patriots, Jets Over Colts Among  M...     SPORTS
209525  Aldon Smith Arrested: 49ers Linebacker Busted ...     SPORTS
209526  Dwight Howard Rips Teammates After Magic Loss ...     SPORTS

[209527 rows x 2 columns]


In [3]:
# Preprocessing Data
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemm = WordNetLemmatizer()

def text_preprocess(text):
    text = text.lower()                            # 1. lowercase
    text = re.sub(r'http\S+', '', text)            # 2. remove url
    text = re.sub(r'[^a-z\s]', '', text)           # 3. remove punctuation & numbers
    tokens = text.split()                          # 4. tokenization
    tokens = [w for w in tokens if w not in stop_words]  # 5. remove stopwords
    tokens = [lemm.lemmatize(w) for w in tokens]   # 6. lemmatization
    return " ".join(tokens)

def csv_preprocess(df):
    # Hapus data duplikasi pada kolom headline
    df = df.drop_duplicates(subset='headline', keep='first')
        
    # Remove barus dengan null values
    df = df.dropna()
        
    # Reset index setelah penghapusan baris
    df = df.reset_index(drop=True)
        
    return df

pd.set_option('display.max_colwidth', None)
df['cleaned_headline'] = df['headline'].apply(text_preprocess)
df = csv_preprocess(df)
df[['headline', 'cleaned_headline', 'category']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tiffany\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tiffany\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,headline,cleaned_headline,category
0,Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters,million american roll sleeve omicrontargeted covid booster,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video",american airline flyer charged banned life punching flight attendant video,U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23),funniest tweet cat dog week sept,COMEDY
3,The Funniest Tweets From Parents This Week (Sept. 17-23),funniest tweet parent week sept,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer,woman called cop black birdwatcher loses lawsuit exemployer,U.S. NEWS


In [4]:
categories_to_keep = ['WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY', 'PARENTING', 'FOOD & DRINK']

# Create a mapping dictionary for encoding
category_mapping = {
    'WELLNESS': 0,
    'ENTERTAINMENT': 1,
    'TRAVEL': 2,
    'STYLE & BEAUTY': 3,
    'PARENTING': 4,
    'FOOD & DRINK': 5
}

# Fix the filtering - use .keys() method
df = df[df['category'].isin(category_mapping.keys())]

# Apply the encoding to the category column
df['category'] = df['category'].map(category_mapping)

print(df['category'].value_counts())

category
0    17870
1    17319
2     9873
3     9323
4     8723
5     6330
Name: count, dtype: int64


In [5]:
# Balance the dataset by limiting each category to the minimum count
min_samples = df['category'].value_counts().min()
print(f"Minimum samples per category: {min_samples}")

# Sample the same number of rows from each category
df_balanced = df.groupby('category', group_keys=False).apply(
    lambda x: x.sample(n=min_samples, random_state=42)
)

# Reset index and shuffle
df_balanced = df_balanced.reset_index(drop=True)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Update df with balanced dataset
df = df_balanced

print(f"\nBalanced dataset shape: {df.shape}")
print(f"\nCategory distribution after balancing:")
print(df['category'].value_counts().sort_index())

Minimum samples per category: 6330

Balanced dataset shape: (37980, 7)

Category distribution after balancing:
category
0    6330
1    6330
2    6330
3    6330
4    6330
5    6330
Name: count, dtype: int64


  df_balanced = df.groupby('category', group_keys=False).apply(


In [7]:
# Mengubah teks menjadi vector (Tokenizing)
import tensorflow as tf

pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
Tokenizer = tf.keras.preprocessing.text.Tokenizer

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_headline'])

sequences = tokenizer.texts_to_sequences(df['cleaned_headline'])

MAX_LEN = 12

X_padded = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, df['category'], test_size=0.2, random_state=42, stratify=df['category']
)


In [8]:
# Arsitektur model LSTM
import tensorflow as tf

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=2, min_lr=0.0001)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 128, input_length=15),
    tf.keras.layers.SimpleRNN(128, dropout=0.25, return_sequences=True),
    tf.keras.layers.SimpleRNN(64, dropout=0.25, return_sequences=True),
    tf.keras.layers.SimpleRNN(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax'),  # ✅ FIXED: Changed sigmoid to softmax
])

model.compile(optimizer='adam',
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

In [17]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/30
[1m475/475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.6039 - loss: 1.0794 - val_accuracy: 0.7504 - val_loss: 0.7574 - learning_rate: 0.0010
Epoch 2/30
[1m475/475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8132 - loss: 0.5894 - val_accuracy: 0.7765 - val_loss: 0.6938 - learning_rate: 0.0010
Epoch 3/30
[1m475/475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.8664 - loss: 0.4277 - val_accuracy: 0.7884 - val_loss: 0.6710 - learning_rate: 0.0010
Epoch 4/30
[1m475/475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.8943 - loss: 0.3410 - val_accuracy: 0.7848 - val_loss: 0.7426 - learning_rate: 0.0010
Epoch 5/30
[1m475/475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.9121 - loss: 0.2844 - val_accuracy: 0.7788 - val_loss: 0.7898 - learning_rate: 0.0010
Epoch 6/30
[1m475/475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [18]:
model.evaluate(X_test, y_test)

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7884 - loss: 0.6710


[0.6709970831871033, 0.7884413003921509]

In [19]:
# Get predictions (probabilities for all 6 classes)
predictions = model.predict(X_test)

# Show shape: should be (num_samples, 6)
print(f"Predictions shape: {predictions.shape}")
print(f"\nFirst 5 predictions:")
print(predictions[:5])

# For each prediction, you get 6 probabilities that sum to 1.0
print(f"\nFirst sample probabilities (sum={predictions[0].sum():.4f}):")
for i, prob in enumerate(predictions[0]):
    print(f"  Class {i}: {prob:.4f} ({prob*100:.2f}%)")

# Get the predicted class (highest probability)
predicted_classes = np.argmax(predictions, axis=1)
print(f"\nFirst 10 predicted classes: {predicted_classes[:10]}")
print(f"First 10 actual classes: {y_test.values[:10]}")

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Predictions shape: (7596, 6)

First 5 predictions:
[[0.00265196 0.01181421 0.9730752  0.00713341 0.00217453 0.0031506 ]
 [0.02821628 0.00172325 0.10447009 0.00482159 0.01237246 0.84839636]
 [0.00293026 0.00543158 0.9793553  0.00722898 0.00222596 0.00282796]
 [0.9286876  0.00131662 0.01339749 0.00424482 0.04543672 0.00691683]
 [0.6287197  0.03828933 0.13926685 0.03103549 0.11592986 0.04675887]]

First sample probabilities (sum=1.0000):
  Class 0: 0.0027 (0.27%)
  Class 1: 0.0118 (1.18%)
  Class 2: 0.9731 (97.31%)
  Class 3: 0.0071 (0.71%)
  Class 4: 0.0022 (0.22%)
  Class 5: 0.0032 (0.32%)

First 10 predicted classes: [2 5 2 0 0 2 0 4 5 0]
First 10 actual classes: [2 5 2 0 0 2 0 4 5 0]
