In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Bidirectional, TimeDistributed
from nltk.translate.bleu_score import sentence_bleu


In [2]:
# Load data training
train_data = pd.read_csv('train/train.csv')
articles_train = train_data['article'].values
highlights_train = train_data['highlights'].values

# Load data validasi
val_data = pd.read_csv('validation/validation.csv')
articles_val = val_data['article'].values
highlights_val = val_data['highlights'].values


In [3]:
val_data = val_data.head(1000)
val_data.shape

(1000, 3)

In [5]:
train_data = train_data.head(10000)
train_data.shape

(10000, 3)

In [6]:
print(articles_train[0])
print(highlights_train[0])

By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained b

In [7]:
print(articles_val[0])
print(highlights_val[0])

Sally Forrest, an actress-dancer who graced the silver screen throughout the '40s and '50s in MGM musicals and films such as the 1956 noir While the City Sleeps died on March 15 at her home in Beverly Hills, California. Forrest, whose birth name was Katherine Feeney, was 86 and had long battled cancer. Her publicist, Judith Goffin, announced the news Thursday. Scroll down for video . Actress: Sally Forrest was in the 1951 Ida Lupino-directed film 'Hard, Fast and Beautiful' (left) and the 1956 Fritz Lang movie 'While the City Sleeps' A San Diego native, Forrest became a protege of Hollywood trailblazer Ida Lupino, who cast her in starring roles in films including the critical and commercial success Not Wanted, Never Fear and Hard, Fast and Beautiful. Some of Forrest's other film credits included Bannerline, Son of Sinbad, and Excuse My Dust, according to her iMDB page. The page also indicates Forrest was in multiple Climax! and Rawhide television episodes. Forrest appeared as herself in

In [36]:
# Tokenizer untuk teks artikel
article_tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
article_tokenizer.fit_on_texts(articles_train)
article_sequences_train = article_tokenizer.texts_to_sequences(articles_train)

# Tokenizer untuk teks highlights
highlight_tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
highlight_tokenizer.fit_on_texts(highlights_train)
highlight_sequences_train = highlight_tokenizer.texts_to_sequences(highlights_train)
highlight_sequences_val = highlight_tokenizer.texts_to_sequences(highlights_val)


# Print tokenizer word indices
print("Article Tokenizer Word Index:")
print(dict(list(article_tokenizer.word_index.items())[:20])) 

print("\nHighlight Tokenizer Word Index:")
print(dict(list(highlight_tokenizer.word_index.items())[:20])) 

Article Tokenizer Word Index:
{'<OOV>': 1, 'the': 2, 'to': 3, 'a': 4, 'and': 5, 'of': 6, 'in': 7, 'was': 8, 'for': 9, 'that': 10, 'on': 11, 'is': 12, 'he': 13, 'with': 14, 'said': 15, 'it': 16, 'his': 17, 'at': 18, 'as': 19, 'by': 20}

Highlight Tokenizer Word Index:
{'<OOV>': 1, 'the': 2, 'to': 3, 'in': 4, 'of': 5, 'a': 6, 'and': 7, 'for': 8, 'was': 9, 'on': 10, 'is': 11, 'he': 12, 'with': 13, 'at': 14, 'his': 15, 'has': 16, 'from': 17, 'by': 18, 'says': 19, 'her': 20}


In [None]:
max_article_len = 50
max_highlight_len = 50

article_padded_train = pad_sequences(article_sequences_train, maxlen=max_article_len, padding='post')
highlight_padded_train = pad_sequences(highlight_sequences_train, maxlen=max_highlight_len, padding='post')
article_padded_val = pad_sequences(article_tokenizer.texts_to_sequences(articles_val), maxlen=max_article_len, padding='post')
highlight_padded_val = pad_sequences(highlight_sequences_val, maxlen=max_highlight_len, padding='post')


In [58]:
embedding_dim = 512
rnn_units = 256

model = Sequential([
    Embedding(input_dim=10000, output_dim=embedding_dim, input_length=max_article_len),
    Bidirectional(SimpleRNN(rnn_units, return_sequences=True)),
    TimeDistributed(Dense(10000, activation='softmax'))
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 512)           5120000   
                                                                 
 bidirectional_4 (Bidirectio  (None, 50, 512)          393728    
 nal)                                                            
                                                                 
 time_distributed_4 (TimeDis  (None, 50, 10000)        5130000   
 tributed)                                                       
                                                                 
Total params: 10,643,728
Trainable params: 10,643,728
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train model
history = model.fit(
    article_padded_train, highlight_padded_train,
    epochs=2,
    batch_size=32,
    validation_data=(article_padded_val, highlight_padded_val)
)

Epoch 1/2
Epoch 2/2


In [59]:
def decode_sequence(sequence):
    return ' '.join([highlight_tokenizer.index_word.get(i, '') for i in sequence if i != 0])

def generate_summary(article):
    article_sequence = article_tokenizer.texts_to_sequences([article])
    article_padded = pad_sequences(article_sequence, maxlen=max_article_len, padding='post')
    predicted = model.predict(article_padded)
    predicted_sequence = np.argmax(predicted, axis=-1)
    return decode_sequence(predicted_sequence[0])


In [88]:
bleu_scores = []
for i in range(100):
    predicted_summary = generate_summary(articles_val[i])
    reference_summary = decode_sequence(highlight_padded_val[i])
    score = sentence_bleu([reference_summary.split()], predicted_summary.split())
    bleu_scores.append(score)

average_bleu_score = np.mean(bleu_scores)
print(f'Average BLEU Score: {average_bleu_score}')


Average BLEU Score: 1.2591395292141615e-232


In [89]:
print(articles_val[0])

Sally Forrest, an actress-dancer who graced the silver screen throughout the '40s and '50s in MGM musicals and films such as the 1956 noir While the City Sleeps died on March 15 at her home in Beverly Hills, California. Forrest, whose birth name was Katherine Feeney, was 86 and had long battled cancer. Her publicist, Judith Goffin, announced the news Thursday. Scroll down for video . Actress: Sally Forrest was in the 1951 Ida Lupino-directed film 'Hard, Fast and Beautiful' (left) and the 1956 Fritz Lang movie 'While the City Sleeps' A San Diego native, Forrest became a protege of Hollywood trailblazer Ida Lupino, who cast her in starring roles in films including the critical and commercial success Not Wanted, Never Fear and Hard, Fast and Beautiful. Some of Forrest's other film credits included Bannerline, Son of Sinbad, and Excuse My Dust, according to her iMDB page. The page also indicates Forrest was in multiple Climax! and Rawhide television episodes. Forrest appeared as herself in

In [90]:
# User input for summarization
user_input = articles_val[0]

# Generate summary for user input
predicted_summary = generate_summary(user_input)
print("Predicted Summary:", predicted_summary)

Predicted Summary: object luton chain gaal movement pronounced 1971 frost bullied something everton resolved german £16million owes 17th suburb banking saving trends parker very canberra explosions transgender weather create teachers poses image koreans violation wall bieber imported suggestions matched knowing pieces bowling flip dawson carriers divide slavery impossible told brain argues flooded
