In [1]:
input_file = '/content/prepared_data.csv'
output_file = '/content/cleaned_data.csv'

# Zero-based index of the physical line to drop (line 190150 when counting
# from 1). The first line of the file counts as index 0.
row_to_remove = 190149

# Set to True only if the target line is actually encountered, so the final
# message does not claim a removal that never happened (e.g. a shorter file).
row_was_removed = False

# newline='' turns off newline translation on both handles, so every line that
# is kept is written back with exactly the terminator it had in the input.
with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
     open(output_file, 'w', encoding='utf-8', newline='') as outfile:

    for i, line in enumerate(infile):
        if i == row_to_remove:
            row_was_removed = True
            continue
        outfile.write(line)

if row_was_removed:
    print(f"Removed row {row_to_remove}, saved to {output_file}")
else:
    print(f"Row {row_to_remove} not found; copied {input_file} unchanged to {output_file}")

Removed row 190149, saved to /content/cleaned_data.csv


In [2]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Try using the CSV module directly for more control
import csv

# Parse the cleaned CSV record by record with the csv module so that one
# problematic record can be dropped by its position.
data = []
# newline='' is required for files handed to csv.reader; without it, newlines
# embedded inside quoted fields are not interpreted correctly.
with open('/content/cleaned_data.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        # i counts parsed records with the header at index 0, so index 2938 is
        # the 2938th data record. It is skipped rather than repaired.
        if i == 2938:
            continue
        data.append(row)

# The first parsed record supplies the column names; the rest are data rows.
df = pd.DataFrame(data[1:], columns=data[0])

# Downstream cells expect `df` as a list of per-row dicts, so it is rebound here.
df = df.to_dict('records')

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GRU, Dense, Input, Attention, Concatenate
from tensorflow.keras.models import Model
import numpy as np
import pandas as pd


In [4]:
# `df` is expected to come from the loading cell, where it is left as a list
# of per-row dicts; rebuild a DataFrame from it before selecting columns.
df = pd.DataFrame(df)

# Force the article text to str so the tokenizer only ever sees strings.
content_as_str = df['content'].astype(str)
texts = content_as_str.tolist()
# Placeholder: the targets are currently the same 'content' column as the
# inputs. Replace with the real summary column when it is available.
summaries = content_as_str.tolist()


In [5]:
# Tokenization and padding function
def preprocess(texts, max_words=10000, max_len=200):
    """Fit a tokenizer on `texts` and encode them as fixed-length index arrays.

    Args:
        texts: Iterable of strings; used both to fit the tokenizer and as the
            data that gets encoded.
        max_words: Passed to the Tokenizer as `num_words`.
        max_len: Length every encoded sequence is padded or truncated to.

    Returns:
        A `(padded, tokenizer)` pair: the array produced by `pad_sequences`
        (padding and truncation both applied at the end of each sequence) and
        the fitted Tokenizer.
    """
    tok = Tokenizer(
        num_words=max_words,
        oov_token="<OOV>",
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    )
    tok.fit_on_texts(texts)
    encoded = tok.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=max_len, padding='post', truncating='post'), tok


In [6]:
# Prepare data: fixed sequence lengths for the encoder (article text) and the
# decoder (summary) sides.
max_len_text, max_len_summary = 200, 50

# Each side gets its own tokenizer, fitted only on its own strings.
X, text_tokenizer = preprocess(texts, max_len=max_len_text)
y, summary_tokenizer = preprocess(summaries, max_len=max_len_summary)


In [7]:
# Train/validation split: the first 80% of rows (in their existing order) are
# used for training and the remainder for validation. No shuffling is applied.
split = int(0.8 * len(X))
train_rows, val_rows = slice(None, split), slice(split, None)

X_train, y_train = X[train_rows], y[train_rows]
X_val, y_val = X[val_rows], y[val_rows]


In [8]:
# Model architecture: GRU encoder/decoder with an attention layer over the
# encoder outputs, trained to predict the next summary token at every step.
# vocab_size equals the default max_words (10000) that preprocess() passes to
# both tokenizers, so every produced token index fits the embedding tables.
vocab_size = 10000
embedding_dim = 128
gru_units = 256

# Encoder
# Takes the padded article token ids (length max_len_text) and returns the
# per-step outputs (for attention) plus the final state (to seed the decoder).
encoder_inputs = Input(shape=(max_len_text,))
enc_emb = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_gru = GRU(gru_units, return_sequences=True, return_state=True)
encoder_outputs, encoder_state = encoder_gru(enc_emb)

# Decoder
# The decoder input is one step shorter than the padded summary because
# training feeds y[:, :-1] as input and y[:, 1:] as the target.
# The decoder has its own embedding table, separate from the encoder's.
decoder_inputs = Input(shape=(max_len_summary - 1,))
dec_emb = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_gru = GRU(gru_units, return_sequences=True, return_state=True)
# The decoder's final state is not needed here, hence the discarded `_`.
decoder_outputs, _ = decoder_gru(dec_emb, initial_state=encoder_state)

# Attention
# Decoder outputs are the query and encoder outputs the value; the resulting
# context is concatenated onto each decoder step before the output layer.
attention = Attention()([decoder_outputs, encoder_outputs])
decoder_concat = Concatenate()([decoder_outputs, attention])

# Output Layer
# Softmax over vocab_size entries for every decoder time step.
decoder_dense = Dense(vocab_size, activation='softmax')
output = decoder_dense(decoder_concat)

# Create the model
# Inputs are [encoder token ids, decoder token ids]; the targets passed to
# fit() are integer token ids, matching the sparse categorical loss.
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


In [9]:
# Train the model with teacher forcing: the decoder is fed each summary without
# its last position and is trained to output the same summary shifted left by
# one position.
decoder_in_train, decoder_target_train = y_train[:, :-1], y_train[:, 1:]
decoder_in_val, decoder_target_val = y_val[:, :-1], y_val[:, 1:]

model.fit(
    [X_train, decoder_in_train],
    decoder_target_train,
    batch_size=32,
    epochs=3,
    validation_data=([X_val, decoder_in_val], decoder_target_val)
)


Epoch 1/3
[1m3909/3909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12704s[0m 3s/step - loss: 4.1318 - val_loss: 0.0679
Epoch 2/3
[1m3909/3909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12809s[0m 3s/step - loss: 0.0304 - val_loss: 0.0130
Epoch 3/3
[1m3909/3909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12841s[0m 3s/step - loss: 0.0083 - val_loss: 0.0099


<keras.src.callbacks.history.History at 0x7d494633f590>

In [12]:
def summarize(text):
    """Greedily decode a summary for one input string with the trained model.

    Relies on notebook globals that must exist when the function is called:
    `text_tokenizer`, `summary_tokenizer`, `max_len_text`, `decoder_seq_len`
    and `model`.

    Args:
        text: The article text to summarize.

    Returns:
        The decoded words joined by single spaces, or "" when the text yields
        no tokens.
    """
    # 1) Text → sequence → padded
    seq = text_tokenizer.texts_to_sequences([text])
    if not seq or not seq[0]:
        return ""
    # truncating='post' matches preprocess(), which is how the training inputs
    # were built. The pad_sequences default ('pre') would keep the END of a
    # long article, while the model was trained on the BEGINNING.
    padded = pad_sequences(seq, maxlen=max_len_text, padding='post', truncating='post')

    # 2) Initialize target_seq with the exact decoder_seq_len
    # NOTE(review): preprocess() adds no '<start>'/'<end>' markers and its
    # filters strip '<' and '>', so these lookups look like they never match.
    # Decoding would then start from index 0 and stop on a predicted index 0.
    # Confirm this is intended.
    target_seq = np.zeros((1, decoder_seq_len))
    if '<start>' in summary_tokenizer.word_index:
        target_seq[0, 0] = summary_tokenizer.word_index['<start>']

    # The stop index does not change between steps, so look it up once.
    end_idx = summary_tokenizer.word_index.get('<end>', 0)

    # 3) Iteratively predict next token
    # The whole (partially filled) decoder input is fed each step; only the
    # prediction at position i is used to fill position i + 1.
    for i in range(decoder_seq_len - 1):
        preds = model.predict([padded, target_seq], verbose=0)
        sampled_idx = np.argmax(preds[0, i, :])
        target_seq[0, i + 1] = sampled_idx
        if sampled_idx == end_idx:
            break

    # 4) Convert token indices → words
    summary = []
    for idx in target_seq[0]:
        idx = int(idx)
        # Index 0 is treated as "no token" and skipped.
        if idx > 0:
            w = summary_tokenizer.index_word.get(idx, '')
            if w == '<end>':
                break
            summary.append(w)
    return ' '.join(summary)


In [14]:
# The model has two inputs; the second one is the decoder input. Its second
# dimension is the decoder sequence length (49 for this model), which
# summarize() reads as a global.
decoder_input_shape = model.input_shape[1]
decoder_seq_len = decoder_input_shape[1]
print(f"Decoder input length (max summary tokens): {decoder_seq_len}")


Decoder input length (max summary tokens): 49


In [1]:
# Summarize the first 10 articles and save them next to their source text.
# This cell needs `df` and `summarize` (and everything summarize() depends on)
# in the running kernel; in a fresh kernel the earlier cells must be run first.
first_ten = df.head(10)
df_head_10 = first_ten.assign(
    gru_summary=first_ten['content'].astype(str).apply(summarize)
)
df_head_10.to_csv('gru_summarized_head_10_fixed.csv', index=False)
print("Fixed summarization on first 10 articles complete!")


NameError: name 'df' is not defined