In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset and saved model used by the
# cells below are read from paths underneath this mount point.
drive.mount(
    '/content/drive'
)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.layers import Flatten, Input, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Read the training CSV from the mounted Drive. The 'Article' and 'Summary'
# columns of this frame are what the cells below consume.
#test_df = pd.read_csv('/content/drive/My Drive/HindiNews_test.csv')
train_df = pd.read_csv(
    '/content/drive/My Drive/hindi_train.csv'
)

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential

# Keep only the first 100 rows so the single-epoch, batch_size=1 run stays short.
# NOTE: this rebinds `train_df`, so every later cell also sees just these rows.
train_df = train_df.head(100)
#test_df = train_df.head(10)

X_train = train_df['Article']
y_train = train_df['Summary']

# Word-level vocabulary built from the articles; index 0 is reserved for padding.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, padding='post')

# Encode the reference summaries with the same tokenizer so that predictions can
# be decoded with `tokenizer.sequences_to_texts`. Summary words that never occur
# in an article are dropped by `texts_to_sequences`. The targets are padded (or
# truncated) to the article length because the network emits exactly one token
# distribution per input timestep.
y_train_seq = tokenizer.texts_to_sequences(y_train)
y_train_pad = pad_sequences(
    y_train_seq,
    maxlen=X_train_pad.shape[1],
    padding='post',
    truncating='post',
)

vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=X_train_pad.shape[1]))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dense(vocab_size, activation='softmax'))

# Sparse loss: targets are integer token ids of shape (samples, timesteps).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train article -> summary. Previously the article was passed as its own target,
# which taught the network to copy its input and left `y_train` unused.
model.fit(X_train_pad, y_train_pad, epochs=1, batch_size=1)
print("Training complete.")

# Save the model (written to the current working directory).
model.save('bi_lstm_seq2seq_model.h5')

In [None]:
# Cap the working frame at 10000 rows, then rebuild the article tokenizer and
# the padded article matrix that the prediction cell relies on.
train_df = train_df.head(10000)

X_train, y_train = train_df['Article'], train_df['Summary']

# Fresh word index fitted on the article texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Integer-encode the articles and right-pad them to the longest article.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(
    X_train_seq,
    padding='post',
)

In [6]:
from tensorflow.keras.models import load_model

# Restore the previously saved Bi-LSTM model from Google Drive and bind it to
# `model`, replacing whatever model object was defined earlier in the session.
model = load_model(
    '/content/drive/MyDrive/bi_lstm_seq2seq_model.h5'
)

In [None]:

# Use the first row of the training frame as a one-example check.
test_df = train_df.head(1)
X_test, y_test = test_df['Article'], test_df['Summary']

# Encode with the fitted tokenizer and pad to the width of the training matrix.
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(
    X_test_seq,
    maxlen=X_train_pad.shape[1],
    padding='post',
)

# The model returns per-position scores; take the highest-scoring token id
# along axis 2 for every position.
y_pred_prob = model.predict(X_test_pad)
y_pred = y_pred_prob.argmax(axis=2)
print("Prediction on test data complete.")

# Map the predicted ids back to words using the tokenizer's index.
summary = tokenizer.sequences_to_texts(y_pred)
print(summary)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Layer
from tensorflow.keras.models import Sequential
import pandas as pd

class CustomAttention(Layer):
    """Attention-pool a heading encoding and an article encoding.

    The layer must be called on a list ``[heading_encodings, article_encodings]``.
    Each input is scored against its own learned ``(features, 1)`` weight, the
    scores are softmax-normalised along axis 1, and the input is reduced to a
    single weighted-sum row. The two rows are concatenated along axis 1.

    Both inputs must be rank-3 tensors (the transpose uses ``perm=[0, 2, 1]``)
    and must share the same last dimension, otherwise the final concat fails.
    With inputs shaped ``(batch, steps, features)`` the result is
    ``(batch, 2, features)``.

    Requires ``tensorflow`` to be imported as ``tf`` in the notebook.
    """

    def __init__(self, **kwargs):
        super(CustomAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create one scoring vector per input.

        Args:
            input_shape: a pair of shapes, one for each tensor passed to ``call``.

        Raises:
            ValueError: if the layer is built on anything other than two shapes,
                e.g. when it is stacked inside a ``Sequential`` model and
                therefore receives a single tensor.
        """
        try:
            heading_shape, article_shape = input_shape
            heading_dim, article_dim = heading_shape[-1], article_shape[-1]
        except (TypeError, ValueError, IndexError):
            raise ValueError(
                "CustomAttention expects a list of two inputs "
                "[heading_encodings, article_encodings]; got input shape "
                f"{input_shape!r}"
            ) from None

        # Explicit, distinct names keep the two weights from colliding when the
        # model is written to an HDF5 file.
        self.W_heading = self.add_weight(
            name="W_heading", shape=(heading_dim, 1), initializer="normal", trainable=True
        )
        self.W_article = self.add_weight(
            name="W_article", shape=(article_dim, 1), initializer="normal", trainable=True
        )
        super(CustomAttention, self).build(input_shape)

    def call(self, inputs):
        """Return the two attention-pooled rows stacked along axis 1."""
        heading_encodings, article_encodings = inputs
        # One scalar score per position, normalised over axis 1.
        heading_attention_weights = tf.nn.softmax(tf.matmul(heading_encodings, self.W_heading), axis=1)
        article_attention_weights = tf.nn.softmax(tf.matmul(article_encodings, self.W_article), axis=1)
        # Swap the last two axes of the weights so the matmul yields the
        # weighted sum of the encodings as a single row per example.
        heading_weighted = tf.matmul(tf.transpose(heading_attention_weights, perm=[0, 2, 1]), heading_encodings)
        article_weighted = tf.matmul(tf.transpose(article_attention_weights, perm=[0, 2, 1]), article_encodings)
        combined_encodings = tf.concat([heading_weighted, article_weighted], axis=1)
        return combined_encodings

# NOTE(review): this reads the CSV from the working directory, while the cell
# near the top of the notebook reads it from Google Drive. Confirm which copy
# is intended.
train_df = pd.read_csv("hindi_train.csv")
train_df = train_df.head(100)

X_train_heading = train_df['Heading']
X_train_article = train_df['Article']
y_train = train_df['Summary']

# Separate word indexes for the heading and the article; index 0 is padding.
tokenizer_heading = Tokenizer()
tokenizer_heading.fit_on_texts(X_train_heading)
X_train_heading_seq = tokenizer_heading.texts_to_sequences(X_train_heading)
X_train_heading_pad = pad_sequences(X_train_heading_seq, padding='post')

tokenizer_article = Tokenizer()
tokenizer_article.fit_on_texts(X_train_article)
X_train_article_seq = tokenizer_article.texts_to_sequences(X_train_article)
X_train_article_pad = pad_sequences(X_train_article_seq, padding='post')

# The summaries are the training targets, encoded with their own word index.
tokenizer_summary = Tokenizer()
tokenizer_summary.fit_on_texts(y_train)
y_train_seq = tokenizer_summary.texts_to_sequences(y_train)
y_train_pad = pad_sequences(y_train_seq, padding='post')

# Each embedding and the output layer are sized from the tokenizer whose ids
# they actually handle (previously everything used the heading vocabulary).
vocab_size = len(tokenizer_heading.word_index) + 1
article_vocab_size = len(tokenizer_article.word_index) + 1
summary_vocab_size = len(tokenizer_summary.word_index) + 1

# CustomAttention takes two tensors, which a Sequential stack cannot provide,
# so the model is built with the functional API: one Bi-LSTM encoder per input.
heading_input = Input(shape=(X_train_heading_pad.shape[1],), name='heading')
article_input = Input(shape=(X_train_article_pad.shape[1],), name='article')

heading_embedded = Embedding(input_dim=vocab_size, output_dim=100)(heading_input)
article_embedded = Embedding(input_dim=article_vocab_size, output_dim=100)(article_input)

heading_encodings = Bidirectional(LSTM(128, return_sequences=True))(heading_embedded)
article_encodings = Bidirectional(LSTM(128, return_sequences=True))(article_embedded)

# (batch, 2, 256): one attention-pooled row for the heading, one for the article.
context = CustomAttention()([heading_encodings, article_encodings])

# Flatten the pooled context and repeat it once per summary position so the
# decoder LSTM can emit one token distribution per target timestep.
context_flat = Flatten()(context)
decoder_inputs = RepeatVector(y_train_pad.shape[1])(context_flat)
decoder_outputs = LSTM(128, return_sequences=True)(decoder_inputs)
token_probs = Dense(summary_vocab_size, activation='softmax')(decoder_outputs)

model = Model(inputs=[heading_input, article_input], outputs=token_probs)

# Sparse loss: targets are integer token ids of shape (samples, timesteps).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

model.fit([X_train_heading_pad, X_train_article_pad], y_train_pad, epochs=1, batch_size=1)
print("Training complete.")

# Reloading this file requires custom_objects={'CustomAttention': CustomAttention}.
model.save('bi_lstm_attention_seq2seq_model.h5')

In [None]:
!pip install rouge bert-score seqeval

In [None]:
from rouge import Rouge

# `summary` holds the decoded predictions; the references come from the same
# rows that were fed to the model.
generated_summary = summary
target_summary = test_df['Summary'].tolist()

# Accumulators for per-example scores (`bert_scores` is filled by the
# BERTScore cell).
rouge1_scores = []
rouge2_scores = []
rouge4_scores = []
bert_scores = []

rouge = Rouge()
if generated_summary[0].strip():
    # Only the first hypothesis/reference pair is reported.
    rouge_scores = rouge.get_scores(generated_summary, target_summary)[0]
else:
    # The `rouge` package raises on an empty hypothesis, which happens when the
    # model predicts only padding. Report zeros instead of aborting the cell.
    rouge_scores = {
        metric: {'f': 0.0, 'p': 0.0, 'r': 0.0}
        for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    }

# Not a true ROUGE-4 (4-gram overlap): this is the geometric mean of the
# ROUGE-1 and ROUGE-2 F1 scores, and it is labelled as such when printed.
rouge4 = (rouge_scores['rouge-1']['f'] * rouge_scores['rouge-2']['f']) ** (1/2)

rouge1_scores.append(rouge_scores['rouge-1']['f'])
rouge2_scores.append(rouge_scores['rouge-2']['f'])
rouge4_scores.append(rouge4)

# Labels now name the metric that is actually printed; the earlier text called
# the ROUGE-1 and ROUGE-2 values "ROUGE-L".
print("ROUGE-1 F1 Score:", rouge_scores['rouge-1']['f'])
print("ROUGE-2 F1 Score:", rouge_scores['rouge-2']['f'])
print("ROUGE-4 proxy (geometric mean of ROUGE-1 and ROUGE-2) F1 Score:", rouge4)

In [None]:
from bert_score import score as bert_score

# Compare only the first generated summary with the first reference,
# using the Hindi ('hi') language setting.
candidates = [generated_summary[0]]
references = [target_summary[0]]
bert_p, bert_r, bert_f1 = bert_score(candidates, references, lang='hi', verbose=False)

# Collapse the F1 tensor to a plain Python float, record it, and report it.
bert_f1_mean = bert_f1.mean().item()
bert_scores.append(bert_f1_mean)

print("BERTScore F1 Score:", bert_f1_mean)