<a href="https://colab.research.google.com/github/Midhilesh4890/Large-Language-models-practice/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
!pip install keras pandas scikit-learn numpy tensorflow==2.13.0



In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

In [52]:
# Load and prepare the dataset
file_path = 'ner_dataset.csv'
ner_data = pd.read_csv(file_path, encoding='latin1')
ner_data['Sentence #'].fillna(method='ffill', inplace=True)

# Extracting words and tags into separate lists for each sentence
grouped_data = ner_data.groupby('Sentence #').apply(lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(), s['Tag'].values.tolist())])
sentences = [sentence for sentence in grouped_data]

# Splitting the data
train_sentences, test_sentences = train_test_split(sentences, test_size=0.2, random_state=42)
train_sentences, val_sentences = train_test_split(train_sentences, test_size=0.25, random_state=42)

# For Baseline Model
# Flatten the list of sentences for vectorization
words = [word for sentence in train_sentences for word, tag in sentence]
tags = [tag for sentence in train_sentences for word, tag in sentence]

In [78]:
len(val_sentences)

9592

In [53]:
# Vectorize words
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(words)

# Encode tags
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(tags)

# Baseline model: Logistic Regression
#model = LogisticRegression(max_iter=1000, verbose = 1)
model = LogisticRegression(verbose = 1)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
# Prepare validation data
val_words = [word for sentence in val_sentences for word, tag in sentence]
val_tags = [tag for sentence in val_sentences for word, tag in sentence]
X_val = vectorizer.transform(val_words)
y_val = label_encoder.transform(val_tags)

# Predict and evaluate the baseline model
predictions = model.predict(X_val)
# print(classification_report(y_val, predictions, target_names=label_encoder.classes_))

In [55]:
# Define maximum length of a sentence
max_len = 50

# Create a dictionary to map words to indices
word2idx = {w: i + 1 for i, w in enumerate(set(words))}

# Create a dictionary to map tags to indices
# Ensure that the indices are continuous and start from 0
unique_tags = list(set(tags))
tag2idx = {t: i for i, t in enumerate(unique_tags)}

# Initialize X and y datasets
X_train = [[word2idx.get(w[0], 0) for w in s] for s in train_sentences]  # Use get to handle unknown words
X_train = pad_sequences(maxlen=max_len, sequences=X_train, padding="post", value=0)  # Pad sequences

# Prepare y data with correct tag indices
y_train = [[tag2idx.get(w[1], 0) for w in s] for s in train_sentences]  # Use get to handle unknown tags
y_train = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx.get("O", 0))  # Use 'O' tag for padding

# Convert labels to a one-hot encoding format
y_train = np.array([to_categorical(i, num_classes=len(tag2idx)) for i in y_train])

In [56]:
import tensorflow as tf

input = tf.keras.layers.Input(shape=(max_len,))
model = tf.keras.layers.Embedding(input_dim=len(word2idx) + 1, output_dim=50, input_length=max_len)(input)
model = tf.keras.layers.Dropout(0.1)(model)
model = tf.keras.layers.Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
model = tf.keras.layers.TimeDistributed(Dense(17, activation="relu"))(model)
# # CRF layer
# Commenting for now as it is giving compatiblity issues
# crf = tfa.layers.CRF(units=len(tag2idx))
# out = crf(model)  # CRF layer in TensorFlow Addons returns two values

model = Model(input, model)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# model = tf.keras.Model(input, out)
# # Compile the model with CRF loss
# model.compile(optimizer="adam", loss = crf.calculate_loss)

# Train the model (example code, real training should be done with more epochs and on GPU)
model.fit(X_train, np.array(y_train), batch_size=32, epochs=5, validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ef2c27538e0>

In [57]:
# Saving the model
model.save_weights('ner_model.h5')

# Preprocess the test set similarly to the training set
X_test = [[word2idx.get(w[0], 0) for w in s] for s in test_sentences]
X_test = pad_sequences(maxlen=max_len, sequences=X_test,
                       padding="post", value=0)

y_test = [[tag2idx.get(w[1], 0) for w in s] for s in test_sentences]
y_test = pad_sequences(maxlen=max_len, sequences=y_test,
                       padding="post", value=tag2idx["O"])
y_test = [to_categorical(i, num_classes=len(tag2idx)) for i in y_test]

# Load the model
model.load_weights('ner_model.h5')

# Predict on test set
predictions = model.predict(X_test, verbose=1)

# Convert the predictions and true values to label sequences
pred_labels = np.argmax(predictions, axis=-1)
true_labels = np.argmax(y_test, axis=-1)

# Convert indices to tags
idx2tag = {i: w for w, i in tag2idx.items()}
pred_tags = [[idx2tag[i] for i in row] for row in pred_labels]
true_tags = [[idx2tag[i] for i in row] for row in true_labels]



In [66]:
# import random
# # Select 3 random indices from the range of the test set
# random_indices = random.sample(range(len(test_sentences)), 1)

# # Output the word, true tag, and predicted tag for three randomly selected sentences
# for i in random_indices:
#     sentence, true, pred = test_sentences[i], true_tags[i], pred_tags[i]
#     print(f"\nSentence from index {i}:")
#     for word, true_tag, pred_tag in zip(sentence, true, pred):
#         # Printing the word and its true and predicted tag
#         print(f"{word[0]}: true - {true_tag}, pred - {pred_tag}")


Sentence from index 5027:
The: true - O, pred - B-eve
United: true - B-org, pred - B-eve
States: true - I-org, pred - B-eve
and: true - O, pred - B-eve
the: true - O, pred - B-eve
European: true - B-org, pred - B-eve
Union: true - I-org, pred - B-eve
are: true - O, pred - B-eve
calling: true - O, pred - B-eve
on: true - O, pred - B-eve
Congo: true - B-geo, pred - B-eve
and: true - O, pred - B-eve
Rwanda: true - B-geo, pred - B-eve
to: true - O, pred - B-eve
resolve: true - O, pred - B-eve
the: true - O, pred - B-eve
situation: true - O, pred - B-eve
diplomatically: true - O, pred - B-eve
.: true - O, pred - B-eve


In [67]:
# Initialize a list to store the results
results = []

# Iterate over all sentences in the test set
for i, (sentence, true, pred) in enumerate(zip(test_sentences, true_tags, pred_tags)):
    for word, true_tag, pred_tag in zip(sentence, true, pred):
        # Add each word, its true tag, and its predicted tag to the results list
        results.append({"Word": word[0], "True_Tag": true_tag, "Pred_Tag": pred_tag})

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
results_df.to_csv("ner_results.csv", index=False)

# Print a message to indicate completion
print("Results saved to ner_results.csv")

Results saved to ner_results.csv


In [68]:
result_df = pd.read_csv('ner_results.csv')

In [70]:
result_df.head()

Unnamed: 0,Word,True_Tag,Pred_Tag
0,The,O,B-eve
1,report,O,B-eve
2,calls,O,B-eve
3,on,O,B-eve
4,President,B-per,B-eve


In [73]:
result_df['Pred_Tag'].value_counts()

B-eve    209702
Name: Pred_Tag, dtype: int64

In [60]:
# Flatten the lists
pred_tags_flat = [tag for sublist in pred_tags for tag in sublist]
true_tags_flat = [tag for sublist in true_tags for tag in sublist]
# Evaluation using classification report
print(classification_report(true_tags_flat, pred_tags_flat))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        94
       B-eve       0.00      1.00      0.00        70
       B-geo       0.00      0.00      0.00      7556
       B-gpe       0.00      0.00      0.00      3137
       B-nat       0.00      0.00      0.00        40
       B-org       0.00      0.00      0.00      4148
       B-per       0.00      0.00      0.00      3397
       B-tim       0.00      0.00      0.00      4074
       I-art       0.00      0.00      0.00        84
       I-eve       0.00      0.00      0.00        65
       I-geo       0.00      0.00      0.00      1461
       I-gpe       0.00      0.00      0.00        33
       I-nat       0.00      0.00      0.00        13
       I-org       0.00      0.00      0.00      3391
       I-per       0.00      0.00      0.00      3404
       I-tim       0.00      0.00      0.00      1249
           O       0.00      0.00      0.00    447384

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
