<a href="https://colab.research.google.com/github/Midhilesh4890/Large-Language-models-practice/blob/main/NER_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Input
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [8]:
def load_and_prepare_data(file_path):
    """Read a token-per-row NER CSV and group it into tagged sentences.

    The file must provide the columns ``Sentence #``, ``Word`` and ``Tag``.
    ``Sentence #`` is forward-filled, so a row with an empty sentence id is
    assigned to the closest non-empty id above it.

    Args:
        file_path: Location of the CSV file (anything ``pandas.read_csv``
            accepts); it is decoded as latin1.

    Returns:
        list: One entry per distinct sentence id, in the sorted-key order
        produced by ``groupby``. Each entry is a list of ``(word, tag)``
        tuples in row order. An empty table yields an empty list.
    """
    ner_data = pd.read_csv(file_path, encoding='latin1')
    # Assign the filled column back instead of calling
    # fillna(method='ffill', inplace=True) on the column selection: that form
    # is deprecated and, with pandas Copy-on-Write, does not update ner_data.
    ner_data['Sentence #'] = ner_data['Sentence #'].ffill()
    return [
        list(zip(group['Word'].tolist(), group['Tag'].tolist()))
        for _, group in ner_data.groupby('Sentence #')
    ]

def create_mappings(sentences):
    """Build word -> index and tag -> index lookup tables.

    Indices are assigned in sorted order so the mappings are identical on
    every run. Numbering straight from set iteration depends on string hash
    randomization and would change between interpreter sessions, which makes
    saved model weights unusable with freshly rebuilt mappings.

    Args:
        sentences: Iterable of sentences, each a sequence of ``(word, tag)``
            pairs.

    Returns:
        tuple: ``(word2idx, tag2idx)``. Word indices start at 1, leaving 0
        free for padding / unknown words; tag indices start at 0.
    """
    vocabulary = {word for sentence in sentences for word, _ in sentence}
    tag_set = {tag for sentence in sentences for _, tag in sentence}
    # key=str keeps the sort well-defined even if a non-string token
    # (e.g. a missing value read from the CSV) slipped into the data.
    word2idx = {w: i for i, w in enumerate(sorted(vocabulary, key=str), start=1)}
    tag2idx = {t: i for i, t in enumerate(sorted(tag_set, key=str))}
    return word2idx, tag2idx

def process_data(sentences, word2idx, tag2idx, max_len):
    """Encode tagged sentences as fixed-length model inputs and targets.

    Args:
        sentences: Sequence of sentences, each a sequence of ``(word, tag)``
            pairs.
        word2idx: Mapping from word to integer index. Words missing from the
            mapping are encoded as 0, the same value used for padding.
        tag2idx: Mapping from tag to class index.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the word indices, one row of
        ``max_len`` entries per sentence, and ``y`` holds the matching one-hot
        tag vectors with ``len(tag2idx)`` classes per position.
    """
    # Class used both for padding positions and for tags missing from
    # tag2idx. Unknown tags previously fell back to class 0, which is an
    # arbitrary tag, while padding fell back to "O"; use one fallback for both.
    outside_idx = tag2idx.get("O", 0)

    # truncating="post" matches padding="post": position k of a row always
    # belongs to the k-th word of the sentence. The pad_sequences default
    # ("pre") drops the *leading* tokens of sentences longer than max_len,
    # which shifts rows relative to the original sentence.
    X = [[word2idx.get(w[0], 0) for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", truncating="post", value=0)

    y = [[tag2idx.get(w[1], outside_idx) for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", truncating="post", value=outside_idx)
    y = np.array([to_categorical(i, num_classes=len(tag2idx)) for i in y])

    return X, y

def build_model(word2idx, tag2idx, lstm_units, dense_units, max_len=50,
                embedding_dim=50, dropout_rate=0.1):
    """Build and compile an Embedding -> LSTM -> Dense token tagger.

    Args:
        word2idx: Word-to-index mapping; its size sets the vocabulary size.
        tag2idx: Tag-to-index mapping; its size sets the number of output
            classes.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden ReLU layer.
        max_len: Length of the input sequences. Defaults to 50, the value
            that used to be hard-coded.
        embedding_dim: Size of the word embedding vectors (default 50).
        dropout_rate: Dropout rate applied to the LSTM output (default 0.1).

    Returns:
        A compiled Keras ``Model`` that outputs one softmax distribution over
        the tags for every input position.
    """
    input_layer = Input(shape=(max_len,))
    # +1 because the indices in word2idx start at 1; row 0 serves the
    # padding / unknown-word index. The sequence length comes from the Input
    # layer, so the deprecated `input_length` argument is not passed.
    embedding_layer = Embedding(input_dim=len(word2idx) + 1, output_dim=embedding_dim)(input_layer)
    # return_sequences=True keeps one output vector per time step, which the
    # per-token classifier below requires.
    lstm_layer = LSTM(units=lstm_units, return_sequences=True)(embedding_layer)
    dropout_layer = Dropout(dropout_rate)(lstm_layer)
    dense_layer = Dense(dense_units, activation='relu')(dropout_layer)
    output_layer = Dense(len(tag2idx), activation='softmax')(dense_layer)
    model = Model(input_layer, output_layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [9]:
# Main script
file_path = 'ner_dataset.csv'
# Padded sequence length; must equal the input length build_model uses (50).
MAX_LEN = 50

sentences = load_and_prepare_data(file_path)
# 20% of the sentences are held out for testing, then 25% of the remainder
# (20% of the total) for validation, leaving 60% for training.
train_sentences, test_sentences = train_test_split(sentences, test_size=0.2, random_state=42)
train_sentences, val_sentences = train_test_split(train_sentences, test_size=0.25, random_state=42)

# The vocabulary and tag set are built from the training sentences only.
word2idx, tag2idx = create_mappings(train_sentences)
X_train, y_train = process_data(train_sentences, word2idx, tag2idx, max_len=MAX_LEN)
X_val, y_val = process_data(val_sentences, word2idx, tag2idx, max_len=MAX_LEN)
X_test, y_test = process_data(test_sentences, word2idx, tag2idx, max_len=MAX_LEN)

# Define the model configurations as (lstm_units, dense_units) tuples
model_configs = [(64, 32), (128, 64), (256, 128)]

# Initialize, train, and save each model in a single loop
predictions = []
for i, (lstm_units, dense_units) in enumerate(model_configs, start=1):
    model = build_model(word2idx, tag2idx, lstm_units, dense_units)
    # Validate on the dedicated validation split instead of carving another
    # 10% out of the training data while leaving val_sentences unused.
    model.fit(X_train, y_train, batch_size=32, epochs=1, validation_data=(X_val, y_val))
    predictions.append(model.predict(X_test, verbose=1))
    # `i` already starts at 1, so the files are ner_model_1.h5 .. ner_model_3.h5
    # (the former `i+1` produced 2..4).
    # NOTE(review): Keras 3 only accepts weight files ending in `.weights.h5`;
    # rename if the notebook is run on a newer Keras.
    model.save_weights(f'ner_model_{i}.h5')



In [10]:
# Combine the models by averaging their prediction arrays element-wise:
# stacking puts the models on axis 0, which the mean then collapses.
stacked_predictions = np.asarray(predictions)
ensemble_predictions = stacked_predictions.mean(axis=0)

In [11]:
# Reduce the class axis: for every token position keep the index of the
# highest-scoring class, both for the ensemble output and the one-hot targets.
pred_labels = ensemble_predictions.argmax(axis=-1)
true_labels = y_test.argmax(axis=-1)

# Invert tag2idx so class indices can be translated back into tag strings.
idx2tag = {index: tag for tag, index in tag2idx.items()}
pred_tags = [list(map(idx2tag.__getitem__, row)) for row in pred_labels]
true_tags = [list(map(idx2tag.__getitem__, row)) for row in true_labels]

In [12]:
# Flatten the test set into one record per token: the word together with its
# reference tag and the tag predicted by the ensemble.
# zip() stops at the shortest input, so row positions beyond the end of a
# sentence (padding) never produce a record.
# NOTE(review): the pairing assumes position k of a tag row belongs to the
# k-th word of the sentence - confirm how sentences longer than the padded
# length were truncated upstream.
results = [
    {"Word": token[0], "True_Tag": gold_tag, "Pred_Tag": predicted_tag}
    for tokens, gold_row, predicted_row in zip(test_sentences, true_tags, pred_tags)
    for token, gold_tag, predicted_tag in zip(tokens, gold_row, predicted_row)
]

# Tabulate the records and write them to disk without the index column.
results_df = pd.DataFrame(results)
results_df.to_csv("ner_results.csv", index=False)

# Report completion
print("Results saved to ner_results.csv")

Results saved to ner_results.csv


In [13]:
# Reload the exported results from disk and show how often each tag was
# predicted (the last expression is displayed as the cell output).
result_df = pd.read_csv('ner_results.csv')
result_df.loc[:, 'Pred_Tag'].value_counts()

O        182629
B-geo     10188
I-per      3262
B-tim      3192
I-org      2787
B-per      2617
B-gpe      2458
B-org      1472
I-geo       958
I-tim       139
Name: Pred_Tag, dtype: int64