In [2]:
'''
Prompt to chatgpt: I need a best model to classify whether the question is text or arithmetic

'''


import pandas as pd

In [3]:
data = pd.read_csv("../Data/questions_table.csv")
data.columns

Index(['question', 'answer', 'answer_type', 'derivation'], dtype='object')

In [21]:
data["answer_type"].unique()

array(['multi-span', 'span', 'arithmetic', 'count'], dtype=object)

In [4]:
dataset = data[data["answer_type"].isin(["span", "arithmetic"])]

In [23]:
len(dataset)

11265

In [24]:
dataset.head()

Unnamed: 0,question,answer,answer_type,derivation
1,How much is the 2019 rate of inflation?,['2.9'],span,
2,How much is the 2018 rate of inflation?,['2.9'],span,
3,What is the 2019 average rate of inflation?,2.9,arithmetic,(2.9+2.9)/2
4,What is the 2019 average rate of increase in s...,2.7,arithmetic,(2.7+2.7)/2
5,What is the difference between 2019 average ra...,0.2,arithmetic,[(2.9+2.9)/2] - [(2.7+2.7)/2]


In [5]:
dataset_n = dataset[["question", "answer_type"]].copy()

In [26]:
dataset_n.head()

Unnamed: 0,question,answer_type
1,How much is the 2019 rate of inflation?,span
2,How much is the 2018 rate of inflation?,span
3,What is the 2019 average rate of inflation?,arithmetic
4,What is the 2019 average rate of increase in s...,arithmetic
5,What is the difference between 2019 average ra...,arithmetic


In [39]:
from sklearn.model_selection import train_test_split
X = dataset_n["question"]
y = dataset_n["answer_type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True)

In [40]:
X_train[:5].values

array(['What was the average settlements for 2017-2019?',
       'What was the estimated useful life of Towers in years?',
       'What is the average quarterly high sale price for 2019?',
       'What does the table show?',
       'What was the working capital in 2019?'], dtype=object)

In [41]:
y_train[:5]

12630    arithmetic
7307           span
11472    arithmetic
1573           span
4538           span
Name: answer_type, dtype: object

Logistic Regression with TF-IDF

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [43]:
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X_train)
model = LogisticRegression()
model.fit(X_tfidf, y_train)
X_test_tfidf = vectorizer.transform(X_test)

In [44]:
predictions = model.predict(X_test_tfidf)
i = 0
for text, pred, true_label in zip(X_test, predictions, y_test):
    print(f"Input: '{text}' --> Prediction: {pred} --> True Label: {true_label}")
    i += 1
    if i == 5:
        break


Input: 'What was the change in the Total non-current trade and other payables in 2019 from 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage increase / (decrease) in Fuel Oils from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the average hardware revenue from 2016 to 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage change in revenue generated from Partner C from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'As of March 29, 2019, What is Intangible assets expressed as a percentage of  Gross deferred tax assets?' --> Prediction: arithmetic --> True Label: arithmetic


In [33]:
import os

prompt: How to save the vectorizer and models

In [52]:
import pickle
import logging

In [54]:
def save_model(data, file_name):
    os.makedirs("../artifacts/model", exist_ok = True)
    try:
        file_path = f"../artifacts/model/{str(file_name)}.pkl"
        with open(file_path, "wb") as file:
            pickle.dump(data, file)
        print(f"Data saved succesfully at: {file_path}")
    except Exception as e:
        logging.error(f"Failed to save model due to: {str(e)}")
        raise
    

In [36]:
save_model(data = vectorizer, file_name = "vectorizer")
save_model(data = model, file_name = "model")

Data saved succesfully at: ../artifacts/model/vectorizer.pkl
Data saved succesfully at: ../artifacts/model/model.pkl


Prompt: How to measure the model's performance

In [38]:
from sklearn.metrics import accuracy_score
# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.92403265885694


Prompt: Other model approach

Naive Bayes with TF-IDF

In [49]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Split data into texts (X) and labels (y)
X = dataset_n["question"]
y = dataset_n["answer_type"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

predictions = model.predict(X_test_tfidf)
i = 0
for text, pred, true_label in zip(X_test, predictions, y_test):
    print(f"Input: '{text}' --> Prediction: {pred} --> True Label: {true_label}")
    i += 1
    if i == 5:
        break




Input: 'What was the change in the Total non-current trade and other payables in 2019 from 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage increase / (decrease) in Fuel Oils from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the average hardware revenue from 2016 to 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage change in revenue generated from Partner C from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'As of March 29, 2019, What is Intangible assets expressed as a percentage of  Gross deferred tax assets?' --> Prediction: arithmetic --> True Label: arithmetic


In [48]:
# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8044378698224852


SVC with TF-IDF

In [50]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  # Support Vector Classifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split data into texts (X) and labels (y)
X = dataset_n["question"]
y = dataset_n["answer_type"]


# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the SVM model
model = SVC(kernel='linear')  # Linear kernel for text classification
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

predictions = model.predict(X_test_tfidf)
i = 0
for text, pred, true_label in zip(X_test, predictions, y_test):
    print(f"Input: '{text}' --> Prediction: {pred} --> True Label: {true_label}")
    i += 1
    if i == 5:
        break


Input: 'What was the change in the Total non-current trade and other payables in 2019 from 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage increase / (decrease) in Fuel Oils from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the average hardware revenue from 2016 to 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage change in revenue generated from Partner C from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'As of March 29, 2019, What is Intangible assets expressed as a percentage of  Gross deferred tax assets?' --> Prediction: arithmetic --> True Label: arithmetic


In [51]:
# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9381656804733728


Prompt: Random forest classifier with TF-IDF

In [55]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split data into texts (X) and labels (y)
X = dataset_n["question"]
y = dataset_n["answer_type"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)



predictions = model.predict(X_test_tfidf)
i = 0
for text, pred, true_label in zip(X_test, predictions, y_test):
    print(f"Input: '{text}' --> Prediction: {pred} --> True Label: {true_label}")
    i += 1
    if i == 5:
        break


Input: 'What was the change in the Total non-current trade and other payables in 2019 from 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage increase / (decrease) in Fuel Oils from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the average hardware revenue from 2016 to 2018?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'What is the percentage change in revenue generated from Partner C from 2018 to 2019?' --> Prediction: arithmetic --> True Label: arithmetic
Input: 'As of March 29, 2019, What is Intangible assets expressed as a percentage of  Gross deferred tax assets?' --> Prediction: arithmetic --> True Label: arithmetic


In [56]:
# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9491124260355029


In [8]:
# import pickle
# import logging
# def save_model(data, file_name):
#     os.makedirs("../artifacts/model", exist_ok = True)
#     try:
#         file_path = f"../artifacts/model/{str(file_name)}.pkl"
#         with open(file_path, "wb") as file:
#             pickle.dump(data, file)
#         print(f"Data saved succesfully at: {file_path}")
#     except Exception as e:
#         logging.error(f"Failed to save model due to: {str(e)}")
#         raise
    
# save_model(data = vectorizer, file_name = "vectorizer_final")
# save_model(data = model, file_name = "model_final")

In [None]:
'''#NLP using tensorflow
glove->embedding 6b,100d

embedding tf.keras.layers.Embeddings
transformers 
'''


In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Split data into texts (X) and labels (y)
X = dataset_n["question"]
y = dataset_n["answer_type"]

# Encode labels as integers (if not already encoded)
y = pd.factorize(y)[0]  # Factorize the labels into integer form

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)  # Limit to 5000 features

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)

# Convert to dense format (TensorFlow prefers dense input)
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()

# Convert labels to categorical (for multi-class classification)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# Build the NLP model in TensorFlow
model = Sequential()

# First layer (input layer)
model.add(Dense(512, input_dim=X_train_tfidf.shape[1], activation='relu'))  # 512 neurons
model.add(Dropout(0.5))  # Dropout for regularization

# Second layer
model.add(Dense(256, activation='relu'))  # 256 neurons
model.add(Dropout(0.5))  # Dropout

# Third layer
model.add(Dense(128, activation='relu'))  # 128 neurons

# Output layer
model.add(Dense(y_train_cat.shape[1], activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_tfidf, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test_tfidf, y_test_cat))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_tfidf, y_test_cat)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)
y_pred_classes = np.argmax(y_pred, axis=1)  # Get the class with the highest probability

# Print first 5 predictions
i = 0
for text, pred, true_label in zip(X_test, y_pred_classes, y_test):
    print(f"Input: '{text}' --> Prediction: {pred} --> True Label: {true_label}")
    i += 1
    if i == 5:
        break


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.7597 - loss: 0.4721 - val_accuracy: 0.9237 - val_loss: 0.2124
Epoch 2/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9397 - loss: 0.1811 - val_accuracy: 0.9207 - val_loss: 0.2252
Epoch 3/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9600 - loss: 0.1123 - val_accuracy: 0.9222 - val_loss: 0.2372
Epoch 4/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9701 - loss: 0.0875 - val_accuracy: 0.9293 - val_loss: 0.2322
Epoch 5/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9786 - loss: 0.0682 - val_accuracy: 0.9314 - val_loss: 0.2506
Epoch 6/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9817 - loss: 0.0548 - val_accuracy: 0.9266 - val_loss: 0.2848
Epoch 7/10
[1m247/247[0m [32m━━

In [9]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Split data into texts (X) and labels (y)
X = dataset_n["question"]
y = dataset_n["answer_type"]

# Encode labels as integers (if not already encoded)
y = pd.factorize(y)[0]  # Factorize the labels into integer form

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)  # Limit to 5000 features

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)

# Convert to dense format (TensorFlow prefers dense input)
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()

# Convert labels to categorical (for multi-class classification)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# Reshape to 3D for LSTM
# TF-IDF data is 2D (samples, features), we need to reshape it to 3D (samples, timesteps, features)
# Let's use the number of features as the timestep for each word's representation

timesteps = 1  # We treat each word as a single timestep (each word gets a single feature vector)
X_train_3d = X_train_tfidf.reshape((X_train_tfidf.shape[0], timesteps, X_train_tfidf.shape[1]))
X_test_3d = X_test_tfidf.reshape((X_test_tfidf.shape[0], timesteps, X_test_tfidf.shape[1]))

# Build the NLP model with LSTM in TensorFlow
model = Sequential()

# First layer (input layer)
model.add(LSTM(128, input_shape=(X_train_3d.shape[1], X_train_3d.shape[2]), return_sequences=True))  # LSTM with 128 units
model.add(Dropout(0.5))  # Dropout for regularization

# LSTM Layer (the main addition)
model.add(LSTM(64))  # LSTM with 64 units
model.add(Dropout(0.5))  # Dropout for regularization

# Output layer
model.add(Dense(y_train_cat.shape[1], activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_3d, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test_3d, y_test_cat))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_3d, y_test_cat)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Make predictions on the test set
y_pred = model.predict(X_test_3d)
y_pred_classes = np.argmax(y_pred, axis=1)  # Get the class with the highest probability

# Print first 5 predictions
i = 0
for text, pred, true_label in zip(X_test, y_pred_classes, y_test):
    print(f"Input: '{text}' --> Prediction: {pred} --> True Label: {true_label}")
    i += 1
    if i == 5:
        break


  super().__init__(**kwargs)


Epoch 1/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.6930 - loss: 0.5918 - val_accuracy: 0.9169 - val_loss: 0.2378
Epoch 2/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9285 - loss: 0.2026 - val_accuracy: 0.9325 - val_loss: 0.2040
Epoch 3/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9527 - loss: 0.1529 - val_accuracy: 0.9340 - val_loss: 0.2031
Epoch 4/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9581 - loss: 0.1225 - val_accuracy: 0.9320 - val_loss: 0.2093
Epoch 5/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9626 - loss: 0.1115 - val_accuracy: 0.9311 - val_loss: 0.2167
Epoch 6/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9695 - loss: 0.1032 - val_accuracy: 0.9287 - val_loss: 0.2367
Epoch 7/10
[1m247/247[

In [7]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sample DataFrame (replace with actual dataset)
dataset_n['answer_type'] = dataset_n['answer_type'].astype('category').cat.codes  # Label encoding

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    dataset_n['question'].tolist(), dataset_n['answer_type'].tolist(), test_size=0.3, random_state=42
)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset class
class QADataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], truncation=True, padding='max_length',
            max_length=self.max_length, return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create datasets
train_dataset = QADataset(X_train, y_train, tokenizer)
test_dataset = QADataset(X_test, y_test, tokenizer)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load pre-trained BERT model
num_labels = len(set(y_train))
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 1
model.train()
for epoch in range(epochs):
    total_loss, correct = 0, 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, Accuracy: {correct/len(train_dataset):.4f}")

# Evaluation
model.eval()
y_pred, y_true = [], []
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        y_pred.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
        y_true.extend(labels.cpu().numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.2074, Accuracy: 0.9153
Test Accuracy: 0.9645


We got the accuracy results as below:
    Logistic regression with TF-IDF -> 92%
    Naive Bayes with TF-IDF->  80%
    SVC with TF-IDF->  93%
    Random Forest Classifier with TF-IDF->  95%
    NLP Model in tensor flow-> 92%
    Added LSTM layer->92%
    Prompt: make a new code using bert transformer using hugging face-> 96%
    
    
    



