In [1]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tokenizers import Tokenizer
import pickle

In [2]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Sethi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sethi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sethi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
og_data = pd.read_csv("data/output_chunk_7.csv")

In [4]:
og_data_98 = og_data.sample(frac = 0.98) # work data for assignment
og_data_02 = og_data.drop(og_data_98.index) # test data to be used by prof
#og_data_02.to_csv('data/og_data_02.csv', index=False)  # already created csv file

In [5]:
og_data_98.head()

Unnamed: 0,category,text
3659,lifestyle,as more people become vaccinated and borders b...
6181,sport,when i dream of fried green tomatoes this is l...
6566,unrest,singapore authorities view far right extremism...
1856,education,the mission behind the invisible cities in our...
5022,religion,a texas inmate convicted in the december 2009 ...


In [6]:
og_data_02.count() # check if 2% is kept aside

category    144
text        144
dtype: int64

In [7]:
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    # call stop words and remove them
    stop_words = stopwords.words('english') 
    removed_stopwords_text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Perform stemming
    stemmer = nltk.SnowballStemmer("english")
    return ' '.join(stemmer.stem(word) for word in removed_stopwords_text.split(' '))


og_data_98["clean_text"] = og_data_98["text"].apply(clean_text)
og_data_98.head()



Unnamed: 0,category,text,clean_text
3659,lifestyle,as more people become vaccinated and borders b...,peopl becom vaccin border begin open option tr...
6181,sport,when i dream of fried green tomatoes this is l...,dream fri green tomato liter see photo via han...
6566,unrest,singapore authorities view far right extremism...,singapor author view far right extrem emerg ma...
1856,education,the mission behind the invisible cities in our...,mission behind invis citi shoe challeng help s...
5022,religion,a texas inmate convicted in the december 2009 ...,texa inmat convict decemb fatal stab pregnant...


In [8]:
from sklearn.model_selection import train_test_split

X_nb= og_data_98["clean_text"]
y_nb= og_data_98["category"]

X_train,X_test,y_train,y_test= train_test_split(X_nb,y_nb,test_size=.2,random_state=42)

# Multinomial Naive Bayes with Bag of Words
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score,classification_report

model = make_pipeline(CountVectorizer(), MultinomialNB())

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB with Bag of Words accuracy: {accuracy:.3f}")

MultinomialNB with Bag of Words accuracy: 0.668


In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

cv_scores = cross_val_score(model, X_nb, y_nb, cv=StratifiedKFold(n_splits=3, shuffle=True), scoring='accuracy')

print(f"Cross-Validation Scores:{cv_scores}")

print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")

Cross-Validation Scores:[0.65986395 0.65008503 0.67984694]
Mean Accuracy: 0.66


In [10]:
import tensorflow as tf
import transformers

In [11]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Encode labels
le = LabelEncoder()
y_logistic = le.fit_transform(og_data_98['category'])
X_logistic = og_data_98['clean_text']

# Create pipeline
model2 = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)

# Train
model2.fit(X_logistic, y_logistic)

# Evaluate
print("Model 2 Accuracy:", model2.score(X_logistic, y_logistic))

Model 2 Accuracy: 0.9037698412698413


In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(og_data_98['clean_text'])
tokenizer.num_words = 5000
sequences = tokenizer.texts_to_sequences(og_data_98['clean_text'])
X_lstm = pad_sequences(sequences, maxlen=200)
y_lstm= le.fit_transform(og_data_98['category'])


# Build model
model3 = tf.keras.Sequential([
    tf.keras.layers.Embedding(5000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(le.classes_), activation='softmax')
])

model3.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train
history = model3.fit(
    X_lstm, y_lstm,
    batch_size=32,
    epochs=20,
    validation_split=0.2
)

# Evaluate
print("Model 3 Validation Accuracy:", max(history.history['val_accuracy']))

Epoch 1/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 77ms/step - accuracy: 0.0864 - loss: 2.8302 - val_accuracy: 0.2472 - val_loss: 2.4031
Epoch 2/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.3396 - loss: 2.0497 - val_accuracy: 0.3987 - val_loss: 1.9581
Epoch 3/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - accuracy: 0.5210 - loss: 1.5081 - val_accuracy: 0.4433 - val_loss: 1.9503
Epoch 4/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.6258 - loss: 1.1596 - val_accuracy: 0.4455 - val_loss: 1.9066
Epoch 5/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.7176 - loss: 0.9084 - val_accuracy: 0.4731 - val_loss: 1.9621
Epoch 6/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.7945 - loss: 0.6640 - val_accuracy: 0.4993 - val_loss: 2.0777
Epoch 7/20
[1m177/

In [14]:
import pickle
from sklearn.preprocessing import LabelEncoder

labels = og_data_98['category'] # Add your actual categories
label_encoder = LabelEncoder()
label_encoder.fit(labels)

# Save the trained label encoder
pickle.dump(label_encoder, open('label_encoder.pkl', 'wb'))

In [None]:

import joblib
from tensorflow.keras.models import save_model,load_model 


# Model 1: Original Model (Naive Bayes)
joblib.dump(model, 'NaiveBayes.pkl')


# Model 2: Logistic Regression
joblib.dump(model2, 'logreg_model.pkl')
pickle.dump(model2, open('model.pkl','wb'))


# Model 3: LSTM
model3.save('lstm_model.h5')



# Loading function for all models
def load_all_models():
    model1 = joblib.load('original_model.pkl')
    model2 = joblib.load('logreg_model.pkl')
    model3 = load_model('lstm_model.h5')
    
    return model1, model2, model3

# Prediction function for all models
def predict_all_models(text, model1, model2, model3):
    cleaned = clean_text(text)
    
    # Model 1 (Naive Bayes)
    pred1 = model1.predict([cleaned])[0]
    
    # Model 2 (Logistic Regression)
    pred2 = model2.predict([cleaned])[0]
    
    # Model 3 (LSTM)
    sequence = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequence, maxlen=200)
    pred3 = model3.predict(padded).argmax(axis=1)[0]
    
    return {
        'NaiveBayes': le.inverse_transform([pred1])[0],
        'Logistic Regression': le.inverse_transform([pred2])[0],
        'LSTM': le.inverse_transform([pred3])[0]
    }




In [16]:
# Load models once at startup
model1, model2, model3= load_all_models()

# Make predictions
sample_text = "Archaeologists discover ancient temple ruins under city center"
predictions = predict_all_models(sample_text, model1, model2, model3)

print("Predictions:", predictions)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395ms/step
Predictions: {'NaiveBayes': 'arts', 'Logistic Regression': 'arts', 'LSTM': 'politics'}
