# Importing all necessary libraries

In [1]:
# Data handling
import pandas as pd
import numpy as np
import pandas as pd

# Text preprocessing
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize # Import sent_tokenize as well
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used below: tokenizer models ('punkt', 'punkt_tab'),
# the stopword list and the WordNet data needed by the lemmatizer.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Machine learning models
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Optional - Visualization
import matplotlib.pyplot as plt
import seaborn as sns


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Reading the test file

In [18]:
# Load the validation set that the saved model will be scored on.
test_df = pd.read_csv("/content/validation_data.csv")

# Keep an untouched copy with every original column; the predictions are
# attached to this copy later so the exported CSV still contains all columns.
test_df_1 = test_df.copy()

# Preview the first rows. A bare expression is only rendered when it is the
# last statement of the cell, so the preview has to come last.
test_df.head()

In [3]:
# Count how often each label value occurs in the loaded file.
label_counts = test_df["label"].value_counts()
label_counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,4956


# Performing the same preprocessing as on the training data

In [4]:
# Drop the metadata columns that are not needed for prediction.
# `columns=` already identifies the axis, so the redundant `axis=1` is omitted.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed from test_df.
test_df = test_df.drop(columns=["title", "subject", "date"], errors="ignore")

In [18]:
# Number of missing values per column (isna is the canonical name of isnull).
test_df.isna().sum()

Unnamed: 0,0
label,0
text,0
Cleaned_text,0
predicted_label,0


In [5]:
# Text preprocessing pipeline (kept equivalent to the one applied to the training data):
# 1. Tokenization - Split the text into individual words.
# 2. Stopword Removal - Remove common English words that don't carry much meaning.
# 3. Punctuation/Number Removal - Keep only alphabetic characters.
# 4. Lemmatization - Convert words to their base or dictionary form.

# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z]')

# Lazily filled cache so the stopword set and the lemmatizer are created once
# instead of once per row when the function is applied to a whole column.
_PREPROCESSING_CACHE = {}


def _get_preprocessing_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESSING_CACHE:
        # Build both objects before storing them so a failure (e.g. a missing
        # NLTK corpus) never leaves a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESSING_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['lemmatizer']


def text_preprocessing_pipeline(text):
    """Clean one raw document and return it as a space-separated token string.

    Steps: tokenize, lowercase, drop English stopwords, strip every
    non-letter character, discard tokens that became empty, lemmatize.

    Parameters
    ----------
    text : str or None or float
        Raw document text. ``None`` and ``NaN`` (how pandas represents a
        missing cell) are treated as an empty document.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; ``''`` when the input
        is missing or nothing survives the filtering.
    """
    # Missing values would otherwise raise inside word_tokenize.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _get_preprocessing_resources()

    # Steps 1 + 2: tokenize, lowercase, and remove stopwords
    tokens = [word.lower() for word in word_tokenize(text)]
    tokens = [word for word in tokens if word not in stop_words]

    # Step 3: keep only letters, then drop tokens that are now empty
    tokens = [_NON_LETTER_PATTERN.sub('', word) for word in tokens]
    tokens = [word for word in tokens if word]

    # Step 4: lemmatization
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)


In [6]:
# Run every raw document through the preprocessing pipeline, element by element.
test_df["Cleaned_text"] = test_df["text"].map(text_preprocessing_pipeline)

# Load the vectorizer and model

In [7]:
import pickle

# NOTE: unpickling can execute arbitrary code, so only load artifact files
# that come from a trusted source.

# Restore the TF-IDF vectorizer from disk
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# Restore the trained Random Forest model from disk
with open('random_forest_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)


# Transform the cleaned test data

In [8]:
# Vectorize the cleaned documents with the loaded TF-IDF vectorizer
# (transform only; the vectorizer is not fitted again here).
cleaned_corpus = test_df["Cleaned_text"]
X_test_final = tfidf_vectorizer.transform(cleaned_corpus)

# Predict using the loaded model

In [23]:
# Score the vectorized documents with the loaded model and store each
# prediction next to its source row.
predictions = rf_model.predict(X_test_final)
test_df = test_df.assign(predicted_label=predictions)


In [24]:
# Display the predicted label of every row.
test_df.loc[:, 'predicted_label']

Unnamed: 0,predicted_label
0,1
1,1
2,1
3,1
4,1
...,...
4951,0
4952,0
4953,0
4954,0


In [25]:
# Copy the predictions onto the full-column frame; values are matched by row index.
test_df_1 = test_df_1.assign(predicted_label=test_df['predicted_label'])

In [16]:


# Write the frame, including the predicted labels, to CSV without the row index.
predictions_path = "test_predictions.csv"
test_df_1.to_csv(predictions_path, index=False)
