# Importing all necessary libraries

In [1]:
# Data handling
import pandas as pd
import numpy as np
import pandas as pd

# Text preprocessing
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize # Import sent_tokenize as well
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages needed by the tokenizer, stopword list and
# lemmatizer imported above. The order matches the original cell; nltk reports
# when a package is already up-to-date.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Machine learning models
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Optional - Visualization
import matplotlib.pyplot as plt
import seaborn as sns


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\karel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\karel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\karel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Reading the test file

In [3]:
# Read the tab-separated test file. No header row is expected, so the two
# columns are named explicitly; malformed lines are skipped instead of raising.
# NOTE(review): the label counts shown further down include a value rendered
# as '\ufeff0', which suggests stray byte-order marks in the file - confirm
# against the raw data.
test_df = pd.read_csv(
    "Dataset_slack/testing_data_lowercase_nolabels.csv",
    sep="\t",
    header=None,
    names=["label", "text"],
    encoding="utf-8",
    on_bad_lines='skip',
)

# Keep an untouched snapshot of the raw rows before any columns are added.
test_df_1 = test_df.copy()
test_df.head()

Unnamed: 0,label,text
0,2,copycat muslim terrorist arrested with assault...
1,2,wow! chicago protester caught on camera admits...
2,2,germany's fdp look to fill schaeuble's big shoes
3,2,mi school sends welcome back packet warning ki...
4,2,u.n. seeks 'massive' aid boost amid rohingya '...


In [5]:
# How often each value occurs in the label column of the test file.
label_counts = test_df["label"].value_counts()
label_counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,9982
﻿0,2


# Performing the same preprocessing as for the training data

In [6]:
# Count missing values per column (isna is the same check as isnull).
test_df.isna().sum()

Unnamed: 0,0
label,0
text,0


In [7]:
# Text preprocessing pipeline. The notebook states that it must mirror the
# preprocessing used for the training data, so the token-level behaviour below
# is deliberately unchanged:
# 1. Tokenization - split the text into individual words.
# 2. Stopword removal - lowercase each token and drop common English words.
# 3. Punctuation/number removal - strip every non-letter character and discard
#    tokens that end up empty.
# 4. Lemmatization - convert words to their base or dictionary form.
#
# The stopword set, the lemmatizer and the regex do not depend on the input
# text, so they are built once and reused instead of being recreated for every
# row the pipeline is applied to.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')
_PIPELINE_RESOURCES = {}


def _get_pipeline_resources():
    """Return the shared (stop_words, lemmatizer) pair, building it on first use."""
    if not _PIPELINE_RESOURCES:
        _PIPELINE_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PIPELINE_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PIPELINE_RESOURCES['stop_words'], _PIPELINE_RESOURCES['lemmatizer']


def text_preprocessing_pipeline(text):
    """Clean one document and return it as a single space-separated string.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    str
        Lowercased, stopword-free, letters-only, lemmatized tokens joined by
        single spaces. Empty when no token survives the cleaning.
    """
    stop_words, lemmatizer = _get_pipeline_resources()

    # Step 1: Tokenize the text
    tokens = word_tokenize(text)

    # Step 2: Lowercase and remove stopwords. Stopwords are matched BEFORE
    # non-letters are stripped, so a token that only turns into a stopword
    # after stripping is kept.
    lowered = (word.lower() for word in tokens)
    tokens = [word for word in lowered if word not in stop_words]

    # Step 3: Keep only letters, then drop tokens that became empty
    stripped = (_NON_LETTERS.sub('', word) for word in tokens)
    tokens = [word for word in stripped if word]

    # Step 4: Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(lemmatized_tokens)


In [8]:
# Run every raw text through the cleaning pipeline and store the result in a
# new column next to the original text.
raw_text = test_df["text"]
test_df["Cleaned_text"] = raw_text.apply(text_preprocessing_pipeline)

# Load the vectorizer and model

In [9]:
import pickle

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load artifacts that you produced yourself or otherwise trust.

# Load the saved text vectorizer.
# NOTE(review): the original note called this a TF-IDF vectorizer, while the
# name `cv` hints at a CountVectorizer - confirm which one was saved.
with open('cv.pkl', 'rb') as vectorizer_file:
    cv = pickle.load(vectorizer_file)

# Load the trained classifier.
# NOTE(review): the original note said "Random Forest", but the file and
# variable name (`best_nb`) suggest Naive Bayes - confirm.
with open('best_nb.pkl', 'rb') as model_file:
    best_nb = pickle.load(model_file)


# Transform the cleaned test data

In [10]:
# Turn the cleaned text into a feature matrix with the loaded vectorizer
# (transform only - nothing is fitted on the test data here).
cleaned_docs = test_df["Cleaned_text"]
X_test_final = cv.transform(cleaned_docs)

# Predict using the loaded model

In [11]:
# Predict a class for every row of the test feature matrix with the loaded
# model, then attach the predictions to the working frame as a new column.
predictions = best_nb.predict(X_test_final)
test_df['predicted_label'] = predictions


In [13]:
# How many rows were assigned to each predicted class.
prediction_counts = test_df['predicted_label'].value_counts()
prediction_counts

Unnamed: 0_level_0,count
predicted_label,Unnamed: 1_level_1
1,5247
0,4737


In [15]:
# Copy the predictions onto the snapshot of the raw rows, which does not carry
# the intermediate cleaned-text column, then preview the result.
predicted_column = test_df['predicted_label']
test_df_1["predicted_label"] = predicted_column
test_df_1.head()

Unnamed: 0,label,text,predicted_label
0,2,copycat muslim terrorist arrested with assault...,0
1,2,wow! chicago protester caught on camera admits...,0
2,2,germany's fdp look to fill schaeuble's big shoes,1
3,2,mi school sends welcome back packet warning ki...,0
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,1


In [17]:
# Write the raw rows plus their predicted labels to a CSV file in the working
# directory, leaving out the DataFrame index.
test_df_1.to_csv(path_or_buf="test_predictions.csv", index=False)


In [18]:
# Offer the predictions file as a browser download when running on Google
# Colab. On any other kernel the `google.colab` package is not installed, so
# the download is skipped instead of failing the whole run.
try:
    from google.colab import files
except ImportError:
    print("google.colab is unavailable; skipping browser download of test_predictions.csv")
else:
    files.download("test_predictions.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>