# 1. Data Collection

In [1]:
pip install google-api-python-client


Defaulting to user installation because normal site-packages is not writeable
Collecting google-api-python-client
  Obtaining dependency information for google-api-python-client from https://files.pythonhosted.org/packages/8f/a7/817a0fc24cf948edf3ce1e3220f82d82eccbe3f96d50eba89392c8d673dd/google_api_python_client-2.141.0-py2.py3-none-any.whl.metadata
  Downloading google_api_python_client-2.141.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client)
  Obtaining dependency information for httplib2<1.dev0,>=0.19.0 from https://files.pythonhosted.org/packages/a8/6c/d2fbdaaa5959339d53ba38e94c123e4e84b8fbc4b84beb0e70d7c1608486/httplib2-0.22.0-py3-none-any.whl.metadata
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 (from google-api-python-client)
  Obtaining dependency information for google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 from https://files.pythonhos



# Fetching Comments from Yutube using Goolgle Cloud API

In [2]:
import os
import googleapiclient.discovery
import csv

# API key from Google Cloud Console
api_key = 'Your Own Api'  ]

# Set up the YouTube API client
youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=api_key)

def get_comments(video_id):
    """Fetch comments from a YouTube video."""
    comments = []
    next_page_token = None

    while True:
        # Request comments using YouTube Data API
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            maxResults=100  # Number of comments to fetch per request
        )
        response = request.execute()

        # Extract comments from the API response
        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textOriginal']
            comments.append(comment)

        # Check if there is another page of comments
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return comments

def fetch_comments_from_multiple_videos(video_ids):
    """Fetch comments from multiple YouTube videos."""
    all_comments = []
    for video_id in video_ids:
        print(f"Fetching comments from video: {video_id}")
        comments = get_comments(video_id)
        all_comments.extend(comments)
    
    return all_comments

def save_comments_to_csv(comments, filename):
    """Save comments to a CSV file."""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        comment_writer = csv.writer(csvfile)
        comment_writer.writerow(['Comment'])  # Header
        for comment in comments:
            comment_writer.writerow([comment])

# Extracted video IDs from video Links
video_ids = [
    'tZ3D4I7aSww',
    '2_mWEL0Nuk8',
    'MjxGwfa5lxw',
    'Xb_pvujKCPE'
]

# Fetch comments
all_comments = fetch_comments_from_multiple_videos(video_ids)

# Saving comments to a CSV file
save_comments_to_csv(all_comments, 'youtube_comments.csv')

print(f"Total comments fetched: {len(all_comments)}")


Fetching comments from video: tZ3D4I7aSww
Fetching comments from video: 2_mWEL0Nuk8
Fetching comments from video: MjxGwfa5lxw
Fetching comments from video: Xb_pvujKCPE
Total comments fetched: 94409


# Importing All Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords

# Load the dataset


In [5]:
file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/Data/youtube_comments.csv'
comments_df = pd.read_csv(file_path)

# Display dataset shape and a preview of the data

In [8]:
print(f"The shape of the dataset is: {comments_df.shape}")
print(comments_df.head(8))
print(comments_df.tail(7))

The shape of the dataset is: (94409, 1)
                                             Comment
0    I love Pakistan Army from the core of my heart.
1                               Operation hinterland
2  Aik He Cherah hai jesay Dekh Muje Khud Bakhud ...
3                                    I love pak army
4                             حسبنا الله ونعم الوكيل
5              الحمدلله پاکستان الله کی عظیم نعمت ھے
6  Nafrt ho gai hain ap sy. Kya bna dea Ic mulk P...
7         Israili army ke liye kam krti hai lol army
      Comment
94402    ❤❤❤❤
94403   First
94404   ❤❤❤❤❤
94405   First
94406      ❤❤
94407       1
94408   First


# Define Roman Urdu stopwords

In [11]:
roman_urdu_stopwords = [
    'aur', 'hai', 'ka', 'ke', 'ki', 'ko', 'main', 'mein', 'ne', 'se', 'to', 'wo',
    'ye', 'ho', 'bhi', 'par', 'nahi', 'kar', 'raha', 'rahi', 'tha', 'the', 'is', 'us',
    'mein', 'tum', 'yeh', 'woh', 'ap', 'apna', 'apne', 'mera', 'meri', 'mere', 'tumhara',
    'hum', 'kya', 'kon', 'kis', 'kab', 'kaha', 'kyun', 'jab', 'tab', 'ab', 'bas', 'tak',
    'hain', 'kuch', 'sab', 'jese', 'lekin', 'jabtak', 'magar', 'phir', 'kehti', 'kehte',
    'rakh', 'kiya', 'gi', 'ge', 'kitna', 'kitni', 'aise', 'jese', 'janab', 'zara', 'abhi',
    'ager', 'yahan', 'wahan', 'poora', 'isliye', 'kaun', 'karna', 'karti', 'karte', 'karta',
    'karti', 'un', 'in', 'tumhein', 'mujhe', 'tumse', 'mujhse', 'inka', 'unka', 'unki', 
    'inki', 'uski', 'iski', 'unka', 'inka', 'unka', 'lekin', 'magar', 'kisi', 'aur',
    'wah', 'yah', 'jana', 'ayega', 'ayegi', 'jaega', 'jaegi', 'acha', 'achi', 'hota',
    'hoti', 'hona', 'zaroor', 'bilkul', 'kitni', 'zyada', 'bohot', 'itna', 'kam', 'acha',
    'rahe', 'rehte', 'jati', 'jata', 'jatay', 'chal', 'chalti', 'chalte', 'raho', 'rakhna',
    'yehi', 'is', 'aise', 'aise', 'baat', 'kuchh', 'aise', 'koji', 'wala', 'wale', 'wali',
    'valay', 'jawaab', 'waqt', 'jaisa', 'jaisi', 'kaisa', 'kaisi', 'kaam', 'hona', 'aesa',
    'sabse', 'sabko', 'sabhi', 'kuchh', 'saath', 'kisne', 'kis', 'kisi', 'isme', 'ismein',
    'unka', 'unka', 'wahi', 'yahi', 'wahan', 'yahan', 'kabhi', 'kaun', 'dekho', 'dekhte',
    'batao', 'bata', 'pata', 'se', 'kyun', 'koi', 'kuch', 'hamari', 'hamara', 'hamare',
    'karlo', 'karun', 'chahie', 'laga', 'laga', 'laga', 'shuru', 'rahne', 'rahna', 'karna',
    'kartay', 'karni', 'chuki', 'rehne', 'na', 'nahi', 'hai', 'hain'
]

# Save Roman Urdu stopwords to a CSV file

In [14]:
stopwords_df = pd.DataFrame(roman_urdu_stopwords, columns=['Stopword'])
stopwords_df.to_csv('C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/roman_urdu_stopwords.csv', index=False)
print("Roman Urdu stopwords have been saved to 'roman_urdu_stopwords.csv'.")

Roman Urdu stopwords have been saved to 'roman_urdu_stopwords.csv'.


# Load English stopwords and Creating a Merged datafame of Roman Urdu and English

In [16]:
roman_urdu_stopwords = pd.read_csv('C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/roman_urdu_stopwords.csv')
nltk.download('stopwords')
english_stopwords = set(stopwords.words('english'))
combined_stopwords = set(english_stopwords).union(set(roman_urdu_stopwords['Stopword']))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


# Function to clean text by removing special characters, signs and Stopwords & Function to check if a comment is entirely in English &  Function to detect and filter out comments containing non-Latin characters

In [18]:
def preprocess_text(comment):
    """Preprocess the comment by removing special characters, converting to lowercase, and standardizing spaces."""
    comment = re.sub(r'[^a-zA-Z0-9\s]', '', comment)  # Remove special characters
    comment = comment.lower()  # Convert to lowercase
    comment = re.sub(r'\s+', ' ', comment).strip()  # Remove extra spaces and strip leading/trailing spaces
    return comment

def remove_stopwords(comment):
    """Remove stopwords from the comment."""
    return ' '.join([word for word in comment.split() if word not in combined_stopwords])

def is_english_comment(comment):
    """Check if a comment is entirely in English."""
    words = comment.split()
    if len(words) == 0:  # If the comment is empty, return False
        return False
    num_english_words = sum(1 for word in words if word.lower() in english_stopwords)
    # Checking if the proportion of English words is above 90% or all words are English
    return num_english_words == len(words) or num_english_words / len(words) > 0.9


def contains_non_latin(comment):
    """Detect and filter out comments containing non-Latin characters."""
    non_latin_pattern = re.compile(r'[^\u0000-\u007F]+')
    return bool(non_latin_pattern.search(comment))

# 2. Pre- Process Data

1. Fill NaN values with empty strings, clean text, remove stopwords, remove duplicates
2. Remove empty lines
3. Remove complete english scripts & Filter out comments containing non-Latin characters
4. Filter Noise and Outliers

Preprocess comments: fill NaN values with empty strings , clean text, convert to lowercase, and standardize spaces

In [23]:
comments_df['Comment'] = comments_df['Comment'].fillna('') 
comments_df['Cleaned_Comment'] = comments_df['Comment'].apply(preprocess_text) 

Remove stopwords

In [26]:
comments_df['Cleaned_Comment'] = comments_df['Cleaned_Comment'].apply(remove_stopwords)

Remove duplicate comments

In [29]:
comments_df = comments_df.drop_duplicates(subset='Cleaned_Comment')

Remove comments that are entirely in English

In [32]:
comments_df = comments_df[~comments_df['Cleaned_Comment'].apply(is_english_comment)]

Filter out comments containing non-Latin characters

In [35]:
comments_df = comments_df[~comments_df['Cleaned_Comment'].apply(contains_non_latin)]

Remove empty lines (comments that are completely blank or contain only whitespace)

In [38]:
comments_df = comments_df[comments_df['Cleaned_Comment'].str.strip() != '']

Remove comments with very short length or repetitive characters

regex pattern to find sequences of repeated characters (three or more of the same character)



In [41]:
comments_df = comments_df[comments_df['Cleaned_Comment'].str.len() > 2]  # Example threshold: length > 2

comments_df = comments_df[~comments_df['Cleaned_Comment'].str.contains(r'(.)\1{2,}', regex=True)]


  comments_df = comments_df[~comments_df['Cleaned_Comment'].str.contains(r'(.)\1{2,}', regex=True)]


 Remove very short and very long comments 
 

In [44]:
comments_df = comments_df[comments_df['Cleaned_Comment'].str.len() > 5]  # Example: keep comments longer than 5 characters

comments_df = comments_df[comments_df['Cleaned_Comment'].str.len() < 500]  # Example: remove comments longer than 500 characters

In [46]:
comments_df.rename(columns={'Cleaned_Comment': 'Roman Urdu Comments'}, inplace=True)
comments_df['Toxic comments'] = '' 

# Display the shape and a preview of the cleaned dataset

In [49]:
print(f"Shape after removing duplicates, stopwords, English, and non-Latin character comments, and empty lines: {comments_df.shape}")
print(comments_df.head(10))

Shape after removing duplicates, stopwords, English, and non-Latin character comments, and empty lines: (41218, 3)
                                              Comment  \
0     I love Pakistan Army from the core of my heart.   
1                                Operation hinterland   
2   Aik He Cherah hai jesay Dekh Muje Khud Bakhud ...   
3                                     I love pak army   
6   Nafrt ho gai hain ap sy. Kya bna dea Ic mulk P...   
7          Israili army ke liye kam krti hai lol army   
8                                  Qadyany--fuj-gulam   
11  Masha ALLAH Masha ALLAH.......kitni achi army hai   
12                          🇵🇰 Pakistan army Zindabad   
13  Illegal activities are not allowed in Pakistan...   

                                  Roman Urdu Comments Toxic comments  
0                       love pakistan army core heart                 
1                                operation hinterland                 
2   aik cherah jesay dekh muje khud bakhud g

In [51]:
refined_comments_df = comments_df.drop(columns=['Comment', 'Toxic comments'])

# Saving the refined DataFrame to a new CSV file
refined_output_file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/refined_youtube_comments.csv'
refined_comments_df.to_csv(refined_output_file_path, index=False)

print(f"Refined comments saved successfully to '{refined_output_file_path}'.")

Refined comments saved successfully to 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/refined_youtube_comments.csv'.


In [55]:
toxic_words = ['beghairat','nafarat','nafrt', 'chutiya', 'haramzada', 'harami', 'ghaleez', 'gandu', 'kameena', 'laanti', 'lanat', 
    'bhadwa', 'behnchod', 'madarchod', 'ullu ka pattha', 'kutta', 'kutti', 'bhenchod', 'bhosda', 
    'haramkhor', 'choot', 'jaahil', 'paagal', 'kuttiya', 'zaleel', 'ghatia', 'nikamma', 'namakharam', 
    'badtameez', 'ghatiya', 'besharam', 'bakwas', 'nalayak', 'fazool', 'zameerfarosh', 'badtameez', 
    'kutta kamina', 'jhootay', 'fraudia', 'bakwas', 'chichora', 'kanjar', 'bhikari', 'charsi', 
    'ghaleez', 'begumaat', 'haram ki aulad', 'jaahil log', 'tatti', 'chootia', 'haramzada', 'bhaand', 
    'chalu', 'chamaar', 'chamdi', 'dehaati', 'gawar', 'nalayak', 'paagal', 'pehngah', 'raand', 
    'rascal', 'randi', 'saala', 'sarkari kutta', 'tharki', 'chirkut', 'kachra', 'dalla', 'bhains', 
    'bachi', 'chalu aurat', 'chikna', 'choor', 'dheet', 'farzi', 'ghatiya', 'kanjar', 'katwa', 
    'khoon kharaba', 'khoon peena', 'kuttay', 'kutti', 'nalayak', 'nasli', 'neech', 'paapi', 
    'phuddu', 'phuski', 'rakhail', 'rasgulla', 'saand', 'sanki', 'sasti', 'thook', 'thukna', 
    'waarsi', 'zillat', 'zinda lash', 'zinda murda', 'zinda qabristan', 'zubaan', 'zuban darazi', 
    'zuban samajhna', 'zuban daraz', 'zinda laash', 'zyada dimagh na kharab kar', 'tumse kuch nahi hoga', 
    'chal nikal', 'mujhe bakwas mat suna', 'tere baap ka naukar nahi hoon', 'ghalat mat samajh', 
    'aurat kya samjhti hai apne aap ko', 'teri maa ka', 'tere baap ka', 'teri behen ka', 'teri biwi ka', 
    'tu kya samjhta hai', 'teri aukaat kya hai', 'chal hatt', 'chal bhag', 'bakwas kar raha hai', 
    'chor hai tu', 'dheela hai tu', 'tera dimagh kharab hai', 'tere dimagh mein kuch nahi', 
    'tere bas ki baat nahi', 'tu kuch nahi kar sakta', 'tera kuch nahi hoga', 'chal bhaag yahan se', 
    'chal nikal yahan se', 'chal tu apna kaam kar', 'chal apna raasta naap', 'tere liye yahaan kuch nahi hai', 
    'tere ko koi nahi puchhega', 'tera kuch nahi hone wala', 'chal apni shakal dekh', 'chal tu apna kaam kar', 
    'chal apni aukaat dekh', 'tu zindagi mein kuch nahi kar sakta', 'chal apni shakal dekh', 'chal tu apna kaam kar', 
    'chal tu kuch nahi kar sakta','Shaytans hugging','kusra','kadra shemale','kadra','shemale', 'chal tu zindagi mein kuch nahi kar sakta',
]

toxic_words_df = pd.DataFrame(toxic_words, columns=['Toxic_Roman_urdu_Words'])

file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/Toxic_Roman_urdu_Words.csv'
toxic_words_df.to_csv(file_path, index=False)

print("CSV file saved successfully with the toxic Roman Urdu words.")


CSV file saved successfully with the toxic Roman Urdu words.


In [57]:
# Load the toxic comments from the Excel sheet
file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/Roman Urdu Toxic Dataset.xlsx'
excel_data = pd.read_excel(file_path)

# Filter comments marked as toxic
toxic_comments_from_excel = excel_data[excel_data['toxic'] == 1]['Text'].tolist()

# Combine the toxic comments with the existing toxic words list
combined_toxic_words = toxic_words + toxic_comments_from_excel

# Remove duplicates and refine the list
combined_toxic_words = list(set(combined_toxic_words))

# Convert the final list to a DataFrame
toxic_words_df = pd.DataFrame(combined_toxic_words, columns=['Toxic_Roman_Urdu_Words'])

# Save the updated dataset to a CSV file
output_file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/Updated_Toxic_Roman_urdu_Words.csv'
toxic_words_df.to_csv(output_file_path, index=False)

print("CSV file updated successfully with toxic Roman Urdu words.")

CSV file updated successfully with toxic Roman Urdu words.


In [59]:
# Load the updated toxic words list
toxic_words_file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/Updated_Toxic_Roman_urdu_Words.csv'
toxic_words_df = pd.read_csv(toxic_words_file_path)
toxic_words_list = toxic_words_df['Toxic_Roman_Urdu_Words'].tolist()

# Load the refined YouTube comments
comments_file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/refined_youtube_comments.csv'
comments_df = pd.read_csv(comments_file_path)

# Function to check if any toxic words appear in the comment
def is_toxic(comment):
    for word in toxic_words_list:
        if word in comment:
            return 1  # Mark as toxic
    return 0  # Mark as non-toxic

# Apply the function to the comments
comments_df['Toxic'] = comments_df['Roman Urdu Comments'].apply(is_toxic)

# Save the updated DataFrame to a new CSV file
output_file_path = 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/updated_youtube_comments.csv'
comments_df.to_csv(output_file_path, index=False)

print(f"Updated comments saved successfully to '{output_file_path}'.")

Updated comments saved successfully to 'C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/updated_youtube_comments.csv'.


# 3. Feature Extraction

Initialize the TF-IDF Vectorizer for Unigrams and Bigrams and trigram

In [60]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(comments_df['Roman Urdu Comments'])
y = comments_df['Toxic']  # The target variable is 'Toxic' (1 for toxic, 0 for non-toxic)

# 4. Train & Test Data Split (70% Training, 30% Testing)

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Machine Learning Models

In [None]:
# Initialize Models
nb_model = MultinomialNB(class_prior=[0.3, 0.7])
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Train Models
nb_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 6. Evaluate Models

In [27]:
# Naive Bayes Evaluation
nb_preds = nb_model.predict(X_test)
nb_confusion = confusion_matrix(y_test, nb_preds)
nb_accuracy = accuracy_score(y_test, nb_preds)
nb_precision = precision_score(y_test, nb_preds, zero_division=1)
nb_recall = recall_score(y_test, nb_preds, zero_division=1)
nb_f1 = f1_score(y_test, nb_preds, zero_division=1)

In [28]:
# Decision Tree Evaluation
dt_preds = dt_model.predict(X_test)
dt_confusion = confusion_matrix(y_test, dt_preds)
dt_accuracy = accuracy_score(y_test, dt_preds)
dt_precision = precision_score(y_test, dt_preds, average='weighted')
dt_recall = recall_score(y_test, dt_preds, average='weighted')
dt_f1 = f1_score(y_test, dt_preds, average='weighted')

In [30]:
# Random Forest Evaluation
rf_preds = rf_model.predict(X_test)
rf_confusion = confusion_matrix(y_test, rf_preds)
rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds, average='weighted')
rf_recall = recall_score(y_test, rf_preds, average='weighted')
rf_f1 = f1_score(y_test, rf_preds, average='weighted')

In [31]:
# 7. Print Evaluation Results
print("Naive Bayes Confusion Matrix:")
print(nb_confusion)
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}\n")
print(f"Naive Bayes Precision: {nb_precision:.4f}")
print(f"Naive Bayes Recall: {nb_recall:.4f}")
print(f"Naive Bayes F1-Score: {nb_f1:.4f}")

print("Decision Tree Confusion Matrix:")
print(dt_confusion)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}\n")
print(f"Decision Tree Precision: {dt_precision:.4f}")
print(f"Decision Tree Recall: {dt_recall:.4f}")
print(f"Decision Tree F1-Score: {dt_f1:.4f}")

print("Random Forest Confusion Matrix:")
print(rf_confusion)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}\n")
print(f"Random Forest Precision: {rf_precision:.4f}")
print(f"Random Forest Recall: {rf_recall:.4f}")
print(f"Random Forest F1-Score: {rf_f1:.4f}")



Naive Bayes Confusion Matrix:
[[9577 2619]
 [  37  133]]
Naive Bayes Accuracy: 0.7852

Naive Bayes Precision: 0.0483
Naive Bayes Recall: 0.7824
Naive Bayes F1-Score: 0.0910
Decision Tree Confusion Matrix:
[[12191     5]
 [   29   141]]
Decision Tree Accuracy: 0.9973

Decision Tree Precision: 0.9972
Decision Tree Recall: 0.9973
Decision Tree F1-Score: 0.9971
Random Forest Confusion Matrix:
[[12196     0]
 [  122    48]]
Random Forest Accuracy: 0.9901

Random Forest Precision: 0.9902
Random Forest Recall: 0.9901
Random Forest F1-Score: 0.9874


In [32]:
evaluation_results = {
    'Model': ['Naive Bayes', 'Decision Tree', 'Random Forest'],
    'Accuracy': [nb_accuracy, dt_accuracy, rf_accuracy],
    'Precision': [nb_precision, dt_precision, rf_precision],
    'Recall': [nb_recall, dt_recall, rf_recall],
    'F1-Score': [nb_f1, dt_f1, rf_f1],
    'Confusion Matrix': [nb_confusion, dt_confusion, rf_confusion]
}

evaluation_df = pd.DataFrame(evaluation_results)
print(evaluation_df)

# Save results to a CSV file
evaluation_df.to_csv('C:/Users/LENOVO/Desktop/BC200404125 FYP 2024/data/model_evaluation_results.csv', index=False)
print("Model evaluation results saved to 'model_evaluation_results.csv'.")

           Model  Accuracy  Precision    Recall  F1-Score  \
0    Naive Bayes  0.785218   0.048328  0.782353  0.091034   
1  Decision Tree  0.997251   0.997189  0.997251  0.997147   
2  Random Forest  0.990134   0.990232  0.990134  0.987398   

            Confusion Matrix  
0  [[9577, 2619], [37, 133]]  
1    [[12191, 5], [29, 141]]  
2    [[12196, 0], [122, 48]]  
Model evaluation results saved to 'model_evaluation_results.csv'.


In [33]:
import joblib


In [34]:
joblib.dump(nb_model, 'naive_bayes_model.pkl')


joblib.dump(dt_model, 'decision_tree_model.pkl')


joblib.dump(rf_model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [35]:
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [1]:
import webbrowser


file_path = r"C:/Users/LENOVO/Desktop/BC200404125 FYP 2024\result\main.html"

# Open the HTML file in the default web browser
webbrowser.open(file_path)

True