In [1]:
# from url_utils import custom_preprocessor, custom_tokenizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os
import sys

In [2]:
# Read the dataset from disk; the training cell uses its 'text' and 'label' columns.
DATA_PATH = '../data/urls.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Inputs come from the 'text' column, targets from the 'label' column.
X, y = df['text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two-step model: token counts (CountVectorizer defaults) feeding a
# multinomial Naive Bayes classifier. Wrapping both in a Pipeline means the
# vectorizer vocabulary is fitted on the training split only.
vectorizer = CountVectorizer()
classifier = MultinomialNB()
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])

# Fit on the training split, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Headline metric: overall accuracy on the held-out split, shown as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 and support.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 92.20%

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93     89157
           1       0.95      0.88      0.91     77983

    accuracy                           0.92    167140
   macro avg       0.92      0.92      0.92    167140
weighted avg       0.92      0.92      0.92    167140



In [4]:
# Save the fitted pipeline (vectorizer + classifier together) as one joblib file.
model_path = '../models_saved/vectorizer_and_multinomialNB.joblib'
# joblib.dump does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail here after training finishes.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

['../models_saved/vectorizer_and_multinomialNB.joblib']