In [1]:
%pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Download UCI repository dataset id 380 through ucimlrepo (a network call).
youtube_spam_collection = fetch_ucirepo(id=380)

# X is the CONTENT column of the features frame (the model input);
# y is the CLASS column of the targets frame (the label to predict).
X, y = (
    youtube_spam_collection.data.features['CONTENT'],
    youtube_spam_collection.data.targets['CLASS'],
)


In [3]:
# Reserve 20% of the samples for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary and weights on the training text only, then apply
# the already-fitted vectorizer to the test text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [4]:
# fit() hands back the fitted estimator, so construction and training can be
# written as a single expression.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the TF-IDF matrix of the test split.
y_pred = nb_model.predict(X_test_tfidf)


In [5]:
# Each entry pairs the printed heading with the metric function that is called
# on (y_test, y_pred); metrics are computed and printed one at a time, in order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

Accuracy: 0.9260204081632653
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.88      0.91       176
           1       0.91      0.96      0.93       216

    accuracy                           0.93       392
   macro avg       0.93      0.92      0.92       392
weighted avg       0.93      0.93      0.93       392

Confusion Matrix:
 [[155  21]
 [  8 208]]


In [13]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF vectorizer followed by multinomial naive Bayes. Keeping both steps in
# one Pipeline lets the search tune vectorizer and classifier settings together
# and fit on the raw text in X_train.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Candidate values per hyperparameter ("<step>__<param>" addresses a pipeline
# step). The full grid has 3 * 3 * 3 * 3 * 10 * 2 = 1620 combinations.
param_distributions = {
    'tfidf__max_features': [1000, 1500, 2000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__min_df': [0.01, 0.05, 0.1],
    'tfidf__max_df': [0.85, 0.9, 1.0],
    # .tolist() keeps the same ten values but as plain Python floats, so
    # best_params_ prints 0.1 instead of np.float64(0.1).
    'nb__alpha': np.linspace(0.1, 1.0, num=10).tolist(),
    'nb__fit_prior': [True, False]
}

# Evaluate 1000 of the 1620 combinations with 5-fold cross-validation
# (5000 fits in total), using every available core. The seed fixes which
# combinations are drawn.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=1000,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Mejores parámetros encontrados:", random_search.best_params_)
print("Mejor puntuación de validación:", random_search.best_score_)

# best_score_ is a cross-validation average over the training data only.
# Score the refit best pipeline on the held-out test split as well, so the
# tuned model can be compared with the baseline's test accuracy.
print("Puntuación en el conjunto de prueba:", random_search.score(X_test, y_test))


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Mejores parámetros encontrados: {'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 0.01, 'tfidf__max_features': 2000, 'tfidf__max_df': 0.85, 'nb__fit_prior': False, 'nb__alpha': np.float64(0.1)}
Mejor puntuación de validación: 0.9315945768821168
