In [14]:
import streamlit as st
import pandas as pd
import nltk
import requests
from bs4 import BeautifulSoup
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Fetch the NLTK stop-word corpus that the text preprocessing relies on.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load both datasets and label them: 0 = true news, 1 = fake news.
verdaderas = pd.read_csv("onlytrue1000.csv")
falsas = pd.read_csv("onlyfakes1000.csv")
verdaderas["Falsa"] = 0
falsas["Falsa"] = 1

# Stack the two sets, strip spaces from the column names and expose the raw
# text column under the name "Titulo".
df = pd.concat([verdaderas, falsas], axis=0).reset_index(drop=True)
df.columns = df.columns.str.replace(' ', '')
df = df.rename(columns={"text": "Titulo"})
# NOTE: "Falsa" intentionally stays an integer 0/1 column. DataFrame.astype
# returns a new frame, so a cast only takes effect when its result is assigned;
# a bare `df.astype(...)` expression in the middle of a cell does nothing.

# Spanish stop words from NLTK (de-duplicated), extended with extra
# prepositions, adverbs, accent-less variants and weekday names.
# Kept as a list, as before.
stop_words = list(set(stopwords.words('spanish')))
stop_words.extend(['según', 'tras', 'cabe', 'bajo', 'durante', 'mediante', 'so', 'toda', 'todas', 'cada', 'me', 'después', 'despues', 'segun', 'solo', 'sido', 'estan', 'lunes', 'martes', 'miércoles', 'jueves', 'viernes'])

def preprocess(text):
    """Tokenize ``text`` and return the list of tokens worth keeping.

    The text is split with ``gensim.utils.simple_preprocess``. A token is kept
    only when it is longer than three characters and appears neither in
    gensim's built-in ``STOPWORDS`` set nor in the module-level ``stop_words``
    list.

    Args:
        text: Raw text to clean.

    Returns:
        list: The surviving tokens, in their original order.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3
        and word not in gensim_stopwords
        and word not in stop_words
    ]

# Clean every text, then rebuild one space-separated string per row so it can
# be fed to the vectorizer.
df['Titulo limpio'] = df['Titulo'].apply(preprocess)
df['TituloDefinitivo'] = df['Titulo limpio'].apply(" ".join)
X, y = df['TituloDefinitivo'], df['Falsa']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the vectorizer on the training split only and build its
# document-term matrix.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)

# Train the classifier (logistic regression).
model_3 = LogisticRegression()
model_3.fit(X_train_dtm, y_train)

# Persist the fitted model and vectorizer so they can be reloaded later.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_3, model_file)
with open('vectorize.pkl', 'wb') as vectorizer_file:
    pickle.dump(vect, vectorizer_file)


In [None]:
def predict_fake_news(url, timeout=10):
    """Download a news article and classify it as "Real" or "Fake".

    The page is fetched, its ``<h1 class="title">`` headline (if present) and
    the text of every ``<p>`` element are extracted, the combined text is
    cleaned with ``preprocess``, vectorized with the module-level ``vect`` and
    classified with the module-level ``model_3``.

    Args:
        url: Address of the article to classify.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        str: ``"Real"`` when the model predicts label 0, ``"Fake"`` when it
        predicts label 1.

    Raises:
        requests.exceptions.RequestException: On connection problems, on
            timeout, or when the server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the notebook forever, and
    # raise_for_status() prevents an HTTP error page from being classified as
    # if it were the article.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the headline (looked up once) and the body paragraphs.
    title_tag = soup.find('h1', class_='title')
    title = title_tag.get_text(strip=True) if title_tag is not None else ""
    article_text = " ".join(p.get_text(strip=True) for p in soup.find_all('p'))

    # Join with a space so the last word of the title and the first word of
    # the body are not fused into a single bogus token.
    completed_text = f"{title} {article_text}".strip()

    # Clean the text the same way the training data was cleaned.
    clean_text_joined = " ".join(preprocess(completed_text))

    # Vectorize the single document and predict its label.
    test_dtm = vect.transform([clean_text_joined])
    y_pred_test = model_3.predict(test_dtm)

    # Translate the numeric label into its human-readable name.
    return {0: "Real", 1: "Fake"}[int(y_pred_test[0])]