In [2]:
import csv
import time
import random
from datetime import datetime, timedelta
from pygooglenews import GoogleNews
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from concurrent.futures import ThreadPoolExecutor

# Descargar recursos de NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Preguntar al usuario por el nombre de la empresa y el año
empresa = input("Introduce el nombre de la empresa a investigar: ")
year = int(input("Introduce el año para la búsqueda de noticias (por ejemplo, 2023): "))

# Función para obtener títulos de noticias
def get_titles(search, lang, from_date, to_date):
    gn = GoogleNews(lang=lang)
    stories = []
    try:
        search = gn.search(search, from_=from_date, to_=to_date)
        newsitem = search['entries']
        for item in newsitem:
            story = {
                'title': item.title,
                'link': item.link,
                'date': item.published
            }
            stories.append(story)
    except Exception as e:
        print(f"Error al obtener noticias: {e}")
    return stories

# Función para obtener títulos en diferentes intervalos y lenguajes en paralelo
def get_all_titles(search, year):
    all_stories = []
    if year == datetime.now().year:
        end_date = datetime.now()
        start_date = end_date - timedelta(days=365)
    else:
        start_date = datetime(year, 1, 1)
        end_date = datetime(year, 12, 31)

    date_ranges = []
    while start_date < end_date:
        from_date = start_date.strftime('%m/%d/%Y')
        to_date = (start_date + timedelta(days=30)).strftime('%m/%d/%Y')
        date_ranges.append((from_date, to_date))
        start_date += timedelta(days=30)

    def fetch_news(lang):
        stories = []
        for date_range in date_ranges:
            stories.extend(get_titles(search, lang, *date_range))
            time.sleep(random.uniform(1, 5))
        return stories

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = list(executor.map(fetch_news, ['es', 'en']))

    for result in results:
        all_stories.extend(result)

    return all_stories

# Función para preprocesar el texto
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Detectar idioma y configurar stopwords y lemmatizer
    lang = 'spanish' if detect(text) == 'es' else 'english'
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words(lang))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    processed_text = ' '.join(tokens)
    return processed_text

# Listas de palabras para etiquetado
positive_words = ["beneficio", "ganancia", "crecimiento", "seguro", "safe", "expansión", "mejora", "regalo", "gift", "logro", "logra", "dona", "otorga", "proporciona", "sostenible", "desarrollo", "amistoso", "saludable", "bondadoso", "benefit", "profit", "growth", "expansion", "improvement", "achieves", "donates", "grants", "provides", "sustainable", "development", "friendly", "healthy", "kind", "impulsa", "promote", "preserva", "preserve", "acuerdo", "celebra", "celebrate", "convivio", "reúne", "dialoga", "dialogue", "coopera", "cooperate", "concreta", "sustenta", "prolifera", "proliferate", "agreement", "excelente", "excellent", "perfect", "eficiente", "motiva", "motivate", "atiende", "serves", "éxito", "successful", "responsable", "concientiza", "raise", "apoyo", "awareness", "support", "resguarda", "inaugura", "protect", "inaugurate", "estrena", "debut", "repara", "repair", "ayuda", "helps", "reconoce", "recognize", "agradece", "thank", "creció", "ganancias", "iniciativa", "evolución", "evolution", "enjoy", "disfrutar", "gustar", "anuncia", "anounce", "recorta", "valioso", "valuable", "proteje", "resolve", "resuelve", "great", "atractivo", "compromiso", "commitment", "attractive", "prevención", "prevention", "solución", "solution", "invest", "invierte", "solar", "favorita", "favorite", "reparte", "instala", "install", "superior", "justo", "duplica", "duplicate", "ofrece", "offer"]
negative_words = ["sospechoso", "despide", "fire", "desploma", "malware", "devolver", "trampa", "trap", "suplantación", "impersonation", "íntimas", "trolleo", "demanda", "filtra", "intimate", "protesta", "violencia", "agresivo", "peligroso", "bloqueo", "horrible", "contaminación", "exhibe", "violación", "guerra", "conflicto", "corruption", "suspect", "protest", "violence", "aggressive", "dangerous", "blockade", "horrible", "pollution", "exhibits", "war", "conflict", "condena", "sentence", "destroy", "destroza", "arrebato", "pelea", "pleito", "fight", "daño", "damage", "falla", "hackeo", "hacker", "hackean", "fault", "defect", "failure", "poison", "veneno", "ineficiente", "inútil", "inefficient", "useless", "vulnera", "accidente", "deficiente", "complain", "queja", "polémica", "polemic", "burlas", "mock", "teasing", "corte", "shortcut", "afectado", "affected", "cae", "fall", "obliga", "obligate", "pérdida", "loss", "vandaliza", "vandalize", "diagnosticado", "diagnosed", "riesgo", "risk", "problematic", "toxic", "murder", "asesinato", "cut", "radiation", "radiación", "esconde", "hide", "caos", "contra", "againts", "trabas", "disputa", "suspende", "mentira", "lie", "lier", "mentiroso", "engaño", "acortar", "abandona", "abandone", "víctima", "victim", "atentado", "attempt", "amenaza", "abusa", "abuse", "threat", "timo", "defectuoso", "faulty", "falsificar", "robar", "falsify", "steal", "infiel", "indigno", "infidel", "unworthy", "decepción", "alerta", "alert", "espiar", "spy", "spía"]


# Listas de palabras con peso cuádruple
heavy_positive_words = ["exitoso", "innovación", "innovation", "premio", "victoria", "award", "victory", "celebrate", "celebra", "desarrolla", "achievement", "beca", "scholarship"]
heavy_negative_words = ["escándalo", "denuncia", "crisis", "delito", "quiebra", "abuso", "catastrófico", "fraude", "fraud", "crimen", "crime", "asesino", "muerto", "corrupción", "criminal", "intoxicación", "intoxication", "prisión", "encarcelado", "cárcel", "jail", "violación", "violation", "acusado", "leaked", "accused", "extorción", "extortion", "multa", "sanción", "sanction", "explosión", "explotion",]

@labeling_function()
def lf_contains_positive_words(x):
    return 0 if any(word in x['processed_title'] for word in positive_words) else 1

@labeling_function()
def lf_contains_negative_words(x):
    return 1 if any(word in x['processed_title'] for word in negative_words) else 0

@labeling_function()
def lf_heavy_positive_words(x):
    return 0 if any(word in x['processed_title'] for word in heavy_positive_words) else 1

@labeling_function()
def lf_heavy_negative_words(x):
    return 1 if any(word in x['processed_title'] for word in heavy_negative_words) else 0

@labeling_function()
def lf_predominant_sentiment(x):
    neg_count = sum(1 for word in x['processed_title'].split() if word in negative_words) + \
                4 * sum(1 for word in x['processed_title'].split() if word in heavy_negative_words)
    pos_count = sum(1 for word in x['processed_title'].split() if word in positive_words) + \
                4 * sum(1 for word in x['processed_title'].split() if word in heavy_positive_words)
    return 1 if neg_count > pos_count else 0

# Cargar el conjunto de datos de noticias reales
file_path = "C:/Users/e-malandaf/Downloads/True6.csv" 
df = pd.read_csv(file_path, encoding='latin1')

# Limpiar la columna 'negativo'
df['negativo'] = df['negativo'].replace('}', None)
df['negativo'] = df['negativo'].dropna().astype(int)

# Aplicar el preprocesamiento a la columna 'title'
df['processed_title'] = df['title'].apply(preprocess_text)

# Filtrar filas con valores no binarios en la columna 'negativo'
df = df[df['negativo'].isin([0, 1])]

# Eliminar filas donde 'processed_title' esté vacío
df = df[df['processed_title'].str.strip().astype(bool)]

# Crear una matriz TF-IDF para el texto procesado
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 3), max_df=0.95, min_df=2)
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_title'])

# Aplicar funciones de etiquetado a los datos
lfs = [lf_contains_positive_words, lf_contains_negative_words, lf_heavy_positive_words, lf_heavy_negative_words, lf_predominant_sentiment]
applier = PandasLFApplier(lfs)
L_train = applier.apply(df)

# Analizar las funciones de etiquetado
lf_analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_analysis)

# Modelado generativo para refinar las etiquetas
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=100, seed=42)

# Obtener etiquetas refinadas
df['snorkel_label'] = label_model.predict(L=L_train)

# Utilizar las etiquetas refinadas por Snorkel como etiquetas de entrenamiento
y_snorkel = df['snorkel_label'].astype(int)

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_snorkel, test_size=0.2, random_state=42)

# Definir el modelo SVM
svm_model = SVC(probability=True)

# Definir los hiperparámetros a buscar
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Realizar la búsqueda de hiperparámetros
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluar el mejor modelo encontrado
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Mejor precisión:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Mejor modelo:", best_model)

# Obtener todas las noticias sobre la empresa en el año especificado
stories = get_all_titles(empresa, year)

# Preprocesar y predecir el sentimiento de los títulos obtenidos, incluyendo probabilidad
sospechosa_count = 0
for story in stories:
    processed_title = preprocess_text(story['title'])
    title_tfidf = tfidf_vectorizer.transform([processed_title])
    
    # Obtener la predicción de sentimiento y la probabilidad
    sentiment = best_model.predict(title_tfidf)[0]
    probability = best_model.predict_proba(title_tfidf)[0][sentiment]
    
    # Asignar la etiqueta de sentimiento y la probabilidad al story
    story['sentiment'] = 'Positive' if sentiment == 0 else 'Negative'
    story['probability'] = probability
    
    # Contar las noticias sospechosas
    if sentiment == 1: 
        sospechosa_count += 1

# Calcular el porcentaje de noticias negativas solo si se encontraron historias
if len(stories) > 0:
    negative_percentage = (sospechosa_count / len(stories)) * 100
    print(f"\nPorcentaje de noticias negativas: {negative_percentage:.2f}%")
else:
    negative_percentage = 0
    print("\nNo se encontraron noticias para la empresa en el año especificado.")

# Guardar los resultados en un archivo CSV solo si hay historias
if len(stories) > 0:
    with open(f"Resultados_{empresa}_{year}.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
        fieldnames = ["title", "link", "date", "sentiment", "probability"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for story in stories:
            writer.writerow({
                "title": story['title'],
                "link": story['link'],
                "date": story['date'],
                "sentiment": story['sentiment'],
                "probability": story['probability']
            })
    print("\nAnálisis completado y resultados guardados en el archivo CSV.")
else:
    print("\nNo se generó ningún archivo CSV porque no se encontraron noticias.")



[nltk_data] Error loading punkt: HTTP Error 403: Forbidden
[nltk_data] Error loading stopwords: HTTP Error 403: Forbidden
[nltk_data] Error loading wordnet: HTTP Error 403: Forbidden
100%|██████████| 3005/3005 [00:02<00:00, 1465.25it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...


                            j Polarity  Coverage  Overlaps  Conflicts
lf_contains_positive_words  0   [0, 1]       1.0       1.0   0.974376
lf_contains_negative_words  1   [0, 1]       1.0       1.0   0.974376
lf_heavy_positive_words     2   [0, 1]       1.0       1.0   0.974376
lf_heavy_negative_words     3   [0, 1]       1.0       1.0   0.974376
lf_predominant_sentiment    4   [0, 1]       1.0       1.0   0.974376


  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=13.884]
 18%|█▊        | 92/500 [00:00<00:00, 913.56epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.006]
 37%|███▋      | 187/500 [00:00<00:00, 928.43epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.003]
 57%|█████▋    | 287/500 [00:00<00:00, 957.47epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.002]
 77%|███████▋  | 385/500 [00:00<00:00, 965.45epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.002]
100%|██████████| 500/500 [00:00<00:00, 966.18epoch/s]
INFO:root:Finished Training


Mejor precisión: 0.8752079866888519
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.99      0.92       434
           1       0.95      0.58      0.72       167

    accuracy                           0.88       601
   macro avg       0.91      0.78      0.82       601
weighted avg       0.89      0.88      0.86       601

Mejor modelo: SVC(C=1, kernel='linear', probability=True)

Porcentaje de noticias negativas: 0.23%

Análisis completado y resultados guardados en el archivo CSV.
