<a href="https://colab.research.google.com/github/ManuelValdivia03/ProyectoIA/blob/main/entrenamiento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd

# ===== 1. INGL√âS: titulares de Fake.csv / True.csv =====
fake_en = pd.read_csv("Fake.csv")[["title"]].copy()
true_en = pd.read_csv("True.csv")[["title"]].copy()

fake_en["label"] = 0   # Fake
true_en["label"] = 1   # Real

fake_en["text"] = fake_en["title"].astype(str).str.strip()
true_en["text"] = true_en["title"].astype(str).str.strip()

df_en = pd.concat([fake_en[["text", "label"]],
                   true_en[["text", "label"]]],
                  ignore_index=True)

# Limpiar ingl√©s
df_en.dropna(subset=["text"], inplace=True)
df_en = df_en[df_en["text"].str.len() > 0]


# ===== 2. ESPA√ëOL: titulares desde development.xlsx =====
df_es_raw = pd.read_excel("development.xlsx")

df_es = df_es_raw[["Headline", "Category"]].copy()
df_es["label"] = df_es["Category"].map({"Fake": 0, "True": 1})
df_es["text"] = df_es["Headline"].astype(str).str.strip()
df_es = df_es[["text", "label"]]

# Limpiar espa√±ol
df_es.dropna(subset=["text", "label"], inplace=True)
df_es = df_es[df_es["text"].str.len() > 0]


# ===== 3. Combinar INGL√âS + ESPA√ëOL =====
df_all = pd.concat([df_en, df_es], ignore_index=True)

# Normalizar: bajar todo a min√∫sculas
df_all["text"] = df_all["text"].str.lower().str.strip()

# Quitar duplicados de texto
df_all = df_all.drop_duplicates(subset=["text"])

print("Total combinado antes de balancear:", len(df_all))
print(df_all["label"].value_counts())


# ===== 4. Balancear clases (mismo n√∫mero de Fake y Real) =====
counts = df_all["label"].value_counts()
min_count = counts.min()

df_balanced = (
    df_all
    .groupby("label", group_keys=False)
    .apply(lambda g: g.sample(n=min_count, random_state=42))
    .reset_index(drop=True)
)

print("Total despu√©s de balancear:", len(df_balanced))
print(df_balanced["label"].value_counts())


# ===== 5. Guardar dataset ARREGLADO =====
df_balanced.to_csv("dataset_fixed.csv", index=False, encoding="utf-8")

df_balanced.head()


Total combinado antes de balancear: 39021
label
1    20977
0    18044
Name: count, dtype: int64
Total despu√©s de balancear: 36088
label
0    18044
1    18044
Name: count, dtype: int64


  .apply(lambda g: g.sample(n=min_count, random_state=42))


Unnamed: 0,text,label
0,this trumpcare provision will let trump furthe...,0
1,track palin faces assault and weapons charges ...,0
2,wow! this video might explain why #unfithillar...,0
3,florida school overreacts to prank by pressing...,0
4,usa today just uncovered a bunch more trump la...,0


In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

import joblib

In [19]:
data = pd.read_csv("dataset_fixed.csv")

print("Tama√±o del dataset:", len(data))
data.head()

Tama√±o del dataset: 36088


Unnamed: 0,text,label
0,this trumpcare provision will let trump furthe...,0
1,track palin faces assault and weapons charges ...,0
2,wow! this video might explain why #unfithillar...,0
3,florida school overreacts to prank by pressing...,0
4,usa today just uncovered a bunch more trump la...,0


In [20]:
X = data["text"]
y = data["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

len(X_train), len(X_test)

(28870, 7218)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2)
)



# Ajustar con train y transformar ambos sets
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape, X_test_vec.shape


((28870, 20000), (7218, 20000))

In [22]:
results = {}

# === 1. Naive Bayes ===
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
pred_nb = nb.predict(X_test_vec)
results["Naive Bayes"] = accuracy_score(y_test, pred_nb)

# === 2. SVM ===
svm = LinearSVC(class_weight="balanced")
svm.fit(X_train_vec, y_train)
pred_svm = svm.predict(X_test_vec)
results["SVM"] = accuracy_score(y_test, pred_svm)

# === 3. √Årbol de decisi√≥n ===
tree = DecisionTreeClassifier()
tree.fit(X_train_vec, y_train)
pred_tree = tree.predict(X_test_vec)
results["Decision Tree"] = accuracy_score(y_test, pred_tree)

# === 4. k-NN ===
knn = KNeighborsClassifier()
knn.fit(X_train_vec, y_train)
pred_knn = knn.predict(X_test_vec)
results["k-NN"] = accuracy_score(y_test, pred_knn)

results

{'Naive Bayes': 0.9390412856747021,
 'SVM': 0.954696591853699,
 'Decision Tree': 0.8884732612912164,
 'k-NN': 0.8762815184261569}

In [23]:
best_model_name = max(results, key=results.get)
best_model = {
    "Naive Bayes": nb,
    "SVM": svm,
    "Decision Tree": tree,
    "k-NN": knn
}[best_model_name]

print("Mejor modelo:", best_model_name)
print("Accuracy:", results[best_model_name])


Mejor modelo: SVM
Accuracy: 0.954696591853699


In [24]:
joblib.dump(best_model, "modelo.joblib")
joblib.dump(vectorizer, "vectorizador.joblib")

print("Modelo y vectorizador guardados correctamente.")


Modelo y vectorizador guardados correctamente.


In [25]:
!python dronix.py

DRONIX online. Listo para separar la verdad de la ciza√±a digital üòé
Escr√≠beme un titular y te digo si es REAL o FAKE.
Para salir escribe: exit

üìù Titular: El gobierno anuncia que a partir de ma√±ana todo ser√° gratis para los estudiantes
‚ùå FAKE: Uy no, esta noticia trae m√°s mentiras que un ex en A√±o Nuevo. 

üìù Titular: El gobierno presenta un nuevo paquete de reformas econ√≥micas
‚ùå FAKE: Uy no, esta noticia trae m√°s mentiras que un ex en A√±o Nuevo. 

üìù Titular: NASA announces plan to build a theme park on the Moon
‚ùå FAKE: Hmm... eso suena m√°s falso que los billetes del Monopolio üòÖ 

üìù Titular: NASA launches new satellite to monitor climate change
ü§∑ Resultado incierto: Mi radar est√° a medias, tomar√≠a esto con pinzas üß∑ 

üìù Titular: Scientists develop a new vaccine showing promising early results
‚úî REAL: Esto s√≠ parece leg√≠timo, te lo confirmo üßê‚ú® 

üìù Titular: Government announces new education program for low-income families
ü§∑ Result