In [None]:
import pandas as pd

# Obtener Dataset

In [None]:
# Location of the Excel workbook holding the dataset; reading it needs network access.
url = 'https://larr.cl/larr.cl/dataset/FakeDes.xlsx'
# Load the workbook into the raw DataFrame used by the rest of the notebook.
df1 = pd.read_excel(url)

# Explorar el Dataset

Mostramos el conjunto de datos

In [None]:
# Display the DataFrame exactly as loaded.
df1

Revisar un registro

In [None]:
# First row selected by position; the result is a Series with one entry per column.
df1.iloc[0]

In [None]:
# Same first row through a slice, which keeps the result as a one-row DataFrame.
df1[0:1]

Revisar un campo

In [None]:
# Select a single column; the result is a Series.
df1['Headline']

Revisar un registro de un campo determinado

In [None]:
# Column 'Headline', then the entry whose index label is 0 (a label lookup, not a positional one).
df1['Headline'][0]

Contemos el número de noticias falsas (Fake) y verdaderas (True)

In [None]:
# Count how many rows carry each label in 'Category'.
# A vectorized comparison replaces the explicit Python loop; rows whose label
# is neither 'Fake' nor 'True' are ignored, as before.
cFake = int((df1['Category'] == 'Fake').sum())
cTrue = int((df1['Category'] == 'True').sum())
print(f'Fake: {cFake}')
print(f'True: {cTrue}')


# Preparación del dataset


## Eliminar campos que no aporten

In [None]:
# Remove the columns that are not used as model input, in one call instead of
# one `del` cell per column.
df1 = df1.drop(columns=['Id', 'Topic', 'Source', 'Link'])

In [None]:
# Inspect the frame with the remaining columns.
df1

In [None]:
# Drop every row that has a missing value, then renumber the index from 0.
# dropna() keeps the original labels, so without the reset the index has gaps
# and label lookups such as df1['Headline'][0] can raise KeyError.
df1 = df1.dropna().reset_index(drop=True)
df1

In [None]:
# Headline stored under index label 0 after the cleaning steps.
df1['Headline'][0]

In [None]:
# Article body stored under the same index label.
df1['Text'][0]

## Quitar Urls


Función que permite, a través de una expresión regular, detectar y quitar las URLs

In [None]:
import re
def quitarURL(df:pd.DataFrame,campo:str):
  """Replace every URL found in column ``campo`` with a single space.

  Args:
    df: frame holding the text column; the column is overwritten in place.
    campo: name of the column to clean. Its values must be strings.

  Returns:
    The same ``df`` object, with ``campo`` cleaned on every row.

  Raises:
    TypeError: if a value in ``campo`` is not a string.
  """
  # scheme://host(.host)*(/path)* -- compiled once and reused for all rows.
  patron = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
  # Whole-column assignment: processes all rows regardless of the index
  # labels and avoids chained assignment (df[campo][i] = ...).
  df[campo] = df[campo].map(lambda texto: patron.sub(' ', texto))
  return df

In [None]:
# Work on a copy so df1 is left untouched.
df = df1.copy()
# %time prints how long the cleaning call takes.
%time df = quitarURL(df,'Headline')
df

In [None]:
# NOTE(review): this repeats the previous cell's call on 'Headline'; possibly
# another column (e.g. 'Text') was intended -- confirm.
%time df = quitarURL(df,'Headline')
df

## Stem, Lower y Stopwords

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
ps = PorterStemmer()
def stemming(df:pd.DataFrame,campo:str):
  """Normalise the text in column ``campo`` and return it as a list of strings.

  For every row, in order: accented vowels are replaced by their plain form,
  every character outside a-z/A-Z becomes a space, the text is lower-cased
  and split on whitespace, Spanish stopwords are removed, the remaining words
  are stemmed with ``ps`` and joined back with single spaces.

  Args:
    df: frame holding the text column. It is only read, never modified.
    campo: name of the column to process. Its values must be strings.

  Returns:
    A list with one processed string per row, in row order.
  """
  replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
    )
  # Load the stopword list once; a set gives constant-time membership tests
  # instead of re-reading the list for every word.
  palabras_vacias = set(stopwords.words('spanish'))
  corpus = []
  # Iterate over the values themselves so the result does not depend on the
  # index labels being 0..n-1.
  for texto in df[campo]:
    for a, b in replacements:
      texto = texto.replace(a, b).replace(a.upper(), b.upper())
    # Letters not covered by `replacements` (for example ñ or ü) are turned
    # into spaces by this filter, like any other non a-z character.
    review = re.sub('[^a-zA-Z]', ' ', texto) # keep letters only
    review = review.lower().split() # lower-case, then one item per word
    # NOTE(review): `ps` is a PorterStemmer, whose rules target English --
    # confirm it is the intended stemmer for this text.
    review = [ps.stem(word) for word in review if word not in palabras_vacias]
    corpus.append(' '.join(review))

  return corpus

In [None]:
# Headline under index label 0 of the working copy, before stemming.
df['Headline'][0]

In [None]:
# Build the processed corpus from the headlines: one string per row.
corpus = stemming(df,'Headline')
corpus

In [None]:
# Headline as stored in df1, to compare against the processed version.
df1['Headline'][0]

In [None]:
# First processed headline.
corpus[0]

# Vectorización

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
# Bag-of-words counts over the corpus, then TF-IDF weighting of those counts.
sw = stopwords.words('spanish')
# token_pattern keeps runs of word characters that are not digits.
vectorizer = CountVectorizer(stop_words=sw, token_pattern=r'[^\d\W]+')
freq_matrix = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement and returns the vocabulary
# terms in column order.
feature_names = vectorizer.get_feature_names_out()
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(freq_matrix)


In [None]:
from scipy.sparse import csr_matrix
# Wrap the sparse TF-IDF matrix in a DataFrame with sparse columns, one column per vocabulary term.
dff = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix,columns=feature_names)

In [None]:
# Inspect the TF-IDF feature frame.
dff

# Conjuntos de datos

In [None]:
# Target labels, taken from the same frame the corpus was built from.
# NOTE(review): the name `y` is reassigned to the model predictions further
# down, so re-running the split after predicting requires re-running this
# cell first.
y = df['Category']
y

Dividimos el dataframe en un 70% para entrenamiento y un 30% para lo que será validación y prueba

In [None]:
from sklearn.model_selection import train_test_split
# Stratified split: 70% of the rows for training, 30% held out; random_state fixes the shuffle.
%time X_train, X_test, y_train, y_test = train_test_split(dff, y, test_size=0.3, random_state=5, stratify=y)

Dividimos nuevamente el 30% reservado: un 70% de ese subconjunto para validación y un 30% para prueba

In [None]:
from sklearn.model_selection import train_test_split
# Split the held-out part again: the first returned portion (70% of it)
# becomes the validation set and the remaining 30% the test set.
# X_test / y_test are overwritten here, so re-running this cell alone would
# split the already reduced test set once more.
%time X_val, X_test, y_val, y_test = train_test_split(X_test, y_test,test_size=0.3, random_state=5, stratify=y_test)

Reiniciamos los indices

In [None]:
def reiniciarIndiceDF(df):
  """Return a copy of ``df`` whose index is renumbered 0..n-1.

  The old index is discarded instead of being inserted as a column and then
  dropped, so the caller's frame is not modified and a data column that
  happens to be called 'index' is left alone.

  Args:
    df: DataFrame to renumber.

  Returns:
    A new DataFrame with the same columns and a fresh 0..n-1 index.
  """
  return df.reset_index(drop=True)

In [None]:
def reiniciarIndiceSerie(serie):
  """Renumber a Series from 0 and hand it back as a one-column DataFrame.

  ``reset_index()`` moves the old index into a column named 'index'; that
  column is removed so only the values remain.

  Args:
    serie: Series to renumber. It is not modified.

  Returns:
    A DataFrame with a 0..n-1 index and the Series values as its only column.
  """
  con_indice = serie.reset_index()
  return con_indice.drop('index', axis=1)

In [None]:
# Renumber the three feature frames so their indexes start at 0 after the shuffled splits.
X_train = reiniciarIndiceDF(X_train)
X_test = reiniciarIndiceDF(X_test)
X_val = reiniciarIndiceDF(X_val)

In [None]:
# Same renumbering for the labels; the helper returns one-column DataFrames.
y_train = reiniciarIndiceSerie(y_train)
y_test = reiniciarIndiceSerie(y_test)
y_val = reiniciarIndiceSerie(y_val)


In [None]:
def toNumber(df):
  """Encode the text labels as integers: 'Fake' -> 0 and 'True' -> 1.

  Args:
    df: pandas object holding the labels. It is not modified.

  Returns:
    A new object of the same kind with both labels replaced; any other
    value is left as it is.
  """
  codigos = (('Fake', 0), ('True', 1))
  resultado = df
  # Apply the replacements one after the other, 'Fake' first.
  for etiqueta, codigo in codigos:
    resultado = resultado.replace(to_replace=etiqueta, value=codigo)
  return resultado

In [None]:
# Encode the labels numerically: 'Fake' -> 0, 'True' -> 1.
y_train = toNumber(y_train)
y_test = toNumber(y_test)
y_val = toNumber(y_val)


In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the 0/1 labels into two columns: column 0 for label 0 ('Fake'), column 1 for label 1 ('True').
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)
y_val = to_categorical(y_val, num_classes=2)


# Modelo

## Crear Modelo

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K 
def crearModelo():
  """Build and compile a fresh two-layer dense classifier.

  The network has a hidden layer of 15 ReLU units followed by a 2-unit
  softmax output, and is compiled with categorical cross-entropy, the Adam
  optimizer and categorical accuracy as the reported metric.

  Returns:
    The compiled, untrained model.
  """
  # Reset the global Keras state before building, so repeated calls do not
  # accumulate earlier models.
  K.clear_session( )
  capas = [
    Dense(15,  activation='relu'),
    Dense(2,  activation='softmax'),
  ]
  red = Sequential(capas)
  red.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['categorical_accuracy'],
  )
  return red

In [None]:
# (rows, columns) of the training feature frame.
X_train.shape

In [None]:
# Shape of the one-hot training labels; the second dimension is 2 (num_classes).
y_train.shape

## Entrenar Modelo

In [None]:
import numpy as np
# Fresh, untrained model for this run.
m = crearModelo()
# np.asarray turns the inputs into plain arrays before handing them to Keras.
# Trains for 10 epochs, using the validation split as validation_data.
# NOTE(review): no random seed is set in the visible cells, so results may vary between runs -- confirm.
m.fit(np.asarray(X_train),np.asarray(y_train),validation_data=(np.asarray(X_val),np.asarray(y_val)),epochs=10)


## Predecir

In [None]:
import torch
# Predict on the test set and round each output to 0 or 1, which yields
# one-hot style rows comparable with y_test.
# No torch.no_grad() context is used: `m` is a Keras model and predict()
# is an inference-only call, so the PyTorch context manager adds nothing.
y = m.predict(np.asarray(X_test)).round()

In [None]:
# Rounded predictions, one row per test sample.
y

In [None]:
from sklearn.metrics import accuracy_score, precision_score,f1_score,recall_score, multilabel_confusion_matrix, ConfusionMatrixDisplay, confusion_matrix
# One 2x2 confusion matrix per label column of the one-hot arrays.
confusion = multilabel_confusion_matrix(y_test,y)
print(f"acc: {accuracy_score(y_test,y)}")
print(f"precision: {precision_score(y_test,y,average='macro')}")
print(f"recall: {recall_score(y_test,y,average='macro')}")
# NOTE(review): F1 is micro-averaged while precision and recall above are
# macro-averaged -- confirm the mix is intended.
print(f"f1: {f1_score(y_test,y,average='micro')}")
# confusion[0] is the matrix for label column 0; inside it, position 0 means
# "label absent" and position 1 "label present".
# NOTE(review): column 0 appears to be 'Fake' (toNumber maps 'Fake' to 0),
# which would match the ['True','Fake'] tick order -- confirm.
confusion_display = ConfusionMatrixDisplay(confusion[0],display_labels=['True','Fake']).plot(values_format='d')


In [None]:
# Count the correct predictions per class from the one-hot arrays:
# column 0 is label 0 and column 1 is label 1.
acierto_fake = (y[:, 0] == 1) & (y_test[:, 0] == 1)
# A row already counted for label 0 is not counted again for label 1.
acierto_true = ~acierto_fake & (y[:, 1] == 1) & (y_test[:, 1] == 1)
fake = int(acierto_fake.sum()) # label 0
true = int(acierto_true.sum()) # label 1
    

In [None]:
# Number of predicted test samples.
len(y)

In [None]:
# Samples where prediction and ground truth both mark column 0.
fake

In [None]:
# Samples where prediction and ground truth both mark column 1.
true

In [None]:
# One-hot ground-truth labels of the test set.
y_test