# Análisis de datos exploratorio

- **Ejecutar primero data.ipynb (linux)**

## Cargar datos

In [2]:
import heapq

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

In [48]:
df = pd.read_json("features.json", orient='records', lines=True)

In [49]:
df.head(5)

Unnamed: 0,features,label
0,fake news cnn president like oz behind curtain...,0
1,epic commie obama pictured vietnam president f...,0
2,medium freak watch msnbc cut mic black trump s...,0
3,bernie sander crashed greedy drug company stoc...,0
4,american free slavery democrat partyor wait la...,0


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   features  44898 non-null  object
 1   label     44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


## Creación del Word Embedding (GloVe 100d)

In [51]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [37]:
# Crear el embedding diccionario de nuestro dataset
def load_glove_model(file_path):
    """Load GloVe word vectors from a plain-text file.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the remaining fields as its vector components.

    Args:
        file_path: Path of the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each token (str) to its vector as a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    model = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Blank or whitespace-only lines (for instance a trailing newline
            # at the end of the file) carry no token; indexing them would
            # raise IndexError, so they are skipped.
            if not values:
                continue
            word, *components = values
            model[word] = [float(value) for value in components]
    return model

# Path to the GloVe file
glove_file_path = 'glove.6B.100d.txt'

# Load the GloVe model
embedding_index = load_glove_model(glove_file_path)

# Example: look up the vector for the word 'the'
word_to_lookup = 'the'
vector_for_word = embedding_index.get(word_to_lookup)

# dict.get returns None for a missing word, so test for None explicitly
# (same check the embedding-matrix cell uses) instead of relying on the
# truthiness of the vector itself.
if vector_for_word is not None:
    print(f"Vector para la palabra '{word_to_lookup}': {vector_for_word}")
else:
    print(f"La palabra '{word_to_lookup}' no está en el modelo.")


Vector para la palabra 'the': [-0.038194, -0.24487, 0.72812, -0.39961, 0.083172, 0.043953, -0.39141, 0.3344, -0.57545, 0.087459, 0.28787, -0.06731, 0.30906, -0.26384, -0.13231, -0.20757, 0.33395, -0.33848, -0.31743, -0.48336, 0.1464, -0.37304, 0.34577, 0.052041, 0.44946, -0.46971, 0.02628, -0.54155, -0.15518, -0.14107, -0.039722, 0.28277, 0.14393, 0.23464, -0.31021, 0.086173, 0.20397, 0.52624, 0.17164, -0.082378, -0.71787, -0.41531, 0.20335, -0.12763, 0.41367, 0.55187, 0.57908, -0.33477, -0.36559, -0.54857, -0.062892, 0.26584, 0.30205, 0.99775, -0.80481, -3.0243, 0.01254, -0.36942, 2.2167, 0.72201, -0.24978, 0.92136, 0.034514, 0.46745, 1.1079, -0.19358, -0.074575, 0.23353, -0.052062, -0.22044, 0.057162, -0.15806, -0.30798, -0.41625, 0.37972, 0.15006, -0.53212, -0.2055, -1.2526, 0.071624, 0.70565, 0.49744, -0.42063, 0.26148, -1.538, -0.30223, -0.073438, -0.28312, 0.37104, -0.25217, 0.016215, -0.017099, -0.38984, 0.87424, -0.72569, -0.51058, -0.52028, -0.1459, 0.8278, 0.27062]


In [52]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: First vector (any sequence accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the ratio is undefined
        there and would otherwise evaluate to NaN with a RuntimeWarning).
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

def most_similar_words(word, model, top_n=5):
    """Find the words whose vectors are closest (by cosine) to ``word``.

    Args:
        word: Query word; must be a key of ``model``.
        model: Mapping from word to its vector (sequence of floats).
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(other_word, similarity)`` tuples ordered from most to
        least similar, excluding ``word`` itself. An empty list (after
        printing a message) when ``word`` is not in ``model``.
    """
    if word not in model:
        print(f"La palabra '{word}' no está en el modelo.")
        return []

    # The query vector and its norm do not change inside the scan, so they
    # are converted/computed once instead of once per vocabulary entry.
    query = np.asarray(model[word], dtype=float)
    query_norm = np.linalg.norm(query)

    def similarity_to(other_vector):
        other = np.asarray(other_vector, dtype=float)
        norm_product = query_norm * np.linalg.norm(other)
        # A zero-norm vector has no direction; score it 0.0 rather than
        # letting a NaN leak into the ranking.
        if norm_product == 0:
            return 0.0
        return float(np.dot(query, other) / norm_product)

    similarities = (
        (other_word, similarity_to(other_vector))
        for other_word, other_vector in model.items()
        if other_word != word
    )

    # nlargest keeps only top_n candidates while scanning, which avoids
    # sorting the whole vocabulary; it orders results like
    # sorted(..., reverse=True)[:top_n].
    return heapq.nlargest(top_n, similarities, key=lambda pair: pair[1])

In [47]:
# Ejemplo de uso
word_to_lookup = 'ate'
similar_words = most_similar_words(word_to_lookup, embedding_index, top_n=10)

print(f"Palabras más similares a '{word_to_lookup}':")
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity}")

Palabras más similares a 'ate':
eaten: 0.809029408004442
drank: 0.7871671568844296
eat: 0.7813820363677394
eating: 0.7338559174177061
eats: 0.7251667659830515
slept: 0.7152498971261759
smoked: 0.6635120852825582
meal: 0.654029261267389
cooked: 0.6502910720528902
dined: 0.6021937327293632


In [65]:
# Vocabulary cap handed to the Keras Tokenizer as num_words.
max_words = 10000
# Length every tokenised document is padded/truncated to.
max_sequence_length = 250
# Width of each embedding vector; matches the 100-dimensional GloVe file used in this notebook.
embedding_dim = 100

In [81]:
# Fit the Keras tokenizer on the text column to build the word -> index vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['features'])
word_index = tokenizer.word_index
# Number of distinct tokens seen while fitting; word_index is not cut down to num_words
vocab_size = len(word_index)
# vocab_size

In [78]:
vocab_size

205444

In [73]:
# Turn every document into a sequence of word indices, then pad / truncate at
# the end ('post') so that each row has exactly max_sequence_length entries
sequences = tokenizer.texts_to_sequences(df['features'])
padded_seq = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

In [82]:
# Build the embedding matrix for our vocabulary: row i holds the GloVe vector
# of the word whose tokenizer index is i; words without a GloVe vector (and
# index 0, which the padding uses) stay all-zero. The (max_words + 1) row
# count is kept as-is because the Embedding layer is declared with that
# input_dim.
embedding_matrix = np.zeros((max_words + 1, embedding_dim))
for word, i in word_index.items():
    # Tokenizer(num_words=max_words) only ever emits indices strictly below
    # max_words, so rows from max_words upwards can never be looked up and
    # are left untouched.
    if i >= max_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [83]:
embedding_matrix[1]

array([-0.15731 , -0.75503 ,  0.36845 , -0.18958 , -0.16896 , -0.23157 ,
       -0.22658 , -0.30186 ,  0.24372 ,  0.61896 ,  0.58995 ,  0.047638,
       -0.055164, -0.70211 ,  0.22084 , -0.69232 ,  0.49419 ,  1.4285  ,
       -0.25362 ,  0.20031 , -0.26192 ,  0.05315 , -0.048418, -0.44982 ,
        0.54644 , -0.014645, -0.015531, -0.61197 , -0.91964 , -0.7528  ,
        0.64843 ,  1.0934  ,  0.052682,  0.33345 ,  0.10532 ,  0.59517 ,
        0.023104, -0.37105 ,  0.29749 , -0.23683 ,  0.079566, -0.10326 ,
        0.35885 , -0.28935 , -0.19881 ,  0.22908 , -0.061435,  0.56127 ,
       -0.017115, -0.32868 , -0.78417 , -0.49375 ,  0.34944 ,  0.16278 ,
       -0.061168, -1.3106  ,  0.39152 ,  0.124   , -0.20873 , -0.18473 ,
       -0.56184 ,  0.55693 ,  0.012114, -0.54545 , -0.31409 ,  0.1     ,
        0.31543 ,  0.74757 , -0.47734 , -0.18332 , -0.65623 ,  0.40768 ,
       -0.30697 , -0.47247 , -0.7421  , -0.44978 , -0.078122, -0.52673 ,
       -0.70633 ,  1.3271  ,  0.26298 , -0.91    , 

In [62]:
padded_seq[0]

array([  512,    31,   289,     5,    28, 10428,   436, 68034,  1651,
         172,  3085,  7197,  1887,   486,   818,    42,   289,     5,
        1246, 12371, 16274,   172,  7784,     5,    17,     1,  3085,
         486,   268,    90, 35180,   224,  2211, 12371,   182,  6656,
          90,  1233,   180,   520, 68035,   960,   220,   328,   289,
        3085,  4047,  3447,    22,    13,  6485,  2648,  1441,  2477,
       12371,   106,   231,   552,   329,  2607,  2197,   927,   893,
         228,  2199,   249,    69,  3447, 68036,   289,  3488,  7077,
       54988,     1,  2477,   419,  2283,  1246,  3630,    40,   289,
        3424,     1,   169,   185,  3630,     2,     1,  9786,   175,
         241,   419,   781,   279, 12371,   106,  3447,  1116,   367,
        2794,   172,    71,     6,    46,     5, 10128,     2,   237,
        1524, 68037,   781,  3447,   140,  3630,   172, 14913,   168,
       12371, 68038,   328,   734,  3630,  1407,     1,   912,  3022,
         924,   568,

# Entrenar el modelo

In [86]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_seq, df['label'], test_size=0.20, random_state=42, stratify=df['label'])

In [87]:
# Swap the keys and values of tokenizer.index_word
tokenizer_json = tokenizer.index_word
inv_map = dict(zip(tokenizer_json.values(), tokenizer_json.keys()))

In [88]:
import json

# Name of the output file
nombre_archivo = 'vocab.json'

# Write the inverted vocabulary (inv_map) to that file as JSON
with open(nombre_archivo, 'w') as archivo:
    json.dump(inv_map, archivo)

print(f'Vocabulario invertido guardado en {nombre_archivo}')


Vocabulario invertido guardado en vocab.json


# Modelado

## Redes Neuronales

In [89]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.layers import Embedding, LSTM

In [90]:
# Build the network as a single layer stack:
# frozen pre-trained embeddings -> dropout -> LSTM -> dropout -> sigmoid unit
model = Sequential(
    [
        # Embedding layer initialised from embedding_matrix and not trained
        Embedding(
            input_dim=max_words + 1,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            input_length=max_sequence_length,
            trainable=False,
        ),
        Dropout(0.2),
        LSTM(units=128),
        Dropout(0.2),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

2023-11-11 18:16:41.533641: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-11 18:16:41.537227: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          1000100   
                                                                 
 dropout (Dropout)           (None, 250, 100)          0         
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1117477 (4.26 MB)
Trainable params: 117377 (458.50 KB)
Non-trainable params: 1000100 (3.82 MB)
_________________________________________________________________


In [91]:
# Train for 5 epochs and report loss/accuracy on the held-out split after each
# one. X_train / X_test come straight from train_test_split on a NumPy array,
# so they are passed as-is (wrapping them in np.array only made an extra copy).
# `workers` is not passed: it applies to generator / Sequence inputs only and
# newer Keras releases no longer accept it.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    verbose=True,
    validation_data=(X_test, y_test),
)

Epoch 1/5


2023-11-11 18:16:49.841517: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 35918000 exceeds 10% of free system memory.




KeyboardInterrupt: 

In [None]:
model.save("modelo.h5")

# Ver importancia de palabras

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import string

In [None]:
def decontract(text):
    """Expand common English contractions found in ``text``.

    The (pattern, replacement) pairs are applied one after another in the
    order listed, so the two irregular forms are rewritten before the generic
    ``n't`` rule gets a chance to touch them.

    Args:
        text: Input string.

    Returns:
        The string with every listed contraction replaced.
    """
    substitutions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
lemmatizer = WordNetLemmatizer()

In [None]:
def process_text(text):
    """Clean a raw text and return it as one space-joined string of tokens.

    Steps, in order: strip ``$TICKER`` symbols, a leading "RT", hyperlinks,
    ``#`` signs and digit-bearing tokens; expand contractions with
    ``decontract``; tokenise with NLTK's ``TweetTokenizer`` (lower-casing,
    dropping @handles, shortening repeated characters); discard English stop
    words and punctuation; lemmatise every remaining token as a verb with the
    module-level ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # A set makes the per-token stop-word test a hash lookup instead of a
    # scan over the whole list.
    stopwords_english = set(stopwords.words('english'))
    # Kept as one string on purpose: the original filter is a substring test.
    punctuation = string.punctuation + '...'

    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks; note that `.*` also removes whatever follows the
    # link on the same line
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    # remove the hash sign of hashtags (the tag text itself is kept)
    text = re.sub(r'#', '', text)
    # drop every whitespace-delimited token containing a digit; the pattern
    # is a raw string so that \S and \d are not treated as invalid string
    # escapes
    text = str(re.sub(r"\S*\d\S*", "", text).strip())
    text = decontract(text)

    # tokenize texts
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tweet_tokenizer.tokenize(text)

    texts_clean = []
    for word in tokens:
        # skip stop words and punctuation
        if word in stopwords_english or word in punctuation:
            continue
        texts_clean.append(lemmatizer.lemmatize(word, "v"))

    return " ".join(texts_clean)

In [None]:
%pip install lime

In [None]:
from lime.lime_text import LimeTextExplainer
class_names=['NotFake','IsFake']
explainer= LimeTextExplainer(class_names=class_names)

def predict_proba(arr):
    """Score raw texts with the trained model for LimeTextExplainer.

    Each text is cleaned with ``process_text``, converted to index sequences
    with the fitted Keras ``tokenizer``, padded, and passed to
    ``model.predict``.

    Args:
        arr: Iterable of raw text strings.

    Returns:
        Array of shape (len(arr), 2): column 0 is ``1 - p`` and column 1 is
        ``p``, where ``p`` is the model's sigmoid output (the score for
        label 1).

    NOTE(review): the rows printed by ``df.head()`` carry label 0 and read
    like fake-news headlines; confirm that ``class_names`` lists the classes
    in label order (index 0 -> label 0).
    """
    processed = [process_text(text) for text in arr]
    list_tokenized_ex = tokenizer.texts_to_sequences(processed)
    # Pad and truncate at the end, exactly as the training sequences were
    # built; pad_sequences defaults to 'pre', which would feed the model
    # inputs laid out differently from what it was trained on.
    Ex = pad_sequences(
        list_tokenized_ex,
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )
    pred = model.predict(Ex)
    # Flatten the (n, 1) sigmoid output and stack [1 - p, p] per sample.
    positive = np.asarray(pred).reshape(-1)
    return np.column_stack([1 - positive, positive])

In [None]:
# Explain the prediction for row 2. The frame loaded from features.json only
# has the 'features' and 'label' columns, so the text is read from 'features'
# (df['text'] raises KeyError).
print("Actual rating",df['label'][2])
explainer.explain_instance(df['features'][2],predict_proba).show_in_notebook(text=True)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words
vectorizer = CountVectorizer(max_features=5000, stop_words=stopwords.words('english'))

# Build the vocabulary and transform the text in one step. fit_transform
# expects an iterable of documents and raises ValueError for a bare string,
# so the first document is wrapped in a list.
# NOTE(review): if the counts are meant to cover the whole corpus, pass
# df['features'] instead and sum the sparse matrix rather than calling
# toarray(), which would be very large for the full data set.
texto_features = vectorizer.fit_transform([df['features'][0]])

palabras = vectorizer.get_feature_names_out()

frecuencias = texto_features.toarray()

In [None]:
# Total count of every vocabulary word across the rows of `frecuencias`
frecuencias_totales = frecuencias.sum(axis=0)

# Positions of the 10 largest counts, highest first; computed once and reused
# for both the labels and the bar lengths
top_indices = frecuencias_totales.argsort()[::-1][:10]
palabras_mas_frecuentes = [palabras[i] for i in top_indices]
frecuencias_mas_frecuentes = [frecuencias_totales[i] for i in top_indices]

# Horizontal bar chart of those words
plt.figure(figsize=(10, 6))
plt.barh(palabras_mas_frecuentes, frecuencias_mas_frecuentes, color='skyblue')
plt.xlabel('Frecuencia')
plt.ylabel('Palabra')
plt.title('Palabras más frecuentes')
plt.gca().invert_yaxis()  # flip the y axis so the most frequent word is on top
plt.show()

## Evaluation

In [None]:
# Evaluar el modelo
loss, accuracy = model.evaluate(np.array(X_test), y_test)
print(f'Accuracy: {accuracy}')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 labels
predictions = model.predict(np.array(X_test))
predictions = (predictions.reshape(-1) > 0.5).astype(int).tolist()

# Confusion matrix with the class order fixed to [0, 1]
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot()

In [None]:
print(classification_report(y_test, list(predictions), digits=4))

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch, drawn on the same axes
for curve in ("loss", "val_loss"):
    plt.plot(history.history[curve])
plt.title("model loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend(["train", "test"], loc="upper left")
plt.show()