In [None]:
import pandas as pd

In [None]:
import zipfile
import os

In [None]:
# Location of the zipped dataset archive.
# NOTE(review): absolute path — only valid in an environment that has this exact directory layout.
zip_file = '/workspaces/Fake-News-2/data/archive.zip'

In [None]:
# Directory the archive contents are extracted into (same folder that holds the archive).
extract_dir = '/workspaces/Fake-News-2/data'

In [None]:
# Unpack every member of the archive into extract_dir.
# The context manager closes the archive handle even if extraction fails.
# This runs unconditionally every time the cell is executed.
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [None]:
# CSV file read by the loading cell below.
# Presumably produced by extracting archive.zip into the data directory — TODO confirm.
DATA_PATH = "/workspaces/Fake-News-2/data/WELFake_Dataset.csv"

In [None]:
# Load the whole CSV into memory with pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Count missing values per column before cleaning.
df.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: this rebinds `df`, so the raw frame is no longer available afterwards,
# and the original index labels are kept (the index is not reset).
df = df.dropna()

In [None]:
# Re-check missing values; every count should be 0 after dropna().
df.isnull().sum()

In [None]:
# Show the raw text of the first remaining row (positional index 0).

df.iloc[0]['text']

In [None]:
# Show the raw text of the fourth remaining row (positional index 3) as a second sample.
df.iloc[3]['text']

# CLEAN TEXT

In [None]:
import nltk

# Download the NLTK 'stopwords' resource that the lookup below reads.
nltk.download('stopwords')

# Hold the English stop words in a set so clean_text can do fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [None]:
import string

# Translation table that deletes every character of string.punctuation.
# Built once so each clean_word call is a single pass over the word.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_word(word:str) -> str:
    """Normalize a single token.

    The token is lowercased, every character listed in ``string.punctuation``
    is removed, and leading/trailing whitespace is trimmed.

    Args:
       word(str): the raw token

    Returns:
       str: the normalized token; empty when the input held only
       punctuation and/or whitespace
    """

    word = word.lower().translate(_PUNCTUATION_TABLE)

    # Strip AFTER removing punctuation: otherwise whitespace that sat next to a
    # leading/trailing punctuation mark (e.g. "hello !") would survive.
    return word.strip()

# Quick manual check: punctuation and outer whitespace are removed -> 'hello hola'.
clean_word("Hello!!! Hola    ")

In [None]:
# Show the exact set of characters that clean_word treats as punctuation.
string.punctuation

In [None]:
def clean_text(text:str) -> list[str]:
    """Tokenize a text into cleaned, meaningful words.

    The text is split on whitespace, each token is normalized with
    ``clean_word``, and tokens that are stop words or that end up empty
    (for example a token made only of punctuation such as "--") are dropped.

    Args:
       text(str): the raw text

    Returns:
       list[str]: the cleaned tokens, in their original order
    """

    cleaned_words = (clean_word(word) for word in text.split())

    # The truthiness check drops empty tokens so that '' never reaches the
    # embedding vocabulary; the second check removes stop words.
    return [word for word in cleaned_words if word and word not in stop_words]

In [None]:
# Scratch check: str.split() with no arguments splits on whitespace, which is how clean_text tokenizes.
'hola adios 3 444 666'.split()

In [None]:
# Smoke-test clean_text on a short sentence containing punctuation.
text = 'hola!! how are you today?'

clean_text(text)

In [None]:
# Tokenize and clean every article; each cell of the new column holds a list of tokens.
# Runs a Python-level function once per row.
df["clean_text"] = df["text"].apply(clean_text)

In [None]:
# Preview the frame again, now including the clean_text column.
df.head()

# Vectorize Words

In [None]:
import gensim

from scipy.linalg import get_blas_funcs
from scipy.linalg.lapack import get_lapack_funcs
from scipy.special import psi  
try:
    from numpy import triu

except ImportError:
    from scipy.linalg import triu

# Length of each word vector; vectorize_text uses the same constant for its accumulator.
EMBEDDING_DIM = 100

# Training corpus: one list of tokens per article.
sentences = df["clean_text"]

# Train Word2Vec on the notebook's own corpus.
# min_count = 1 keeps every token in the vocabulary, which vectorize_text relies on
# when it looks tokens up in model.wv.
# NOTE(review): no seed is passed, so the learned vectors (and everything derived
# from them) will presumably differ between runs — confirm whether that is acceptable.
model = gensim.models.Word2Vec(
    sentences = sentences,
    vector_size = EMBEDDING_DIM,
    window = 5,
    min_count = 1,
)

In [None]:
# Inspect the learned embedding for the token "trump".
model.wv["trump"]

In [None]:
# Sanity-check the embedding space: list the tokens closest to 'spain'.
model.wv.most_similar('spain')

In [None]:
import numpy as np

def vectorize_text(text:list[str]) -> np.ndarray:
    """Vectorize a text by averaging the word vectors of its tokens.

    Tokens that have no vector in ``model.wv`` are ignored instead of raising
    ``KeyError``.

    Args:
       text(list[str]): the cleaned tokens of the text

    Returns:
       np.ndarray: vector of length ``EMBEDDING_DIM`` holding the mean word
       vector. When no token has a vector (including an empty token list)
       every component is NaN, so such rows can be filtered with ``np.isnan``.
    """

    known_words = [word for word in text if word in model.wv]

    # Nothing to average: return NaN explicitly. This is the value the
    # 0 / 0 division used to produce, without the RuntimeWarning.
    if not known_words:
        return np.full(EMBEDDING_DIM, np.nan)

    text_vector = np.zeros(EMBEDDING_DIM)
    for word in known_words:
        text_vector += model.wv[word]

    return text_vector / len(known_words)

In [None]:
# Add a column with the text converted to vectors
              
# One averaged embedding per article, computed from its cleaned token list.
df["vector"] = df["clean_text"].apply(vectorize_text)

# TRAIN THE MODEL

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: one row per article, built by stacking the per-article vectors.
X = np.stack(df["vector"].values)
y = df["label"].values

# Locate rows whose vector contains NaN (vectorize_text yields NaN when a text
# has nothing to average).
nan_mask = np.isnan(X)
rows_with_nan = np.any(nan_mask, axis=1)
indices = np.where(rows_with_nan)[0]

# Drop those rows from features and labels together so they stay aligned.
X = X[~rows_with_nan]
y = y[~rows_with_nan]

# Fixed random_state so the split, and therefore the reported metrics, are
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default hyperparameters.
clf = SVC()

In [None]:
# Fit the classifier on the training split only; the test split stays unseen until evaluation.
clf.fit(X_train, y_train)

In [None]:
# Evaluate

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the held-out split.
y_pred = clf.predict(X_test)

# Score the predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# f1_score was imported for this cell but never computed.
f1 = f1_score(y_test, y_pred)

# Displayed as (accuracy, precision, recall, f1).
accuracy, precision, recall, f1

In [None]:
# Plot X_train vectors using PCA

from sklearn.decomposition import PCA

# Project the training vectors onto their first two principal components.
pca = PCA(n_components = 2)
pca.fit(X_train)

X_train_pca = pca.transform(X_train)

import matplotlib.pyplot as plt

# One point per training sample, coloured by its label.
scatter = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, s=1)

plt.colorbar(scatter, label="label")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("PCA projection of the training vectors")

# plt.show must be called; a bare `plt.show` only displays the function object.
plt.show()