In [None]:
## Parquetise only fake and reliable news
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


# Export the "fake" and "reliable" rows of the cleaned CSV as a series of
# Parquet files, one file per CSV chunk that contains at least one such row.
csv_file_path = "cleaned_file.csv"
parquet_file_prefix = "cleaned_file.parquet/file_"
parquet_file_suffix = ".parquet"

column_dtypes = {"id": int, "type": object, "content": object}
desired_columns = ["id", "content", "type"]
kept_types = ["fake", "reliable"]  # values of "type" that are exported
chunk_size = 100000  # number of rows per chunk

# pq.write_table does not create missing parent directories, so the output
# directory has to exist before the first chunk is written.
output_directory = os.path.dirname(parquet_file_prefix)
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# Stream the CSV in fixed-size chunks; only the exported columns are parsed.
csv_chunks = pd.read_csv(
    csv_file_path,
    chunksize=chunk_size,
    dtype=column_dtypes,
    usecols=desired_columns,
)

for i, chunk in enumerate(csv_chunks):
    # Filter rows based on the "type" column
    chunk = chunk[chunk["type"].isin(kept_types)]

    # A chunk without matching rows produces no file, so the numbering of the
    # output files can have gaps.
    if chunk.empty:
        continue

    # The file name carries the zero-padded index of the CSV chunk
    ident = str(i).zfill(4)
    parquet_file_path = parquet_file_prefix + ident + parquet_file_suffix

    # Write the current chunk to a Parquet file
    table = pa.Table.from_pandas(chunk[desired_columns])
    pq.write_table(table, parquet_file_path)
    print("\rchunk", ident)

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from scipy.sparse import save_npz


# Custom tokenizer with lemmatization
def custom_tokenizer(text):
    """Tokenize ``text`` with NLTK and return the WordNet lemma of every token.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with one lemmatized token per token produced by
        ``nltk.word_tokenize``, in the same order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in nltk.word_tokenize(text)]


# Integer encoding of the two article classes
label_mapping = {"fake": 1, "reliable": 0}

# Directory containing your Parquet files
parquet_directory = "cleaned_file.parquet/"

# glob returns paths in arbitrary order; sorting keeps the numbering of the
# output files reproducible, and the list is reused by both passes.
parquet_files = sorted(glob.glob(os.path.join(parquet_directory, "*.parquet")))
if not parquet_files:
    raise FileNotFoundError(f"no Parquet files found in {parquet_directory!r}")


def _iter_contents(paths):
    """Yield every value of the "content" column, one Parquet file at a time."""
    for path in paths:
        print("first pass: Parsing", str(path))
        contents = pd.read_parquet(path, engine="pyarrow", columns=["content"])
        yield from contents["content"]


# Initialize the TfidfVectorizer. IDF weighting is enabled from the start:
# the IDF vector is learned during fit, so enabling it afterwards is too late.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer, max_features=2**18, use_idf=True
)

# First pass: fit the TfidfVectorizer on the entire dataset with a single
# fit() call. Every fit() call starts from scratch, so fitting file by file
# would keep only the vocabulary of the last file.
# NOTE(review): a single fit presumably needs the sparse term counts of the
# whole corpus in memory - confirm this fits on the target machine.
tfidf_vectorizer.fit(_iter_contents(parquet_files))

# Second pass: transform the content of each Parquet file and save a
# train/test split of it to separate, numbered files.
for chunk_idx, parquet_file in enumerate(parquet_files):
    print("second pass: parsing", str(parquet_file))
    df = pd.read_parquet(parquet_file, engine="pyarrow")

    # Transform the content and encode the labels; an unexpected label raises
    # KeyError instead of being silently written out.
    vectorized_contents = tfidf_vectorizer.transform(df["content"])
    labels = np.array([label_mapping[label] for label in df["type"]])

    # Split the current chunk of data into training and testing sets
    X_train_chunk, X_test_chunk, y_train_chunk, y_test_chunk = train_test_split(
        vectorized_contents, labels, test_size=0.2, random_state=42
    )

    # Save the training and testing sets as binary files. Features and labels
    # of the same chunk share one index so they can be paired by file name.
    save_npz(f"X_train_{chunk_idx:04d}.npz", X_train_chunk)
    save_npz(f"X_test_{chunk_idx:04d}.npz", X_test_chunk)
    np.save(f"y_train_{chunk_idx:04d}.npy", y_train_chunk)
    np.save(f"y_test_{chunk_idx:04d}.npy", y_test_chunk)

In [None]:
import numpy as np
from scipy.sparse import load_npz
import glob
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

# Collect the training files; sorting both lists pairs each feature file with
# the label file that carries the same index.
train_files = sorted(glob.glob("X_train_*.npz"))
train_label_files = sorted(glob.glob("y_train_*.npy"))

if not train_files:
    raise FileNotFoundError("no X_train_*.npz files found in the working directory")
if len(train_files) != len(train_label_files):
    # zip() would silently drop the surplus files and could pair features
    # with the wrong labels.
    raise ValueError(
        f"found {len(train_files)} feature files but "
        f"{len(train_label_files)} label files"
    )

# Determine the input dimension from the first training file
input_dim = load_npz(train_files[0]).shape[1]

# Create a neural network model: two hidden ReLU layers and a sigmoid output
# unit for binary classification.
model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

# Train the model incrementally using the saved training set files
batch_size = 1536
epochs = 1

# The epoch loop is the outer loop so that every epoch is one pass over all
# files, rather than several passes over one file before the next is seen.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    for x_file, y_file in zip(train_files, train_label_files):
        print("training on", x_file, y_file)
        # Casting while the data is still sparse keeps each densified batch
        # small. NOTE(review): the stored matrices are presumably float64
        # (scikit-learn's default) - confirm.
        X_train_chunk = load_npz(x_file).astype(np.float32)
        # Plain numeric arrays do not need pickle support to load.
        y_train_chunk = np.load(y_file)

        # Train the model in smaller batches
        num_samples = X_train_chunk.shape[0]
        num_batches = (num_samples + batch_size - 1) // batch_size

        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, num_samples)

            # toarray() yields a regular ndarray; todense() would return the
            # discouraged np.matrix type.
            X_batch = X_train_chunk[start_idx:end_idx].toarray()
            y_batch = y_train_chunk[start_idx:end_idx]

            loss, acc = model.train_on_batch(X_batch, y_batch)
            print(
                f" - Batch {batch_idx + 1}/{num_batches}: loss={loss:.4f}, accuracy={acc:.4f}"
            )

model.save("my_saved_model")

In [None]:
from tensorflow.keras.models import load_model

# Read the model stored under "my_saved_model" back from disk and bind it to
# `model`.
saved_model_path = "my_saved_model"
model = load_model(saved_model_path)

In [None]:
# Make predictions on test data (doesn't work)
import numpy as np
from scipy.sparse import load_npz
import glob
from sklearn.metrics import accuracy_score

batch_size = 128

# Collect the test files; sorting both lists pairs each feature file with the
# label file that carries the same index.
test_files = sorted(glob.glob("X_test_*.npz"))
test_label_files = sorted(glob.glob("y_test_*.npy"))

if not test_files:
    raise FileNotFoundError("no X_test_*.npz files found in the working directory")
if len(test_files) != len(test_label_files):
    # zip() would silently drop the surplus files and could pair features
    # with the wrong labels.
    raise ValueError(
        f"found {len(test_files)} feature files but "
        f"{len(test_label_files)} label files"
    )

y_pred = []
y_true = []

# Make predictions on the test data
for x_file, y_file in zip(test_files, test_label_files):
    print("predicting on", x_file, y_file)
    # Casting while the data is still sparse keeps each densified batch small.
    X_test_chunk = load_npz(x_file).astype(np.float32)
    # Plain numeric arrays do not need pickle support to load.
    y_test_chunk = np.load(y_file)

    # Process the test data in smaller batches
    num_samples = X_test_chunk.shape[0]

    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)

        # toarray() yields a regular ndarray; todense() would return the
        # discouraged np.matrix type.
        X_batch = X_test_chunk[start_idx:end_idx].toarray()
        y_batch = y_test_chunk[start_idx:end_idx]

        # predict_on_batch scores exactly one batch; calling predict() here
        # would set up a new prediction loop and progress bar on every call.
        y_pred_chunk = model.predict_on_batch(X_batch)

        # Since the output activation is sigmoid, we need to threshold the predictions
        y_pred_chunk = (y_pred_chunk > 0.5).astype(int).flatten()

        y_pred.extend(y_pred_chunk)
        y_true.extend(y_batch)

# Calculate the accuracy over all test chunks
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)