In [None]:
# Put id, type, content and title into a temp database (only rows whose type is fake or reliable)
import sqlite3
import csv
import random
n = 0           # data rows read from the CSV so far (header excluded)
n_inserted = 0  # rows whose type was "fake" or "reliable" and were stored

input_file = "cleaned_file.csv"
db_file = "temp_database.db"

# Positions of the columns of interest inside each CSV row.
ID_COL, TYPE_COL, CONTENT_COL, TITLE_COL = 1, 3, 5, 9
KEPT_TYPES = ("fake", "reliable")

# Connect to the SQLite database file
conn = sqlite3.connect(db_file)
try:
    c = conn.cursor()
    # Start from an empty table so that re-running this cell neither fails
    # with "table data already exists" nor appends a second copy of the rows.
    c.execute("DROP TABLE IF EXISTS data;")
    c.execute("CREATE TABLE data (id TEXT, type TEXT, content TEXT, title TEXT);")

    # Read the input file and insert rows into the SQLite database
    with open(input_file, "r") as f_in:
        reader = csv.reader(f_in)
        header = next(reader, None)  # Skip the header (None if the file is empty)

        for row in reader:
            row_type = row[TYPE_COL]
            if row_type in KEPT_TYPES:
                # Remove newlines from each value so every record is one line
                values = tuple(
                    row[col].replace("\n", " ")
                    for col in (ID_COL, TYPE_COL, CONTENT_COL, TITLE_COL)
                )
                c.execute("INSERT INTO data (id, type, content, title) VALUES (?, ?, ?, ?)", values)
                n_inserted += 1
            n += 1
            if n % 1000 == 0:
                # "\r" with end='' rewrites the same output line as a progress counter
                print("reading row:", n, "inserted", n_inserted, "rows\r", end = '')
    conn.commit()  # Commit changes to the database file
finally:
    # Always release the database file, even if reading the CSV failed.
    conn.close()
print()
print("finished")

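# Run the following statement in the sqlite3 shell on `temp_database.db` (it removes duplicate articles and shuffles the rows):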

```
-- Build `nodup`: one row per distinct `content`, stored in random order.
CREATE TABLE nodup AS
-- rnd is a per-row random sort key used only for shuffling.
SELECT id, type, content, title, RANDOM() AS rnd
FROM data
-- Collapses rows that share the same content into a single row.
-- NOTE(review): id/type/title are not aggregated, so which duplicate supplies
-- them is presumably arbitrary -- confirm this is acceptable.
GROUP BY content
ORDER BY rnd;

```

In [None]:
#put back into csv
import csv
import sqlite3

# Number of rows pulled from the cursor per fetch (bounds memory use)
batch_size = 500000
database_path = 'temp_database.db'
output_file = 'shuffled_fake_reliable.csv'

# Connect to the database and create a cursor
conn = sqlite3.connect(database_path)
try:
    c = conn.cursor()

    # Run ONE query and stream it in batches. Paging with LIMIT/OFFSET and no
    # ORDER BY does not guarantee a stable row order between queries and
    # re-scans the skipped rows for every batch. Ordering by rowid keeps the
    # order in which the rows were inserted into `nodup`.
    c.execute("SELECT id, type, content, title FROM nodup ORDER BY rowid")
    column_names = [description[0] for description in c.description]
    print('column names:', column_names)

    # Open the output file and write the header row
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(column_names)

        while True:
            rows = c.fetchmany(batch_size)

            # An empty batch means the result set is exhausted
            if not rows:
                break

            # Write the rows to the output file
            writer.writerows(rows)
finally:
    # Close the database connection even if the export failed
    conn.close()

In [None]:
## Parquetise sql filtered data
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


import os

csv_file_path = 'shuffled_fake_reliable.csv'
parquet_file_prefix = 'cleaned_file.parquet/file_'
parquet_file_suffix = '.parquet'

column_dtypes = {
    "id": int,
    "type": object,
    "content": object,
    "title": object
}
chunk_size = 50000  # number of rows per chunk

# The prefix points inside a directory; create it up front so the first
# write does not fail on a fresh run.
parquet_dir = os.path.dirname(parquet_file_prefix)
if parquet_dir:
    os.makedirs(parquet_dir, exist_ok=True)

# Iterate over the CSV file in chunks and write each chunk to a separate Parquet file
for i, chunk in enumerate(pd.read_csv(csv_file_path, chunksize=chunk_size, dtype=column_dtypes)):
    # Zero-padded chunk number keeps the files in order when sorted by name
    ident = str(i).zfill(4)
    parquet_file_path = parquet_file_prefix + ident + parquet_file_suffix

    # Write the current chunk to a Parquet file
    table = pa.Table.from_pandas(chunk)
    pq.write_table(table, parquet_file_path)
    print("\rchunk", ident)

In [None]:
## But this
import os
import glob
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from scipy.sparse import save_npz
import contractions
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from functools import lru_cache


@lru_cache(maxsize=1)
def _tokenizer_resources():
    """Build the regex tokenizer, stopword set and lemmatizer once and reuse them.

    Creating these for every document repeats identical work on each call;
    they are built lazily on first use and cached afterwards.
    """
    return (
        RegexpTokenizer(r'\b[a-z]+\b'),
        frozenset(stopwords.words("english")),
        WordNetLemmatizer(),
    )


# Custom tokenizer with lemmatization
def custom_tokenizer(text):
    """Turn raw text into a list of lemmatized, stopword-free lowercase words.

    Steps: expand contractions, lowercase, keep only runs of the letters a-z,
    drop English stopwords, lemmatize each remaining word.

    Args:
        text: the document text (a string).

    Returns:
        A list of lemmatized tokens, in document order.
    """
    tokenizer, stopwords_set, lemmatizer = _tokenizer_resources()

    # Expand contractions first, then lowercase
    lower_text = contractions.fix(text).lower()

    return [
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(lower_text)
        if word not in stopwords_set
    ]

label_mapping = {'fake': 1, 'reliable': 0}

# Initialize the TfidfVectorizer. IDF is enabled from the start: flipping
# `use_idf` on an already-fitted vectorizer does not learn IDF weights.
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=2**12, use_idf=True)

# Directory containing your Parquet files
parquet_directory = 'cleaned_file.parquet/'

numpy_directory = 'numpyfiler/'
# Make sure the output directory exists before anything is saved into it
os.makedirs(numpy_directory, exist_ok=True)

parquet_files = sorted(glob.glob(os.path.join(parquet_directory, '*.parquet')))
if not parquet_files:
    raise FileNotFoundError(f"no parquet files found in {parquet_directory!r}")


def _iter_all_contents(files):
    """Yield the text of every document, reading one parquet file at a time."""
    for path in files:
        print("first pass: Parsing", str(path))
        contents = pd.read_parquet(path, engine='pyarrow', columns=['content'])['content']
        # Skip missing texts; the vectorizer cannot process null documents
        yield from contents.dropna()


# First pass: fit the TfidfVectorizer on the entire dataset with ONE fit call.
# Calling fit() once per file would refit from scratch each time, leaving a
# vocabulary learned from the last file only. Streaming the documents through
# a generator keeps only one file's raw text in memory at a time.
# NOTE(review): the fit also sees the rows that later become the test split.
tfidf_vectorizer.fit(_iter_all_contents(parquet_files))

# Initialize counters for training and test set file indices
train_idx = 0
test_idx = 0

# Fraction of the reliable training rows that is kept.
# NOTE(review): presumably chosen to balance reliable against fake -- confirm.
reliable_factor = 0.426192247178894

# Second pass: Transform the content using the TfidfVectorizer and save it incrementally to separate files
for parquet_file in parquet_files:
    print("second pass: parsing", str(parquet_file))
    df = pd.read_parquet(parquet_file, engine='pyarrow').dropna(subset=['content'])

    # Split the current chunk of data into training and testing sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Separate the training chunk into fake and reliable classes
    train_fake_df = train_df[train_df['type'] == 'fake']
    train_reliable_df = train_df[train_df['type'] == 'reliable']

    # Keep only `reliable_factor` of the reliable samples in the training chunk
    train_reliable_df_downsampled = train_reliable_df.sample(n=round(len(train_reliable_df) * reliable_factor), replace=False, random_state=42)

    # Concatenate and shuffle the rebalanced training chunk
    train_balanced_df = pd.concat([train_fake_df, train_reliable_df_downsampled], ignore_index=True).sample(frac=1, random_state=42)

    # Transform the content for the balanced training chunk and testing chunk
    X_train_balanced = tfidf_vectorizer.transform(train_balanced_df['content'])
    X_test_chunk = tfidf_vectorizer.transform(test_df['content'])

    # Create labels array for the balanced training chunk and testing chunk
    y_train_balanced = np.array([label_mapping[label] for label in train_balanced_df['type']])
    y_test_chunk = np.array([label_mapping[label] for label in test_df['type']])

    # Save the training data (sparse features as .npz, labels as .npy)
    save_npz(os.path.join(numpy_directory, f'X_train_{train_idx:04d}.npz'), X_train_balanced)
    np.save(os.path.join(numpy_directory, f'y_train_{train_idx:04d}.npy'), y_train_balanced)

    # Save the test data (sparse features as .npz, labels as .npy)
    save_npz(os.path.join(numpy_directory, f'X_test_{test_idx:04d}.npz'), X_test_chunk)
    np.save(os.path.join(numpy_directory, f'y_test_{test_idx:04d}.npy'), y_test_chunk)

    # Increment the counters
    train_idx += 1
    test_idx += 1



In [None]:
import os
import numpy as np
from scipy.sparse import load_npz
import glob
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
numpy_dir = 'numpyfiler/'

# Locate the training data; sorting pairs X_train_NNNN with y_train_NNNN
train_files = sorted(glob.glob(numpy_dir+'X_train_*.npz'))
train_label_files = sorted(glob.glob(numpy_dir+'y_train_*.npy'))

# Fail early with a clear message instead of an IndexError or a silently
# truncated zip() further down.
if not train_files:
    raise FileNotFoundError(f"no X_train_*.npz files found in {numpy_dir!r}")
if len(train_files) != len(train_label_files):
    raise ValueError(
        f"found {len(train_files)} feature files but {len(train_label_files)} label files in {numpy_dir!r}"
    )

# Determine the input dimension from the first training file
input_dim = load_npz(train_files[0]).shape[1]

# Create a neural network model
model = Sequential()
model.add(Dense(1024, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model incrementally using the saved training set files
batch_size = 1536
epochs = 1

for x_file, y_file in zip(train_files, train_label_files):
    print("training on", x_file, y_file)
    X_train_chunk = load_npz(x_file)
    # Labels are a plain numeric array, so pickle support is not needed
    y_train_chunk = np.load(y_file)

    # Train the model in smaller batches (ceiling division covers the last partial batch)
    num_samples = X_train_chunk.shape[0]
    num_batches = (num_samples + batch_size - 1) // batch_size

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, num_samples)

            # Densify only the current batch; toarray() gives a regular
            # ndarray, whereas todense() would give an np.matrix.
            X_batch = X_train_chunk[start_idx:end_idx].toarray()
            y_batch = y_train_chunk[start_idx:end_idx]

            loss, acc = model.train_on_batch(X_batch, y_batch)
            print(f" - Batch {batch_idx + 1}/{num_batches}: loss={loss:.4f}, accuracy={acc:.4f}")
            


# Function to generate a unique filename
def get_unique_filename(filename_prefix):
    """Return `filename_prefix` plus the smallest positive integer not yet taken.

    Candidates are `<prefix>1`, `<prefix>2`, ... and the first one for which
    no file or directory exists is returned.
    """
    suffix = 1
    candidate = f'{filename_prefix}{suffix}'
    while os.path.exists(candidate):
        suffix += 1
        candidate = f'{filename_prefix}{suffix}'
    return candidate

# Save the model under a fresh name (my_saved_model1, my_saved_model2, ...)
# so an earlier saved model is never overwritten.
# NOTE(review): the name has no file extension; some Keras versions require
# one for model.save -- confirm against the installed version.
unique_filename = get_unique_filename('my_saved_model')
model.save(unique_filename)


In [None]:
# Make predictions on test data 
import numpy as np
from scipy.sparse import load_npz
import glob
from sklearn.metrics import accuracy_score
batch_size = 128
# Load the test set files; sorting pairs X_test_NNNN with y_test_NNNN
numpy_dir = 'numpyfiler/'
test_files = sorted(glob.glob(numpy_dir+'X_test_*.npz'))
test_label_files = sorted(glob.glob(numpy_dir+'y_test_*.npy'))

# zip() below would silently drop unmatched files, so check the pairing first
if len(test_files) != len(test_label_files):
    raise ValueError(
        f"found {len(test_files)} feature files but {len(test_label_files)} label files in {numpy_dir!r}"
    )

y_pred = []
y_true = []

# Make predictions on the test data (`model` is the network trained in the previous cell)
for x_file, y_file in zip(test_files, test_label_files):
    print("predicting on", x_file, y_file)
    X_test_chunk = load_npz(x_file)
    # Labels are a plain numeric array, so pickle support is not needed
    y_test_chunk = np.load(y_file)

    # Process the test data in smaller batches (ceiling division covers the last partial batch)
    num_samples = X_test_chunk.shape[0]
    num_batches = (num_samples + batch_size - 1) // batch_size

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, num_samples)

        # Densify only the current batch; toarray() gives a regular ndarray,
        # whereas todense() would give an np.matrix.
        X_batch = X_test_chunk[start_idx:end_idx].toarray()
        y_batch = y_test_chunk[start_idx:end_idx]

        # Get the predictions for this batch; verbose=0 avoids printing one
        # progress bar per batch
        y_pred_chunk = model.predict(X_batch, verbose=0)

        # Since the output activation is sigmoid, we need to threshold the predictions
        y_pred_chunk = (y_pred_chunk > 0.5).astype(int).flatten()

        y_pred.extend(y_pred_chunk)
        y_true.extend(y_batch)

# Calculate the accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
# Class names indexed by encoded label (the labels were saved with reliable=0, fake=1)
class_names = ["reliable", "fake"]
class_ids = list(range(len(class_names)))

# Make confusion matrix, normalized so each true-label row sums to 1.
# Passing `labels` fixes the matrix at 2x2 even if one class is absent from
# y_true / y_pred.
confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=class_ids, normalize="true")

# Plot confusion matrix; vmin/vmax pin the shading to the 0..1 range so the
# cell colour matches the white/black text threshold at 0.5.
plt.figure(figsize=(5, 5))
plt.imshow(confusion_matrix, interpolation="nearest", cmap=plt.cm.gray_r, vmin=0, vmax=1)
for i in class_ids:
    for j in class_ids:
        plt.text(j, i, format(confusion_matrix[i, j], '.2f'), horizontalalignment="center", verticalalignment="center", color="white" if confusion_matrix[i, j] > 0.5 else "black")
plt.xticks(class_ids, class_names)
plt.yticks(class_ids, class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix (normalized by true label)")
plt.show()