In [None]:
!pip install supabase

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from kaggle_secrets import UserSecretsClient
from supabase import create_client, ClientOptions
import torch
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from statistics import mode


In [None]:
def extract_roberta_features_batch(encodings):
    """Embed one batch of tokenized text with the module-level ``model``.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries that ``torch.tensor`` can convert (assumes every sequence
            in the batch is padded to the same length — TODO confirm for any
            new caller).

    Returns:
        A NumPy array holding, for each sequence, the hidden state at token
        position 0 of ``outputs.last_hidden_state`` (one row per sequence).
    """
    # Build the inputs on the model's own device so the function works
    # unchanged whether the model lives on the CPU or has been moved to a GPU.
    device = next(model.parameters()).device
    input_ids = torch.tensor(encodings['input_ids'], device=device)
    attention_mask = torch.tensor(encodings['attention_mask'], device=device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Keep only the first token's vector per sequence. `.cpu()` is a no-op for
    # CPU tensors and is required before `.numpy()` for tensors on a GPU.
    first_token_states = outputs.last_hidden_state[:, 0, :]
    return first_token_states.detach().cpu().numpy()

def extract_roberta_features(encodings, batch_size=32):
    """Embed every tokenized sample in ``encodings``, ``batch_size`` at a time.

    Args:
        encodings: Tokenizer output exposing ``.input_ids`` and ``.items()``;
            every value must be sliceable along the sample dimension.
        batch_size: Number of samples passed to
            ``extract_roberta_features_batch`` per call.

    Returns:
        All per-batch feature arrays concatenated along axis 0.
    """
    sample_count = len(encodings.input_ids)
    batch_count = (sample_count + batch_size - 1) // batch_size  # ceiling division

    def take_batch(batch_number):
        # Slicing clamps at the end of the sequence, so the last (possibly
        # shorter) batch needs no special casing.
        lo = batch_number * batch_size
        hi = lo + batch_size
        return {name: column[lo:hi] for name, column in encodings.items()}

    per_batch = [
        extract_roberta_features_batch(take_batch(batch_number))
        for batch_number in tqdm(range(batch_count), desc="Extracting RoBERTa Features")
    ]

    return np.concatenate(per_batch, axis=0)


In [None]:
def fetch_data_from_supabase(schema, table, field, num_of_records):
    """Read one column from a Supabase table.

    Args:
        schema: Database schema the client is scoped to.
        table: Table to query.
        field: Column to select; also the key read from each returned row.
        num_of_records: Value passed to the query's ``limit``.

    Returns:
        A list with the ``field`` value of each returned row.
    """
    # The API key comes from Kaggle secrets; only the project URL is inline.
    secrets = UserSecretsClient()
    project_url = "https://fglqovplibiyttjzqxuj.supabase.co"
    api_key = secrets.get_secret("SUPABASE_KEY")

    client = create_client(project_url, api_key, options=ClientOptions(schema=schema))

    # NOTE(review): the server may cap rows per request below the requested
    # limit — confirm the returned length when it matters.
    query = client.table(table).select(field).limit(num_of_records)
    rows = query.execute().data

    return [row[field] for row in rows]

In [None]:
def write_records_to_supabase(schema, table, records):
    """Insert ``records`` into a Supabase table, one request per record.

    Args:
        schema: Database schema the client is scoped to.
        table: Destination table.
        records: Iterable of row payloads, each passed to ``insert`` as-is.

    Returns:
        None. If the API rejects an insert, the error is printed and the
        remaining records are not written.
    """
    # The notebook's import cell does not bring an `APIError` name into scope,
    # so without this import the `except` clause below would itself fail with
    # NameError and hide the real API failure. `supabase` re-exports the
    # PostgREST error class under this name.
    from supabase import PostgrestAPIError as APIError

    user_secrets = UserSecretsClient()

    # Connect to Supabase
    supabase_url = "https://fglqovplibiyttjzqxuj.supabase.co"
    supabase_key = user_secrets.get_secret("SUPABASE_KEY")

    supabase = create_client(
        supabase_url,
        supabase_key,
        options=ClientOptions(schema=schema),
    )

    try:
        # Insert records into Supabase; the first API error stops the loop.
        for record in records:
            supabase.table(table).insert(record).execute()
    except APIError as e:
        print("API Error occurred:", e)

In [None]:
def visualize_features(features, labels, title, dim):
    """Scatter-plot ``features`` after PCA reduction to ``dim`` components.

    Args:
        features: Sample-by-feature data accepted by ``PCA.fit_transform``.
        labels: One ``'true'`` or ``'false'`` string per sample; any other
            value raises ``KeyError``.
        title: Title shown on the plot.
        dim: Number of principal components. A figure is drawn only for 2 or
            3; other values still run the PCA but draw nothing.
    """
    projected = PCA(n_components=dim).fit_transform(features)

    # Colour points by class: 'true' -> 0, 'false' -> 1.
    class_index = {'true': 0, 'false': 1}
    colours = [class_index[name] for name in labels]

    if dim == 2:
        # Flat scatter with a colour bar.
        xs, ys = projected.T
        plt.figure(figsize=(10, 6))
        plt.scatter(xs, ys, c=colours, cmap='viridis', alpha=0.5)
        plt.title(title)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.colorbar(label='Label')
        plt.show()
    elif dim == 3:
        # 3D scatter on a dedicated 3D axes.
        xs, ys, zs = projected.T
        figure = plt.figure(figsize=(10, 6))
        axes = figure.add_subplot(111, projection='3d')
        axes.scatter(xs, ys, zs, c=colours, cmap='viridis', alpha=0.5)
        axes.set_title(title)
        axes.set_xlabel('Principal Component 1')
        axes.set_ylabel('Principal Component 2')
        axes.set_zlabel('Principal Component 3')
        plt.show()


In [None]:
# Pull the "text" column of both news tables (true first, then false),
# requesting up to 85000 rows from each.
true_news, false_news = (
    fetch_data_from_supabase("text_datasets", source_table, "text", 85000)
    for source_table in ("true_text_dataset", "false_text_dataset")
)

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# vocabulary matches the model weights.
ROBERTA_CHECKPOINT = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model = RobertaModel.from_pretrained(ROBERTA_CHECKPOINT)

In [None]:
# Embed both news datasets in large batches and store the features in Supabase.
batch_size = 10000

# NOTE(review): batches 0-4 are skipped — presumably they were already written
# by an earlier, interrupted run. Set to 0 to process everything from scratch.
start_batch = 5

# Size the loop by the longer dataset so neither one is cut short when the two
# tables return a different number of rows.
num_samples = max(len(true_news), len(false_news))
num_batches = (num_samples + batch_size - 1) // batch_size  # Ceiling division
print(num_batches)

# (texts, destination table) pairs, processed in this order for every batch.
feature_targets = (
    (true_news, "true_features_lake"),
    (false_news, "false_features_lake"),
)

for batch_index in range(start_batch, num_batches):
    start_index = batch_index * batch_size
    end_index = min((batch_index + 1) * batch_size, num_samples)

    for texts, target_table in feature_targets:
        batch_texts = texts[start_index:end_index]
        if not batch_texts:
            # The shorter dataset is exhausted; there is nothing to encode or
            # write, and an empty batch would fail during concatenation.
            continue

        encodings = tokenizer(batch_texts, truncation=True, padding=True)
        features = extract_roberta_features(encodings)

        # One flat vector per text, serialised as a plain list for the insert.
        features_2d = features.reshape(features.shape[0], -1)
        records = [{"feature": feature.tolist()} for feature in features_2d]

        # Write batch of features to the database
        write_records_to_supabase("features_lake", target_table, records)

        # Free memory occupied by this batch before handling the next one
        del batch_texts, encodings, features, features_2d, records