In [None]:
!pip install supabase

In [None]:
!pip install psycopg2-binary


In [7]:
!mkdir ~/.kaggle


mkdir: cannot create directory '/root/.kaggle': File exists


In [8]:
!cp /kaggle/input/kaggle-token/kaggle.json ~/.kaggle

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from kaggle_secrets import UserSecretsClient
from supabase import create_client, ClientOptions
import torch
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from statistics import mode
import psycopg2
import os
import pandas as pd
import json

In [None]:
# Function to extract RoBERTa features for a batch of encodings
def extract_roberta_features_batch(model, encodings):
    input_ids = torch.tensor(encodings['input_ids'])
    attention_mask = torch.tensor(encodings['attention_mask'])

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state[:, 0, :].detach().numpy()

    return last_hidden_states

# Function to extract RoBERTa features for the entire data
def extract_roberta_features(model, encodings, batch_size=32):
    num_samples = len(encodings.input_ids)
    num_batches = (num_samples + batch_size - 1) // batch_size

    features = []
    for i in tqdm(range(num_batches), desc="Extracting RoBERTa Features"):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_samples)
        batch_encodings = {key: value[start_idx:end_idx] for key, value in encodings.items()}
        batch_features = extract_roberta_features_batch(model, batch_encodings)
        features.append(batch_features)

    features = np.concatenate(features, axis=0)
    return features


In [None]:
def fetch_data_from_supabase(schema, table, field, num_of_records=None):
    user_secrets = UserSecretsClient()

    # Connect to supabase
    supabase_url = "https://fglqovplibiyttjzqxuj.supabase.co"
    supabase_key = user_secrets.get_secret("SUPABASE_KEY")

    supabase = create_client(
                supabase_url,
                supabase_key,
                options=ClientOptions(
                  schema=schema
                ))


    # Execute query
    if num_of_records is not None:
        resp = supabase.table(table).select(field).limit(num_of_records).execute()
    else:
        resp = supabase.table(table).select(field).execute()

    data = resp.data

    data = [row[field] for row in data]
    return data

In [None]:
class WeightedKNNClassifier:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train, weights):
        self.X_train = X_train
        self.y_train = y_train
        self.weights = weights

    def predict(self, X_test):
        y_pred = []
        for x in X_test:
            distances = np.sqrt(np.sum((self.X_train - x) ** 2 * self.weights, axis=1))
            nearest_indices = np.argsort(distances)[:self.n_neighbors]
            nearest_labels = self.y_train[nearest_indices].astype(int)  # Cast nearest_indices to int
            pred_label = np.bincount(nearest_labels).argmax()
            y_pred.append(pred_label)
        return np.array(y_pred)

In [None]:
def propagate_labels(labeled_texts, labeled_texts_labels, unlabeled_texts, weights=None):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')

    # Load the CSV files of the features lakes
    df_true = pd.read_csv('/kaggle/input/features-lake/true_features_lake.csv')
    df_false = pd.read_csv('/kaggle/input/features-lake/false_features_lake.csv')

    # Extract the "features" column and store them in lists
    true_features = df_true['feature'].apply(eval).tolist()  # Convert string representation back to list
    false_features = df_false['feature'].apply(eval).tolist()  # Convert string representation back to list


    # Add labels and prepare data
    all_features = true_features + false_features
    all_labels = [0] * len(true_features) + [1] * len(false_features)

    # Tokenize and encode the text data
    labeled_texts_encodings = tokenizer(labeled_texts, truncation=True, padding=True)
    unlabeled_texts_encodings = tokenizer(unlabeled_texts, truncation=True, padding=True)

    # Extract the features of the encodings using RoBERTa
    labeled_texts_features = extract_roberta_features(model, labeled_texts_encodings)
    unlabeled_texts_features = extract_roberta_features(model, unlabeled_texts_encodings)

    # Concatenate the labeled texts features of this batch with the whole set of labeled features
    concatenated_features = np.concatenate((all_features, labeled_texts_features), axis=0)
    concatenated_labels = all_labels + labeled_texts_labels

    # Reshape the features to fill two dimensions
    concatenated_features_2d = concatenated_features.reshape(concatenated_features.shape[0], -1)
    unlabeled_texts_features_2d = unlabeled_texts_features.reshape(unlabeled_texts_features.shape[0], -1)

    pad_width = concatenated_features_2d.shape[1] - unlabeled_texts_features_2d.shape[1]
    unlabeled_texts_features_2d = np.pad(unlabeled_texts_features_2d, ((0, 0), (0, pad_width)), mode='constant')

    # Write the new features to the set of saved features
    # Prepare new records
    true_records = []
    false_records = []

    labeled_texts_features_2d = labeled_texts_features.reshape(labeled_texts_features.shape[0], -1)
    for index in range(len(labeled_texts_features_2d)):
        feature_str = str(labeled_texts_features_2d[index].tolist())  # Convert list to string for storage
        if labeled_texts_labels[index] == 0:
            true_records.append({"id": len(df_true) + index, "feature": feature_str})
        elif labeled_texts_labels[index] == 1:
            false_records.append({"id": len(df_false) + index, "feature": feature_str})

    # Convert lists of dictionaries to DataFrames
    df_true_new = pd.DataFrame(true_records)
    df_false_new = pd.DataFrame(false_records)

    # Concatenate the new records with the existing DataFrames
    df_true = pd.concat([df_true, df_true_new], ignore_index=True)
    df_false = pd.concat([df_false, df_false_new], ignore_index=True)

    os.makedirs('/kaggle/working/updated_dataset', exist_ok=True)
    # Save the updated dataframes back to CSV files
    df_true.to_csv('/kaggle/working/updated_dataset/true_features_lake.csv', index=False)
    df_false.to_csv('/kaggle/working/updated_dataset/false_features_lake.csv', index=False)

    metadata = {
    "title": "features-lake",
    "id": "nfrdkaggle/features-lake",
    "licenses": [{"name": "CC0-1.0"}]
    }

    with open('/kaggle/working/updated_dataset/dataset-metadata.json', 'w') as f:
        json.dump(metadata, f)


    # Train KNN classifier
    if weights is None:
        knn_classifier = KNeighborsClassifier(n_neighbors=5)
        knn_classifier.fit(concatenated_features_2d, concatenated_labels)
        predictions = knn_classifier.predict(unlabeled_texts_features_2d)
    else:
        knn_classifier = WeightedKNNClassifier(n_neighbors=5)
        knn_classifier.fit(concatenated_features_2d, concatenated_labels, weights=weights)
        predictions = knn_classifier.predict(unlabeled_texts_features_2d)


    return predictions


In [None]:
def perform_representative_replacement(features, labels):
    # Calculate distances between data points
    nn = NearestNeighbors(n_neighbors=len(features), metric='euclidean')
    nn.fit(features)
    distances, indices = nn.kneighbors(features)

    # Set a threshold for similarity
    threshold = 0.7

   # Check if labels are already numeric
    if all(isinstance(label, int) for label in labels):
        numerical_labels = labels
    else:
        label_map = {'true': 0, 'false': 1}
        numerical_labels = [label_map[label.lower()] for label in labels]

    # Group similar data points
    similar_groups = {}
    for i in range(len(features)):
        similar_group = [(i, numerical_labels[i])]
        for j, dist in zip(indices[i], distances[i]):
            if j != i and dist < threshold:
                similar_group.append((j, numerical_labels[j]))
        if len(similar_group) > 1:
            similar_groups[i] = similar_group

    # Select representative data points and calculate weights
    representatives = []
    representatives_labels = []
    weights = []
    for group in similar_groups.values():
        group_X = [features[i] for i, j in group]
        group_Y = [j for i, j in group]  # Index train_labels with integers
        # Calculate weight based on the number of data points in the group
        weight = len(group)
        weights.append([weight for i in range(len(features[0]))])
        # Select a representative data point
        representative = np.mean(group_X, axis=0)
        representative_label = mode(group_Y)
        representatives.append(representative)
        representatives_labels.append(representative_label)


    # Replace similar data points with representatives
    summarized_features = np.array(representatives)
    summarized_labels = np.array(representatives_labels)
    summarized_weights = np.array(weights)

    return summarized_features, summarized_labels, summarized_weights

In [None]:
!kaggle datasets version -p /kaggle/working/updated_dataset -m "Updated dataset with new features"