In [4]:
from datasets import load_dataset,Dataset
import json
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

SBERT_MODEL = "all-MiniLM-L6-v2"
import pandas as pd
import numpy as np
import scipy.spatial as sp
from collections import defaultdict
from tqdm import tqdm
import faiss
from collections import ChainMap
from scipy.special import softmax
from torch.utils.data import Dataset,DataLoader
import torch
np.set_printoptions(precision=3)


In [5]:

THRESHOLD = 0.72
B_SIZE = 500
MAXLEN = 196


In [42]:
def load_vectorizer(model=SBERT_MODEL):
    return SentenceTransformer(model)


def vectorize_text(model, texts):
    return model.encode(texts, show_progress_bar=True)

def build_index(vector):
    
    d = vector.shape[1]
    index = faiss.IndexFlatIP(d)
    faiss.normalize_L2(rot_vector)
    index.add(rot_vector)
    return index
    

def get_conversations(rot_vector, posts_vector):
    
    index = build_index(rot_vector)
    num_result=5
    faiss.normalize_L2(posts_vector)
    matching_dict = defaultdict(list)
    for idx in tqdm(range(0, posts_vector.shape[0], B_SIZE)):
        I, D = index.search(posts_vector[idx:idx+B_SIZE],num_result)
        sim_indices = np.argwhere(I >= THRESHOLD)
        #return sim_indices
        for k, j in sim_indices:
            matching_dict[k+idx].append({D[k][j]:I[k][j]})

    return matching_dict

def get_top_n(data,n=2):
    
    data = dict(ChainMap(*data))
    data = {k: v for k, v in sorted(data.items(), key=lambda data: data[1],reverse=True)}
    return list(data.keys())[:n]
    

def convert_to_json(match_dict,all_rots,reddit_df):
    all_data = []
    posts = reddit_df['title'].values
    permalinks = reddit_df['permalink'].values
    for post_id,rot_list in match_dict.items():
        rot_ids = get_top_n(rot_list)
        ps_rots = [all_rots[idx] for idx in rot_ids]
        all_data.append({"context":posts[post_id], "rots":ps_rots,"permalink":permalinks[post_id],"episone_done":True})
    
    return all_data

In [8]:
dataset = load_dataset("allenai/prosocial-dialog",split="train")
conf_df = pd.read_csv("/home/shahul/Data/one-million-reddit-confessions.csv")

Using custom data configuration allenai--prosocial-dialog-ebbad39ca08b6d44
Found cached dataset json (/home/shahul/.cache/huggingface/datasets/allenai___json/allenai--prosocial-dialog-ebbad39ca08b6d44/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
  conf_df = pd.read_csv("/home/shahul/Data/one-million-reddit-confessions.csv")


In [9]:
conf_df_filtered = conf_df.query("score>3.0")
conf_df_filtered["title_len"] = conf_df_filtered['title'].map(lambda x : len(x.split()))
conf_df_filtered = conf_df_filtered.query("title_len>5")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conf_df_filtered["title_len"] = conf_df_filtered['title'].map(lambda x : len(x.split()))


In [10]:
rots = [item["rots"] for item in dataset]
rots = set([x for item in rots for x in item])
rots = list(rots)

In [12]:
model = load_vectorizer()

In [13]:
rot_vector = vectorize_text(model, rots)

Batches:   0%|          | 0/3630 [00:00<?, ?it/s]

In [14]:
posts = conf_df_filtered['title'].values.tolist()
posts_vector = vectorize_text(model,posts)

Batches:   0%|          | 0/5431 [00:00<?, ?it/s]

In [16]:
match_dict = get_conversations(rot_vector,posts_vector[:10000])

100%|██████████████████| 2/2 [00:03<00:00,  1.67s/it]


In [43]:
json_data = convert_to_json(match_dict,rots,conf_df_filtered.head(10000))

In [45]:
confession_dataset = Dataset.from_list(json_data)

In [46]:
confession_dataset[0]

{'context': 'i accused a family member of something they didn’t do',
 'rots': ['It is understandable to be accused of something you did not do.',
  "You should get a lawyer if you're accused of something you didn't do."],
 'permalink': 'https://old.reddit.com/r/confession/comments/pwv3h1/i_accused_a_family_member_of_something_they_didnt/',
 'episone_done': True}

## Pseudo label

In [37]:
label_to_id = {"__casual__":0,"__needs_caution__":1,"__needs_intervention__":2,"__probably_needs_caution__":3,"__possibly_needs_caution__":4}
id_to_label = {v:k for k,v in label_to_id.items()}

In [65]:

class ProSocialDataset(Dataset):
    
    def __init__(self,dataset,tokenizer):
        
        super().__init__()
        self.tokenizer = tokenizer
        self.sep_token  = self.tokenizer.sep_token
        self.dataset = dataset
        self.label2id = label_to_id
        self.id2label = {v:k for k,v in label_to_id.items()}
        self.max_len = MAXLEN
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self,idx):
        encoding = {}

        
        context = self.dataset[idx]["context"]
        rots = self.dataset[idx]["rots"]
        input_tokens = self.tokenizer.encode(self.dataset[idx]["context"],add_special_tokens=False)
        max_len = max(0,self.max_len - (len(input_tokens)+4))

        rots = self.tokenizer.encode(self.tokenizer.sep_token.join(rots),
                                     add_special_tokens=False,
                                   max_length=max_len,)
        
        
        input_ids = [0] + input_tokens + [self.tokenizer.sep_token_id] + rots + [self.tokenizer.eos_token_id]
        input_ids = input_ids + [self.tokenizer.pad_token_id] * max(0,(self.max_len - len(input_ids)))
        mask = [1]*len(input_ids) + [self.tokenizer.pad_token_id] * (self.max_len-len(input_ids))
        
        encoding["input_ids"] = torch.tensor(input_ids)
        encoding["attention_mask"] = torch.tensor(mask)
        
        return encoding
        
        
        

In [66]:
MODEL = "shahules786/prosocial-classifier"

In [75]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained('roberta-base')


In [67]:
def predict(pred_dataloader):
    
    probs, labels = [], []
    for data in tqdm(pred_dataloader):
        pred = softmax(
            model(data["input_ids"], data["attention_mask"])
            .logits.detach()
            .numpy()
            .astype("float16"),
            axis=1,
        )
        probs.append(pred.max(axis=1))
        labels.append([model.config.id2label.get(pred_id) for pred_id in pred.argmax(axis=1)])

    return np.concatenate(probs), np.concatenate(labels)


In [68]:
pred_dataset = ProSocialDataset(confession_dataset,tokenizer)

In [71]:
pred_dataset = DataLoader(pred_dataset,batch_size=4)

In [80]:
prob,label=predict(pred_dataset)

In [81]:
confession_dataset = confession_dataset.add_column("confidence", prob)
confession_dataset = confession_dataset.add_column("safety_label", label)
confession_dataset = confession_dataset.add_column("response", [None]*len(confession_dataset))

In [82]:
confession_dataset

Dataset({
    features: ['context', 'rots', 'permalink', 'episone_done', 'confidence', 'safety_label'],
    num_rows: 44
})

In [83]:
confession_dataset.upload_to_hub("shahules786/prosocial-confessions")

{'context': 'i accused a family member of something they didn’t do',
 'rots': ['It is understandable to be accused of something you did not do.',
  "You should get a lawyer if you're accused of something you didn't do."],
 'permalink': 'https://old.reddit.com/r/confession/comments/pwv3h1/i_accused_a_family_member_of_something_they_didnt/',
 'episone_done': True,
 'confidence': 0.4321,
 'safety_label': '__needs_caution__'}