In [None]:
! huggingface-cli login --token XXXXXXXXXXXXXXXXXXXXXXXxxxxxxxxx

In [None]:
from transformers import pipeline
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd 
import re
import json


In [None]:
# Shared text-generation pipeline used by run_filter for every request.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm this is still required for this model.
pipe = pipeline(
    task="text-generation",
    model="microsoft/Phi-3-mini-128k-instruct",
    trust_remote_code=True,
)

In [None]:
# System-role message placed first in every conversation built by run_filter.
system_prompt = """
You are a text classification model with the ability to filter and categorize texts based on specified topics.
"""

# First user-role message sent by run_filter: lists the five topics and shows
# the JSON layout the reply should follow. llm_filter reads the "classification"
# key of the parsed reply, so that key name must stay in sync with this template.
# NOTE(review): the instructions say each entry includes the original text, but
# the example entry has no text field, and the example list ends with a trailing
# comma (not strict JSON) — confirm whether this is intentional.
user_prompt = """
Please process the provided text and filter it according to the following topics:
1. Safety
2. Privacy
3. Harassment
4. Hate Speech
5. Misinformation

For each text, determine whether it is relevant to these topics and categorize it accordingly. Return the results in JSON format, where each entry includes the original text and its classification for each topic.

**Format:**
```json
[
    {
        "classification": {
            "Safety": "Yes/No",
            "Privacy": "Yes/No",
            "Harassment": "Yes/No",
            "Hate Speech": "Yes/No",
            "Misinformation": "Yes/No"
        }
    },

]
```
"""

In [None]:
def run_filter(text, max_new_tokens=2048):
    """Ask the module-level ``pipe`` to classify one text and return its raw reply.

    The conversation sent to the model is: ``system_prompt``, ``user_prompt``,
    then a final user message carrying ``text``.

    Args:
        text: The text to classify; it is interpolated into the last user message.
        max_new_tokens: Upper bound on generated tokens (defaults to the
            previously hard-coded 2048).

    Returns:
        The ``content`` of the last message in the generated conversation,
        i.e. the model's reply, as returned by the pipeline.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "user", "content": f"My text : {text}"},
    ]
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "return_full_text": True,
        "temperature": 0.0,
        "do_sample": False,
    }
    output = pipe(messages, **generation_args)
    # return_full_text=True echoes the input messages before the reply, so the
    # reply is the last entry. Indexing with -1 (instead of a fixed 3) keeps
    # this correct if the number of prompt messages ever changes.
    return output[0]['generated_text'][-1]['content']

In [None]:
def _iter_brace_spans(text):
    """Yield each top-level ``{...}`` span of ``text`` from left to right.

    Braces inside double-quoted strings (with backslash escapes) are ignored
    while inside a span. An unterminated trailing span is yielded as-is so the
    caller can report it as a parse error.
    """
    depth = 0
    start = None
    in_string = False
    escaped = False
    for pos, char in enumerate(text):
        if depth == 0:
            # Outside any object: only an opening brace is interesting.
            if char == "{":
                depth, start = 1, pos
            continue
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                yield text[start:pos + 1]
    if depth > 0:
        yield text[start:]


def extract_json_from_string(text):
    """Extract every top-level JSON object embedded in ``text``.

    The previous greedy ``{.*}`` pattern produced a single span from the first
    ``{`` to the last ``}``, so replies holding several objects (for example a
    JSON list) or stray braces in surrounding prose failed to parse. Each
    balanced top-level object is now parsed on its own.

    Args:
        text: Raw model output. Non-string input is treated as containing no JSON.

    Returns:
        A tuple ``(objs, errors)``:
          * ``objs`` - list of successfully decoded objects, in order of appearance.
          * ``errors`` - list holding either the string ``"No JSON patterns found"``
            or ``(index, json_string, message)`` tuples for spans that failed to decode.
    """
    objs = []
    errors = []

    candidates = list(_iter_brace_spans(text)) if isinstance(text, str) else []
    if not candidates:
        errors.append("No JSON patterns found")

    for index, json_string in enumerate(candidates):
        try:
            objs.append(json.loads(json_string))
        except json.JSONDecodeError as e:
            errors.append((index, json_string, f"JSONDecodeError: {e}"))
        except Exception as e:
            errors.append((index, json_string, f"Error: {e}"))

    return objs, errors

In [None]:
def llm_filter(ds_name, split, target_column, save=False):
    """Classify every text in one dataset column and return the results as a table.

    Each value of ``target_column`` is sent through ``run_filter``; the reply is
    parsed with ``extract_json_from_string`` and the first parsed object's
    ``"classification"`` mapping is expanded into one column per topic.

    Args:
        ds_name: Dataset identifier passed to ``load_dataset``.
        split: Dataset split passed to ``load_dataset``.
        target_column: Name of the column whose values are classified.
        save: When true, also write the table to
            ``<last path part of ds_name>-<target_column>-Filtered.csv``.

    Returns:
        A DataFrame with ``text`` and ``errors`` columns plus one column per
        classification key. Rows without a usable classification have missing
        values in the classification columns.
    """
    ds = load_dataset(ds_name, split=split)

    records = []
    for text in tqdm(ds[target_column], desc="Processing texts"):
        objs, errors = extract_json_from_string(run_filter(text))

        # Only the first parsed object and the first error are kept per text.
        first_obj = objs[0] if objs else None
        classification = first_obj.get("classification") if isinstance(first_obj, dict) else None
        if not isinstance(classification, dict):
            classification = None
        error_info = errors[0] if errors else None

        # A reply that parsed but lacks the expected mapping used to raise
        # KeyError and abort the whole run; record it on the row instead.
        if objs and classification is None and error_info is None:
            error_info = "No 'classification' object in parsed JSON"

        records.append({
            "text": text,
            "classification": classification,
            "errors": error_info,
        })

    # Explicit columns keep the frame well-formed even when the split is empty.
    df = pd.DataFrame(records, columns=["text", "classification", "errors"])
    # Substitute {} for missing classifications so normalization never sees None.
    classification_df = pd.json_normalize([row["classification"] or {} for row in records])
    final_df = df.drop(columns=["classification"]).join(classification_df)
    if save:
        final_df.to_csv(f"{ds_name.rsplit('/')[-1]}-{target_column}-Filtered.csv", index=False)

    return final_df
    

In [None]:
# Classify the "response" column of the train split and save the result as CSV.
new_df = llm_filter(
    ds_name="ayoubkirouane/Small-Instruct-Alpaca_Format",
    split="train",
    target_column="response",
    save=True,
)

In [None]:
# Show the filtered table as this cell's output.
new_df