In [1]:
import sys

# Put the sibling `src` directory on the import path (the relative entry is
# resolved against the working directory when imports are looked up).
# Presumably this is what makes `utils.util_fnc` importable in the next cell
# -- TODO confirm the package lives under ../src.
sys.path.append('../src')

In [2]:
import torch
import transformers
from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForSequenceClassification
from dotenv import load_dotenv
import pandas as pd
import os
from dotenv import load_dotenv
import time
import re
import random

from utils.util_fnc import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load_dotenv() (python-dotenv) copies variables from a .env file into the
# process environment; the two credentials are then looked up by name.
# A variable that is not set yields None rather than raising.
load_dotenv()
huggingface_token, openai_key = (
    os.environ.get(var_name) for var_name in ("huggingface_token", "open_ai_key")
)

In [4]:
# Load the run configuration, then the first-step prompt definitions stored at
# <config.root_dir>/prompt/first_step_prompt.yaml.
# `load_yaml` is not defined in this notebook; presumably it comes from the
# `utils.util_fnc` star import. Its result is used with attribute access
# (e.g. `config.root_dir`, `config.model`).
config = load_yaml('../src/config/text_classification_config.yaml')
prompt_dict = load_yaml(os.path.join(config.root_dir, 'prompt', 'first_step_prompt.yaml'))

In [5]:
# Read the sample articles from <config.root_dir>/data/dummy_articles/articles.csv,
# using the CSV's first column as the DataFrame index. Later cells read the
# `Content` and `Category` columns of each row.
dummy_text = pd.read_csv(os.path.join(config.root_dir, 'data', 'dummy_articles', 'articles.csv'), index_col=0)

In [6]:
# Prepare the backend selected by `config.model`.
# NOTE: 'lamma2' is the literal value the configuration is compared against;
# keep the spelling in sync with the YAML config.
if config.model == 'lamma2':
    tokenizer = AutoTokenizer.from_pretrained(config.model_name, token=huggingface_token)

    # Use half precision only when a CUDA device is both available and
    # requested; otherwise force CPU execution with float32.
    if torch.cuda.is_available() and 'cuda' in config.device:
        torch_dtype = torch.float16
    else:
        config.device = 'cpu'
        torch_dtype = torch.float32

    # NOTE: this rebinds the module-level name `pipeline` (also imported from
    # transformers in the import cell) to the constructed pipeline object.
    # `answer` calls this global, so the name must stay as it is.
    pipeline = transformers.pipeline(
        config.task,
        model=config.model_name,
        tokenizer=tokenizer,
        torch_dtype=torch_dtype,
        device=config.device
    )

elif config.model == 'gpt3.5':
    # Imported here so that the llama-only path does not require the openai
    # package to be installed.
    import openai

    # The key is read from the non-standard env name `open_ai_key`, so the
    # openai client cannot discover it on its own; hand it over explicitly.
    # When it is unset, the client's own default key lookup is left in place.
    if openai_key:
        openai.api_key = openai_key

else:
    raise ValueError("Not supported model")

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.45s/it]


In [10]:
def answer(x, config):
    """Send the prompt parts in ``x`` to the configured backend and return its text.

    Args:
        x: Sequence of prompt strings.
            * 'lamma2': all parts are joined with newlines into one prompt.
            * 'gpt3.5': parts 0-2 (newline-joined) become the system message,
              ``x[3]`` the assistant message and ``x[4]`` the user message, so
              at least five parts are required.
        config: Object exposing ``model`` plus ``lamma_config`` or
            ``openai_config``; the latter two are expanded as keyword arguments
            to the respective backend call.

    Returns:
        The generated text. For 'lamma2' every occurrence of the prompt is
        removed from the pipeline's ``generated_text``.

    Raises:
        ValueError: If ``config.model`` is neither 'lamma2' nor 'gpt3.5'.
    """
    if config.model == 'lamma2':
        prompt = '\n'.join(x)
        # `pipeline` is the module-level text-generation pipeline built in the
        # model set-up cell.
        sequences = pipeline(prompt, **config.lamma_config)
        return sequences[0]['generated_text'].replace(prompt, "")

    if config.model == 'gpt3.5':
        # Local import: the notebook's import cell does not import openai, and
        # the llama path should not require the package.
        # NOTE(review): the API key must already be configured on the openai
        # module (or via its default environment variable) before this call.
        import openai

        messages = [
            {"role": "system", "content": '\n'.join(x[:3])},
            {"role": "assistant", "content": x[3]},
            {"role": "user", "content": x[4]}
        ]

        response = openai.ChatCompletion.create(**config.openai_config, messages=messages)
        # Chat completions carry the reply under choices[i].message.content;
        # the 'text' field only exists on the legacy (non-chat) Completion API.
        return response['choices'][0]['message']['content']

    raise ValueError("Not a supported model")

In [8]:
def evaluate_text(article, prompt_dict, config, max_output_len=500):
    """Query the model about ``article`` until it gives an unambiguous verdict.

    Each attempt builds a prompt with ``generate_input_text``, obtains a
    completion via ``answer`` and normalises it with ``cleaning_text`` (both
    helpers are defined outside this cell). An attempt is discarded when the
    cleaned output is ``max_output_len`` characters or longer, or when it
    contains both or neither of the literals "True" / "False".

    Args:
        article: Article text to classify.
        prompt_dict: Prompt definitions forwarded to ``generate_input_text``.
        config: Configuration object; ``config.time_limit`` is the retry budget
            in seconds and the object is forwarded to ``answer``.
        max_output_len: Cleaned outputs of this length or more are rejected
            and the model is queried again.

    Returns:
        ``(True, cleaned_output)`` or ``(False, cleaned_output)`` for a decisive
        answer, or ``(None, None)`` if the time budget ran out first (including
        a non-positive ``time_limit``, in which case the model is never called).
    """
    # The deadline is evaluated in the loop condition so that *every* path
    # through the body -- including the `continue` for over-long outputs --
    # is bounded by the time limit. monotonic() is immune to wall-clock jumps.
    deadline = time.monotonic() + config.time_limit

    while time.monotonic() < deadline:
        # Generate input and get the model's answer
        input_text = generate_input_text(prompt_dict, article)
        gen_text = answer(input_text, config)

        cleaned_output = cleaning_text(gen_text)
        if len(cleaned_output) >= max_output_len:
            continue

        # Accept only when exactly one of the two literals is present.
        has_true = "True" in cleaned_output
        has_false = "False" in cleaned_output
        if has_true != has_false:
            return has_true, cleaned_output

    # Reached time limit without a definitive answer
    return None, None

In [11]:
# Both dictionaries are keyed by the DataFrame index of the article:
#   result_dict         -> "<Category>: <True|False|None>"
#   cleaned_output_dict -> cleaned model output behind that verdict
result_dict, cleaned_output_dict = {}, {}

for index, row in dummy_text.iterrows():
    # Remove every character that is neither a word character nor whitespace,
    # then trim leading/trailing whitespace.
    article = re.sub(r'[^\w\s]', '', row.Content)
    article = article.strip()

    # Ask the model for a verdict on this article.
    output, cleaned_output = evaluate_text(article, prompt_dict, config)

    cleaned_output_dict[index] = cleaned_output
    result_dict[index] = f'{row.Category}: {output}'



In [12]:
# Show the verdict per article as "<Category>: <True|False|None>".
result_dict

{0: 'Travel: False',
 1: 'Technology: False',
 2: 'Science: False',
 3: 'Food and Cooking: False',
 4: 'Literature: False',
 5: 'Personal Development: False',
 6: 'Entertainment: False',
 7: 'Environmental Issues: False',
 8: 'Art and Creativity: False',
 9: 'Automotive: True',
 10: 'Automotive: True',
 11: 'Automotive: True'}

In [13]:
# Show the cleaned model output each verdict was derived from
# (None where no verdict was reached within the time limit).
cleaned_output_dict

{0: 'False This article does not relate to the car business because there is no mention of cars or any other transportationrelated topics',
 1: 'The text provided does not directly relate to the Automotives Domain It discusses Artificial intelligences application in the healthcare sector specifically focusing on imaging diagnostics clinical support systems drug discovery and personalized medicine While AI may have some indirect connections to autonomous vehicles or other automotivetechnologies there is no direct relationship between the content and the automobile industry Therefore my response is False',
 2: 'False There is no mention of any aspect of the automobile industry in the provided text It discusses space explorations which does not fall under the purview of the auto mobile industry',
 3: 'False The provided text does not relate to the car business or any other aspect of the automobile sector It discusses various plantbased cuisine recipes ingredients and cooking methods',
 4: