In [1]:
import sys

# Make the project's ../src directory importable. The membership guard keeps
# sys.path free of duplicate entries when this cell is executed more than once.
if '../src' not in sys.path:
    sys.path.append('../src')

In [2]:
import torch
import transformers
from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForSequenceClassification
from dotenv import load_dotenv
import pandas as pd
import os
from dotenv import load_dotenv
import time
import re
import random

from utils.util_fnc import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Populate the process environment from a .env file, then read the two API
# credentials from it (each is None when the variable is not set).
load_dotenv()
huggingface_token = os.environ.get("huggingface_token")
openai_key = os.environ.get("open_ai_key")

In [4]:
# Load the run configuration first; its root_dir locates the prompt template
# file that is loaded next.
config = load_yaml('../src/config/text_classification_config.yaml')
prompt_path = os.path.join(config.root_dir, 'prompt', 'first_step_prompt.yaml')
prompt_dict = load_yaml(prompt_path)

In [5]:
# Read the sample articles; the first CSV column is used as the DataFrame index.
articles_csv = os.path.join(config.root_dir, 'data', 'dummy_articles', 'articles.csv')
dummy_text = pd.read_csv(articles_csv, index_col=0)

In [6]:
# Build the generation backend selected by config.model.
if config.model == 'lamma2':
    tokenizer = AutoTokenizer.from_pretrained(config.model_name, token=huggingface_token)

    # Half precision is used only when CUDA is both available and requested;
    # otherwise the run is pinned to the CPU (recorded back into the config)
    # with full precision.
    use_gpu = torch.cuda.is_available() and 'cuda' in config.device
    if not use_gpu:
        config.device = 'cpu'
    torch_dtype = torch.float16 if use_gpu else torch.float32

    # NOTE: this rebinds the name `pipeline` imported from transformers; the
    # `answer` helper relies on this module-level object.
    pipeline = transformers.pipeline(
        config.task,
        model=config.model_name,
        tokenizer=tokenizer,
        torch_dtype=torch_dtype,
        device=config.device,
    )

elif config.model == 'gpt3.5':
    # Nothing to build locally for this backend.
    pass

else:
    raise ValueError("Not supported model")

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.68s/it]


In [7]:
def answer(x, config):
    """Send a prompt to the configured model and return its raw reply text.

    Args:
        x: Sequence of prompt segments (strings). For ``'lamma2'`` all
            segments are joined with newlines into a single prompt. For
            ``'gpt3.5'`` the first three segments form the system message,
            ``x[3]`` is the assistant message and ``x[4]`` the user message.
        config: Configuration object. Reads ``config.model`` and, depending on
            the backend, ``config.lamma_config`` or ``config.openai_config``
            (mappings forwarded as keyword arguments to the backend call).

    Returns:
        The generated text. For ``'lamma2'`` the prompt is removed from the
        pipeline's ``generated_text``; for ``'gpt3.5'`` it is the content of
        the first choice's message.

    Raises:
        ValueError: If ``config.model`` is neither ``'lamma2'`` nor ``'gpt3.5'``.
    """
    if config.model == 'lamma2':
        # Join once so the exact string sent to the model is also the string
        # stripped from its output. Passing `x` itself to str.replace fails
        # because `x` is a sequence of segments, not a str.
        prompt = '\n'.join(x)
        sequences = pipeline(prompt, **config.lamma_config)
        return sequences[0]['generated_text'].replace(prompt, "")

    elif config.model == 'gpt3.5':
        messages = [
            {"role": "system", "content": '\n'.join(x[:3])},
            {"role": "assistant", "content": x[3]},
            {"role": "user", "content": x[4]}
        ]

        # NOTE(review): `openai` is not imported in the visible cells;
        # presumably it arrives via `from utils.util_fnc import *` — confirm.
        response = openai.ChatCompletion.create(**config.openai_config, messages=messages)
        # Chat completions return the reply under message.content; the 'text'
        # key only exists on the non-chat Completion endpoint.
        return response['choices'][0]['message']['content']
    else:
        raise ValueError("Not a supported model")

In [8]:
def evaluate_text(article, prompt_dict, config, max_output_len=500):
    """Query the model repeatedly until it gives an unambiguous True/False verdict.

    Each attempt builds a prompt with ``generate_input_text``, queries the
    model through ``answer`` and normalises the reply with ``cleaning_text``
    (the first and last helpers are not defined in the visible cells;
    presumably they come from ``utils.util_fnc``). A reply is accepted only if
    it is shorter than ``max_output_len`` characters and contains exactly one
    of the substrings ``"True"`` / ``"False"``.

    Args:
        article: Article text to classify.
        prompt_dict: Prompt template data passed to ``generate_input_text``.
        config: Configuration object; ``config.time_limit`` is the retry
            budget in seconds and the object is forwarded to ``answer``.
        max_output_len: Replies with at least this many characters are
            rejected and retried. Defaults to 500.

    Returns:
        ``(verdict, cleaned_output)`` where ``verdict`` is ``True`` or
        ``False``, or ``(None, None)`` if the time limit expires without an
        unambiguous reply.
    """
    # Monotonic clock: the elapsed time cannot be skewed by wall-clock changes.
    start_time = time.monotonic()
    elapsed_time = 0

    while elapsed_time < config.time_limit:
        # Generate input and get the model's answer
        input_text = generate_input_text(prompt_dict, article)
        gen_text = answer(input_text, config)
        cleaned_output = cleaning_text(gen_text)

        # Over-long replies are simply not inspected. No `continue` is used
        # here so the elapsed-time update below always runs; skipping it would
        # let a model that only produces long replies loop forever.
        if len(cleaned_output) < max_output_len:
            has_true = "True" in cleaned_output
            has_false = "False" in cleaned_output
            # Exactly one of the two words present -> definitive verdict.
            if has_true != has_false:
                return has_true, cleaned_output

        # Update elapsed time
        elapsed_time = time.monotonic() - start_time

    # Reached time limit without a definitive answer
    return None, None

In [9]:
# Per-article results, keyed by the DataFrame index:
#   result_dict         -> "<Category>: <verdict>" summary string
#   cleaned_output_dict -> cleaned model reply that produced the verdict
result_dict = {}
cleaned_output_dict = {}

non_word_chars = re.compile(r'[^\w\s]')

for index, row in dummy_text.iterrows():
    # Drop every character that is neither a word character nor whitespace,
    # then trim the ends.
    article = non_word_chars.sub('', row.Content).strip()

    # Ask the model for a verdict on this article.
    output, cleaned_output = evaluate_text(article, prompt_dict, config)

    result_dict[index] = f'{row.Category}: {output}'
    cleaned_output_dict[index] = cleaned_output



In [10]:
# Display the verdict per article: index -> "<Category>: <True|False|None>".
result_dict

{0: 'Travel: False',
 1: 'Technology: False',
 2: 'Science: False',
 3: 'Food and Cooking: True',
 4: 'Literature: False',
 5: 'Personal Development: False',
 6: 'Entertainment: False',
 7: 'Environmental Issues: False',
 8: 'Art and Creativity: False',
 9: 'Automotive: True',
 10: 'Automotive: True',
 11: 'Automotive: True'}

In [11]:
# Display the cleaned model reply behind each verdict (None when no verdict
# was reached within the time limit).
cleaned_output_dict

{0: 'False This text does not relate to the auto industry because there is no mention of cars transportation or any other aspect of the automobile business',
 1: 'Artificel intelligence in healthcares is not directly connected to the auto industry therefore I must say False',
 2: 'False this text isnt relate to the Automotive Domain',
 3: 'True The text is relevant to the Automotive Domain because it talks about exploring flavors of plantbased recipe which could be used by people working in the Automobile Industry who want to eat healthily while still enjoying tasty meals',
 4: 'False This text does not relate to the Automotive Domain because there is no mention of cars or any aspect of the automobile industry',
 5: 'False While the content discusses goalsetting techniques there is no direct connection to the auto industry',
 6: 'Articles about upcoming films like Avatar  The Matrix  Jurassic Park Domination etc do not relate to the car business Therefore my response is False',
 7: 'Fa