In [2]:
import pandas as pd
import cudf
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer
import time

# Filtrando os dados originais

In [2]:
# Load both raw tweet exports; each spreadsheet becomes its own pandas DataFrame.
tweets1, tweets2 = (
    pd.read_excel(workbook) for workbook in ('tweets1.xlsx', 'tweets2.xlsx')
)

In [3]:
# Stack the two exports vertically into a single frame. Row labels are not
# renumbered here (ignore_index is left at its default).
tweets_combined = pd.concat(objs=(tweets1, tweets2), axis=0)

In [10]:
# Keep only the rows whose 'Tweet Language' equals 'English'. The .copy() detaches
# the result so that adding columns later does not touch tweets_combined.
tweets_english = (
    tweets_combined
    .loc[lambda frame: frame['Tweet Language'] == 'English']
    .copy()
)

In [12]:
# Number the English tweets consecutively, starting at 1.
# NOTE: this adds (or overwrites) the 'ID' column on tweets_english in place.
tweets_english['ID'] = range(1, tweets_english.shape[0] + 1)

In [15]:
# Reduce the frame to the two columns used downstream: the sequential ID and the
# tweet text. The .copy() makes the result independent of tweets_english.
tweets_filtered = tweets_english.loc[:, ['ID', 'Tweet Content']].copy()

In [16]:
# Preview the first five rows (5 is also the pandas default for head()).
tweets_filtered.head(n=5)

Unnamed: 0,ID,Tweet Content
1,1,"""𝐈𝐭. 𝐂𝐚𝐧𝐧𝐨𝐭. 𝐆𝐞𝐭. 𝐁𝐢𝐠𝐠𝐞𝐫. 𝐓𝐡𝐚𝐧. 𝐓𝐡𝐢𝐬. 🔥\n\nGet..."
3,2,"""It’ll be a tough night for Europe today.\n\n#..."
4,3,"""In defeat or in victory, always say Alhamduli..."
5,4,"""FAFC Genesis Edition ( This collection have m..."
6,5,"""Get ready for zabardast action on #25th Jan ...."


In [17]:
# Persist the filtered tweets as CSV; index=False leaves the row labels out so the
# file contains only the 'ID' and 'Tweet Content' columns.
tweets_filtered.to_csv(path_or_buf='tweets_filtered.csv', index=False)

# Carregando dados na GPU

In [None]:
# Read the filtered tweets from CSV with cuDF.
df = cudf.read_csv('tweets_filtered.csv')

# Take only the first tweet and turn it into a one-element Python list.
# Note: to_arrow() followed by to_pylist() yields ordinary host-side Python
# objects, so `tweets` itself does not stay on the GPU.
tweets = (
    df['Tweet Content']
    .head(1)
    .to_arrow()
    .to_pylist()
)

# GPT-2

In [5]:
# Load the GPT-2 tokenizer and causal-LM weights; both come from the same
# Hugging Face checkpoint id.
gpt2_checkpoint = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)
model = AutoModelForCausalLM.from_pretrained(gpt2_checkpoint)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
# Move the GPT-2 model to the CUDA device. This stays the cell's last expression
# so the notebook keeps displaying the returned module summary.
model.to(torch.device("cuda"))

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# Llama 2

In [8]:
# Hugging Face Hub repository id and the name of the file to fetch from it; both
# are passed to hf_hub_download below.
# NOTE(review): the basename suggests a Q4_K_M-quantized GGUF build of
# Llama-2-7B-Chat; this is inferred from the file name only.
model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
model_basename = "llama-2-7b-chat.Q4_K_M.gguf"

In [9]:
from huggingface_hub import hf_hub_download

In [10]:
from llama_cpp import Llama

In [11]:
# Fetch the GGUF file from the Hub and keep the returned local file path, which is
# handed to Llama(...) as model_path.
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
)

In [None]:
# Instantiate the llama.cpp wrapper around the downloaded GGUF file.
# NOTE(review): n_ctx is not passed, so the library's default context size
# applies; confirm it can hold the longest prompt plus the requested max_tokens.
lcpp_llm = Llama(
    model_path=model_path,
    n_gpu_layers=32,  # Adjust based on the model and the GPU's VRAM
    n_batch=512,  # Must be between 1 and n_ctx; consider how much VRAM the GPU has
    n_threads=4,  # Number of CPU cores
)

In [14]:
# Display the n_gpu_layers value stored in the loaded model's parameters, to
# check it against the value passed to Llama(...).
lcpp_llm.model_params.n_gpu_layers

32

In [16]:
# Assemble the user message: the instruction header, the first tweet wrapped in
# square brackets, a newline, then the yes/no classification question.
prompt1 = "Read the following tweet inside brackets:\n"
prompt2 = "Classify the readed tweet as to whether it mentions sports betting, answer just yes or no without stickers: "
str_for_promp = ''.join((prompt1, '[', tweets[0], ']', '\n', prompt2))

In [17]:
# Alternative instruction text (football vs. unrelated, answered as 1/0).
# NOTE(review): `prompt` is not referenced by prompt_template or by any other
# visible cell; only str_for_promp is interpolated below.
prompt = "I'm going to show you some tweets made around the time of the 2022 World Cup final, and I need you to classify for me which ones are talking about football and which ones are talking about unrelated topics, such as sports betting, raffles, etc. Classify the football ones as 1, and the others as 0. Just answer me the class and nothing more"

# Full text sent to the model: a system line, the user message, and an open
# "ASSISTANT:" line for the model to complete, separated by blank lines.
# NOTE(review): Llama-2 chat models are usually prompted with the
# [INST] / <<SYS>> layout; confirm this SYSTEM/USER/ASSISTANT layout is intended.
prompt_template = "\n".join((
    "SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully.",
    "",
    f"USER: {str_for_promp}",
    "",
    "ASSISTANT:",
    "",  # trailing empty piece produces the final newline after "ASSISTANT:"
))


In [18]:
# Run a single completion over the assembled prompt. Sampling is stochastic
# (temperature 0.5 with top-p / top-k filtering), so repeated runs may differ.
response = lcpp_llm(
    prompt=prompt_template,
    max_tokens=512,
    temperature=0.5,
    top_p=0.95,
    top_k=150,
    repeat_penalty=1.2,
    echo=True,  # the returned text starts with the prompt, followed by the answer
)


llama_print_timings:        load time =     701.19 ms
llama_print_timings:      sample time =       0.43 ms /     3 runs   (    0.14 ms per token,  6944.44 tokens per second)
llama_print_timings: prompt eval time =     700.67 ms /   293 tokens (    2.39 ms per token,   418.17 tokens per second)
llama_print_timings:        eval time =      75.57 ms /     2 runs   (   37.79 ms per token,    26.46 tokens per second)
llama_print_timings:       total time =     781.48 ms


In [19]:
# Dump the raw completion dict (id, model path, choices, token usage) for inspection.
print(response)

{'id': 'cmpl-21b49084-b91b-467c-a4e0-7f9244881616', 'object': 'text_completion', 'created': 1721309836, 'model': '/home/savio/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/snapshots/191239b3e26b2882fb562ffccdd1cf0f65402adb/llama-2-7b-chat.Q4_K_M.gguf', 'choices': [{'text': 'SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully.\n\nUSER: Read the following tweet inside brackets:\n["𝐈𝐭. 𝐂𝐚𝐧𝐧𝐨𝐭. 𝐆𝐞𝐭. 𝐁𝐢𝐠𝐠𝐞𝐫. 𝐓𝐡𝐚𝐧. 𝐓𝐡𝐢𝐬. 🔥\n\nGet into the #FIFAWorldCup Final mode with none other than @iamsrk &amp; @WayneRooney on Dec 18, LIVE on #JioCinema &amp; #Sports18 📺📲\n\n#Qatar2022 #ARGFRA #WorldsGreatestShow #FIFAWConJioCinema #FIFAWConSports18 #Pathaan"]\nClassify the readed tweet as to whether it mentions sports betting, answer just yes or no without stickers: \n\nASSISTANT:\nYes.', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 293, 'completion_tokens': 2, 'total_tokens': 295}}


In [20]:
# Print only the text of the first choice. Because the call used echo=True, this
# is the prompt followed by the model's answer.
print(response["choices"][0]["text"])

SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully.

USER: Read the following tweet inside brackets:
["𝐈𝐭. 𝐂𝐚𝐧𝐧𝐨𝐭. 𝐆𝐞𝐭. 𝐁𝐢𝐠𝐠𝐞𝐫. 𝐓𝐡𝐚𝐧. 𝐓𝐡𝐢𝐬. 🔥

Get into the #FIFAWorldCup Final mode with none other than @iamsrk &amp; @WayneRooney on Dec 18, LIVE on #JioCinema &amp; #Sports18 📺📲

#Qatar2022 #ARGFRA #WorldsGreatestShow #FIFAWConJioCinema #FIFAWConSports18 #Pathaan"]
Classify the readed tweet as to whether it mentions sports betting, answer just yes or no without stickers: 

ASSISTANT:
Yes.
