In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import warnings
warnings.filterwarnings("ignore")
import textwrap
import torch

In [2]:
# load the dataset
with open('/kaggle/input/imdb-corpus/corpus.txt','r',encoding="utf8") as f:
    document = f.readlines()
f.close()

labels, texts = [], []
for line in document:
    content = line.split()
    label = content[0]
    labels.append(label[-1])
    texts.append(" ".join(content[1:]))

print(len(labels), len(texts))

10000 10000


In [3]:
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train, y_test  = train_test_split(texts, labels, train_size=0.75,random_state=1234)

In [4]:
from transformers import pipeline


In [5]:
model_path = "HuggingFaceH4/zephyr-7b-beta" 
tokenizer = AutoTokenizer.from_pretrained(model_path)
pipe = pipeline("text-generation", model=model_path, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
def get_completion(prompt):
    messages = [{
        "role": "user", 
        "content": prompt }]
    prompt2 = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt2, max_new_tokens=400, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    return outputs[0]["generated_text"]

In [7]:
zephyr_pred = []
for i, s in enumerate(X_test2):
    if (i % 100) == 0:
        print(i)
    text = s
    prompt = f"""
    What is the sentiment of the following text, which is delimited with triple backticks?

    Give your answer as a single word, either "positive" or "negative".
    Review text: '''{text}'''
    """
    response = get_completion(prompt)
    zephyr_pred.append(response)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400


In [8]:
from sklearn.metrics import accuracy_score

In [10]:
zephyr_pred[0]

'<|user|>\n\n    What is the sentiment of the following text, which is delimited with triple backticks?\n\n    Give your answer as a single word, either "positive" or "negative".\n    Review text: \'\'\'Poly tape: Does a great job highly visible. Works well with the different fencers that I use in the different pastures. Last a long time. Can\'t beat it for the price.\'\'\'\n    </s>\n<|assistant|>\nThe sentiment of the review text delimited with triple backticks is "positive". Keywords such as "great", "highly visible", "works well", and "lasts a long time" indicate a favorable opinion of the product. The reviewer also mentions that the product is "can\'t beat it for the price", implying that it is an excellent value. Overall, the sentiment is positive.'

In [26]:
sentiments = []
for zp in zephyr_pred:    
    response = zp
    abc = response.split('<|assistant|>')
    s = abc[1]
    if 'positive' in s.lower():
        sentiments.append('2')
    else:
        sentiments.append('1')

In [27]:
sentiments[0:10]
  

['2', '1', '2', '1', '2', '2', '1', '2', '1', '1']

In [28]:
accuracy_score(y_test, sentiments)

0.9224