In [None]:
from torch import cuda
# Choose the compute target once: the torch device string plus the matching
# integer index (0 when CUDA is available, -1 otherwise) that is later handed
# to runModels as its `processor` argument.
device, deviceNum = ('cuda', 0) if cuda.is_available() else ('cpu', -1)
device


# Preparing Data

In [None]:
import pandas as pd
# Here the data is prepared

# Folder that holds the codes spreadsheet and the per-story .txt files
dataset = 'mba'
data = pd.read_excel(f'{dataset}/codes.xlsx')

# Show the first spreadsheet column (used below as the story identifier)
print(data.iloc[:, 0])

def load_file(path):
    """Read a text file and collapse it into one space-separated line.

    Every line is stripped of surrounding whitespace, blank lines are
    dropped, and the remaining lines are joined with single spaces.

    Args:
        path: Path of the text file to read.

    Returns:
        The file's non-blank lines joined by ' ' ('' for an empty file).

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    # The context manager closes the handle even when reading fails.
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return ' '.join(line for line in stripped if line != '')

import os

def load_file_if_exists(filepath):
    """Return load_file(filepath), or None when nothing exists at `filepath`."""
    if not os.path.exists(filepath):
        return None
    return load_file(filepath)

# Largest value in the 'Story number' column
max_id = data['Story number'].max()

# Map story identifier (first spreadsheet column) -> story text, keeping only
# the stories whose '<dataset>/<id>.txt' file exists. load_file_if_exists
# already returns None for a missing file, so each path is built and checked
# once instead of twice.
texts = {
    story_id: story_text
    for story_id, story_text in (
        (x, load_file_if_exists('{}/{}.txt'.format(dataset, str(x))))
        for x in data.iloc[:, 0]
    )
    if story_text is not None
}

data

# Determining Best Model

In [None]:
# Checkpoints to compare; each entry is passed by runModels as the `model`
# of a "zero-shot-classification" pipeline. Commented-out entries are
# currently disabled.
# NOTE(review): neither active checkpoint looks NLI-fine-tuned, which the
# zero-shot pipeline normally expects — confirm this choice is intentional.
models = [
    'bert-base-uncased',
    'distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    # 'FacebookAI/roberta-base',
    # 'google/electra-base-discriminator',
    # 'tkharisov7/aes-ielts',
    # 'google-t5/t5-base'
]

In [None]:
# Module-level copy of every accuracy computed by runModels (it writes one
# entry per model); presumably kept so scores survive an interrupted run.
accStoresBackup = dict()

In [None]:
# I had to run the following linux command to get llama to use GPU
# CUDACXX=/usr/bin/nvcc CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all-major" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir --force-reinstall --upgrade
#         ^^^^^^^^^^^^^  = your path to nvcc (nvidia cuda compiler)
from llama_cpp import Llama

# llama = "./model/Meta-Llama-3-8B-Instruct.Q2_K.gguf" # weaker but faster
llama = "./model/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"  # stronger but slower
# Load the GGUF weights with a 5096-token context window.
# NOTE(review): no n_gpu_layers argument is passed here — confirm the model
# is actually offloaded to the GPU as the install note above intends.
LLM = Llama(
    model_path=llama,
    n_ctx=5096,
)


def llamaGuess(text, labels):
    """Ask the loaded Llama model which of `labels` best describes `text`.

    Args:
        text: The passage to classify.
        labels: Iterable of label strings offered to the model.

    Returns:
        The raw completion text of the first choice, returned unmodified.
        Generation is limited to 10 tokens, uses temperature 0 and stops at
        the first ".".
    """
    # Instruction placed before the quoted passage
    opening = "These are the sentiment labels: " + ", ".join(labels) + ". Which one of the sentiment labels best describes the following text: \""
    # Instruction placed after the quoted passage
    closing = "\" Only have the one best sentiment label in the response, nothing else. The one best sentiment label: "

    completion = LLM(opening + text + closing, max_tokens=10, temperature=0, stop=".")
    return completion["choices"][0]["text"]


In [None]:
from transformers import pipeline
from sklearn.metrics import accuracy_score

def runModels(models, texts, data, processor=-1, debug=0):
    """Score zero-shot-classification models by top-1 accuracy on labelled stories.

    Args:
        models: Iterable of model names handed to ``pipeline``.
        texts: Mapping from story identifier (the value in the first column
            of ``data``) to the story text.
        data: DataFrame whose first column is the story identifier and whose
            remaining columns are one label each; a 1 marks that the label
            applies to that row.
        processor: Passed to ``pipeline`` as ``device`` (default -1).
        debug: 0 = quiet, >=1 = per-model progress, >=2 = per-story results.

    Returns:
        Dict mapping each model name to its accuracy. A prediction counts as
        correct when the top-ranked label is any of the row's gold labels;
        otherwise the row's first gold label is used as the reference.
        Rows without a gold label or without text are reported and skipped.

    Side effects:
        Every score is also stored in the module-level ``accStoresBackup``.
    """
    # Candidate labels are all columns after the identifier column; a plain
    # list is passed to the pipeline rather than a pandas Index.
    labels = list(data.columns[1:])

    # Represent labels as integers (position within `labels`)
    labelIndices = {label: index for index, label in enumerate(labels)}
    if debug >= 1: print(labelIndices)

    storyIds = data.iloc[:, 0]
    accScores = {}

    # One accuracy score per model
    for model in models:
        pipe = pipeline("zero-shot-classification", model=model, device=processor)

        golds = []
        guesses = []
        if debug >= 1: print("RUNNING", model)
        for i in range(len(data)):
            # Positional access keeps this correct for any DataFrame index.
            row = data.iloc[i, 1:]
            corrects = [j for j, flag in enumerate(row) if flag == 1]
            if len(corrects) == 0:
                print("DATA ERROR AT INDEX", i,"- NO CORRECTS")
                continue

            storyId = storyIds.iloc[i]
            text = texts.get(storyId)
            if not text:
                # No usable text for this story: report and skip it, the same
                # way rows without gold labels are handled.
                print("DATA ERROR AT INDEX", i, "- NO TEXT FOR STORY", storyId)
                continue

            # Classify the whole story text and keep the top-ranked label
            results = pipe(text, candidate_labels=labels)
            topChoice = labelIndices[results['labels'][0]]

            if topChoice in corrects:
                # Any gold label counts as a hit
                gold = topChoice
                if debug >= 2: print("\tstory", i, "- correct: ", topChoice)
            else:
                # A miss is recorded against the first gold label only
                gold = corrects[0]
                if debug >= 2: print("\tstory", i, "- INcorrect: ", topChoice)
            golds.append(gold)
            guesses.append(topChoice)

        # accuracy_score takes (y_true, y_pred)
        finalAcc = accuracy_score(golds, guesses)
        if debug >= 1: print("accuracy score for", model, "-", finalAcc)

        accScores[model] = finalAcc
        accStoresBackup[model] = finalAcc

    return accScores


    
    

In [None]:
# Evaluate every candidate model on the selected device with per-story logging
scores = runModels(models, texts, data, processor=deviceNum, debug=2)
scores

In [None]:
# Plain-text view of the model -> accuracy mapping returned by runModels
print(scores)

In [None]:
# Pick the model with the highest accuracy. max() over a dict compares its
# keys, so the scores are supplied as the ranking key; without it the
# alphabetically greatest model name would be selected instead.
best = max(scores, key=scores.get)
best

# Fine-Tuning Best Model

In [None]:
import transformers
from transformers import AutoTokenizer,AutoModel

import torch
import tensorflow as tf

import numpy as np

# Load the tokenizer and model for the checkpoint selected as `best`
model_id = best
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

# List the physical devices TensorFlow reports
all_devices = tf.config.list_physical_devices()
all_devices

In [None]:
# Turn each story into one fixed-length feature vector: the model's first
# output tensor for 10 token positions, flattened position by position.
vectors = []

# Inference only: no_grad avoids building an autograd graph for every story.
with torch.no_grad():
    for t in data.iloc[:, 0]:
        # padding="max_length" keeps every sample at exactly 10 positions, so
        # stories shorter than that still produce vectors of the same length.
        tokenized = tokenizer(
            texts[t],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=10,
        )
        outputs = model(**tokenized)
        # outputs[0][0] is the first output tensor for the single batch entry;
        # flatten() concatenates its rows in order into a 1-D tensor, and
        # iterating that tensor yields one scalar tensor per feature.
        vectors.append(outputs[0][0].flatten())

len(vectors)

In [None]:
# Build the NumPy feature matrix (one row per sample) that is fed to Keras.
# Each torch element is detached and moved to the CPU before conversion;
# NumPy then stacks the rows directly, so no intermediate TensorFlow tensor
# is needed.
tensors = np.array([
    [item.detach().cpu().numpy() if isinstance(item, torch.Tensor) else item for item in vector]
    for vector in vectors
])
print(tensors.shape)

In [None]:
# Share of the samples, in percent, that goes into the TRAINING split
testPercentage = 70
# Number of training samples = index of the first held-out sample
split = round(len(tensors) * (testPercentage / 100))

# Label columns (everything after the first column), cut at the same index
y_train, y_test = data.iloc[:split, 1:].values, data.iloc[split:, 1:].values
print(y_train.shape)
print(y_test.shape)

In [None]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential

# Small feed-forward classifier on top of the flattened features. The input
# and output widths are read from the data instead of being hard-coded, so
# the network still fits if the feature length or the label count changes.
nn = Sequential()
nn.add(Dense(400, input_shape=(tensors.shape[1],), activation='relu'))
nn.add(Dropout(0.2))
# One sigmoid unit per label column
nn.add(Dense(y_train.shape[1], activation='sigmoid'))

In [None]:
# Adam optimiser, binary cross-entropy loss, accuracy reported as the metric
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the first `split` samples and validate on the remainder
nn.fit(
    tensors[:split],
    y_train,
    validation_data=(tensors[split:], y_test),
    epochs=20,
    batch_size=10,
    verbose=1,
    shuffle=True,
)


In [None]:
# Loss and accuracy on the held-out samples (everything from index `split` on)
loss, accuracy = nn.evaluate(tensors[split:], y_test)
accuracy