To test single models:

In [None]:
!pip install transformers
!pip install sentence_transformers

import torch
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

class NeuralNetwork(nn.Module):
    """Fully connected binary classifier over fixed-size input vectors.

    Nine linear layers narrow the input from ``input_size`` features down to
    a single unit.  ReLU follows every hidden layer and a sigmoid follows the
    last one, so every output value lies between 0 and 1.
    """

    def __init__(self, input_size):
        """Build the layer stack for inputs with ``input_size`` features."""
        super().__init__()
        widths = [input_size, 2048, 2048, 2048, 1024, 512, 128, 64, 32]
        layers = []
        # Hidden layers are appended in order, so every module keeps the
        # position it would have in a hand-written Sequential.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], 1))
        layers.append(nn.Sigmoid())
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the sigmoid output."""
        return self.linear_relu_stack(x)



# Tokenizer/model pairs that were already loaded in this process, keyed by
# checkpoint name, so repeated calls do not reload the weights from disk.
_ROBERTA_CACHE = {}


def long_roberta(sentences):
    """Embed text with ``sentence-transformers/all-roberta-large-v1``.

    The token embeddings of every input are averaged (padding positions are
    excluded through the attention mask) and the resulting vectors are
    L2-normalised.

    Args:
        sentences: A string or a list of strings; it is passed to the
            tokenizer unchanged, with padding and truncation enabled.

    Returns:
        A 2-D tensor with one normalised embedding per row.
    """
    def mean_pooling(model_output, attention_mask):
        # First element of model_output contains all token embeddings.
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp prevents a division by zero for an all-padding row.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    checkpoint = 'sentence-transformers/all-roberta-large-v1'
    if checkpoint not in _ROBERTA_CACHE:
        # Loading is by far the most expensive step; do it once per process
        # instead of once per call.
        _ROBERTA_CACHE[checkpoint] = (
            AutoTokenizer.from_pretrained(checkpoint),
            AutoModel.from_pretrained(checkpoint),
        )
    tokenizer, model = _ROBERTA_CACHE[checkpoint]

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings without tracking gradients
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool over tokens, then normalise each embedding to unit length
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return F.normalize(sentence_embeddings, p=2, dim=1)

def try_model(model):
    """Read one sentence from stdin and print the score ``model`` gives it.

    The sentence is embedded with ``long_roberta`` before it is handed to
    ``model``; the single resulting value is printed, followed by a legend.
    """
    sentence = input('Please enter your input sentence: ')
    score = model(long_roberta(sentence))
    print(score.item())
    print('Where 1 is about the topic and 0 is not.\n')


# Load both saved classifiers from the working directory and query them in
# turn, forever (the loop only ends when the process is interrupted).
biology = torch.load('model_biology.pt')
chemistry = torch.load('model_chemistry.pt')
while True:
    for heading, classifier in (('BIOLOGY:', biology), ('CHEMISTRY:', chemistry)):
        print(heading)
        try_model(classifier)

This is the topic identification code. It is working fine.

In [None]:
print('starting.')
!pip install transformers
!pip install sentence_transformers
!pip install torchmetrics
# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import math
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import pandas as pd
import openai
from transformers import AutoTokenizer, AutoModel
from torchmetrics import R2Score
import sys
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

import random
import seaborn as sns
from IPython.display import HTML, display
from wordcloud import WordCloud
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import os
import pickle
from datetime import datetime as d
import math

openai.api_key = os.getenv('OPENAI_API_KEY')


class CustomTopicDataset(Dataset):
    """Map-style dataset that pairs each sentence embedding with its label."""

    def __init__(self, sentences, labels):
        """Store inputs and labels and record the sizes read from the inputs.

        ``length`` is the size of the first axis of ``sentences``; ``shape``
        is the size of the first axis of a single item.
        """
        self.x, self.y = sentences, labels
        self.length, self.shape = self.x.shape[0], self.x[0].shape[0]
        self.feature_names = ['sentences', 'labels']

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at ``idx``."""
        sample, target = self.x[idx], self.y[idx]
        return sample, target


class NeuralNetwork(nn.Module):
    """Feed-forward topic classifier applied to sentence embeddings.

    The stack consists of nine linear layers (``input_size`` -> 2048 -> 2048
    -> 2048 -> 1024 -> 512 -> 128 -> 64 -> 32 -> 1) with ReLU between them
    and a final sigmoid, so the output is a value between 0 and 1.
    """

    def __init__(self, input_size):
        """Create the layers for inputs with ``input_size`` features."""
        super().__init__()
        sizes = (input_size, 2048, 2048, 2048, 1024, 512, 128, 64, 32)
        modules = []
        # One Linear + ReLU pair per consecutive pair of sizes, in order.
        for n_in, n_out in zip(sizes, sizes[1:]):
            modules += [nn.Linear(n_in, n_out), nn.ReLU()]
        modules += [nn.Linear(sizes[-1], 1), nn.Sigmoid()]
        self.linear_relu_stack = nn.Sequential(*modules)

    def forward(self, x):
        """Return the sigmoid output of the stack for the batch ``x``."""
        return self.linear_relu_stack(x)


class History():
    """Record metric values on random samples of the train/validation sets.

    Every keyword argument passed to the constructor is a metric: a callable
    ``metric(prediction, target)`` whose result has an ``.item()`` method.
    For a metric named ``m`` the lists ``history['val_m']`` and
    ``history['tra_m']`` receive one averaged value per ``save`` call, and
    ``history['steps']`` receives the matching x-axis position.
    """

    # Upper bound on how many samples of each set are evaluated per snapshot.
    SAMPLE_SIZE = 500
    # Batch size and worker count of the evaluation loaders.
    BATCH_SIZE = 5
    NUM_WORKERS = 2

    def __init__(self, val_set, train_set, model, **kwargs):
        """Remember the datasets, the model and the metrics to track."""
        self.val_set = val_set
        self.train_set = train_set
        self.model = model
        self.kwargs = kwargs
        self.history = {'steps': []}
        for name in kwargs:
            self.history['val_' + name] = []
            self.history['tra_' + name] = []
        self.valloader = None
        self.trainloader = None

    def _sample_loader(self, dataset):
        """Return a shuffled loader over at most ``SAMPLE_SIZE`` random items."""
        # Clamp the sample size: a dataset smaller than SAMPLE_SIZE would
        # otherwise hand a negative split length to random_split.
        k = min(self.SAMPLE_SIZE, len(dataset))
        subset, _ = torch.utils.data.random_split(dataset, [k, len(dataset) - k])
        return DataLoader(dataset=subset, batch_size=self.BATCH_SIZE, shuffle=True,
                          num_workers=self.NUM_WORKERS)

    def save(self, step):
        """Evaluate every metric on fresh random samples and store the means.

        Args:
            step: Value appended to ``history['steps']`` for this snapshot.

        Validation and training batches are consumed pairwise, so the
        shorter of the two loaders determines how many batches are scored.
        The model is switched to eval mode for the evaluation and is left in
        train mode afterwards.  A metric that could not be evaluated on any
        batch is recorded as NaN.
        """
        short_history = {prefix + name: [] for name in self.kwargs for prefix in ('val_', 'tra_')}
        self.trainloader = self._sample_loader(self.train_set)
        self.valloader = self._sample_loader(self.val_set)
        self.model.eval()
        try:
            with torch.no_grad():
                for (val_in, val_label), (tra_in, tra_label) in zip(self.valloader, self.trainloader):
                    # Single-sample batches are skipped on either side;
                    # some metrics (e.g. R2) are undefined for one sample.
                    if len(val_in) < 2 or len(tra_in) < 2:
                        continue
                    val_pred = self.model(val_in)
                    tra_pred = self.model(tra_in)
                    for name, metric in self.kwargs.items():
                        short_history['val_' + name].append(metric(val_pred, val_label).item())
                        short_history['tra_' + name].append(metric(tra_pred, tra_label).item())
        finally:
            self.model.train()
        for key, values in short_history.items():
            # NaN instead of a ZeroDivisionError when nothing was scored.
            self.history[key].append(sum(values) / len(values) if values else float('nan'))
        self.history['steps'].append(step)

    def plot(self, path=None):
        """Draw one figure per metric and return the list of figures.

        Validation values are drawn in blue and training values in red.

        Args:
            path: If ``None`` each figure is shown and then cleared.
                Otherwise each figure is saved as ``{path}/{metric_name}``.

        Returns:
            The created matplotlib figures, one per metric.
        """
        figures = []
        for name in self.kwargs:
            fig, ax = plt.subplots()
            ax.plot(self.history['steps'], self.history['val_' + name], 'b')
            ax.plot(self.history['steps'], self.history['tra_' + name], 'r')
            ax.set_title(name.upper())
            ax.set_ylabel(name)
            ax.set_xlabel('Epochs')
            figures.append(fig)
            if path is None:
                plt.show()
                plt.clf()
            else:
                fig.savefig(f"{path}/{name}")
        return figures



def generate_data(topic, data=None, nr=15):
    """Create (or clean) a labelled sentence set for ``topic`` and save it as CSV.

    With ``data=None`` the sentences are requested from the OpenAI completion
    API: ``2 * nr`` keywords about the topic and ``nr`` unrelated topics are
    generated, and requests, demands, questions and factual statements are
    generated for each of them.  Sentences derived from the topic keywords
    are labelled ``True``, all others ``False``.  Intermediate and final
    results are pickled to ``save.p``.

    With ``data`` given, it must be a 2-D array-like whose rows are
    ``[sentence, label]``; it is only cleaned.

    Cleaning drops rows whose sentence is missing or blank, strips
    surrounding whitespace and removes repeated sentences (the first
    occurrence is kept, original order is preserved).

    Args:
        topic: Topic name; also used (spaces replaced by ``_``) in the CSV
            file name ``{topic}_generated_data.csv``.
        data: Optional previously generated ``[sentence, label]`` rows.
        nr: Number of unrelated topics; twice as many keywords are used for
            the topic itself.

    Returns:
        The cleaned data, re-read from the CSV file, with the columns
        ``sentences`` and ``labels``.
    """
    import re

    columns = ['sentences', 'labels']
    # "<number>." or "<number>)" at the start of a line, then the item text.
    numbered_item = re.compile(r'\s*(\d+)\s*[.)]\s*(.*?)\s*$')

    def ask_ai(nr, prompt):
        """Return up to ``nr`` non-empty items of the numbered list the model writes."""
        response = openai.Completion.create(model="text-davinci-003", prompt=prompt, temperature=1, max_tokens=10*nr)
        # Every prompt ends with "1.", so the completion continues item one.
        text = '1.' + response['choices'][0]['text']
        items = []
        for line in text.splitlines():
            # Parse line by line; searching the whole text for the bare
            # number would also match digits inside earlier answers.
            match = numbered_item.match(line)
            if match is None:
                continue
            number, content = int(match.group(1)), match.group(2)
            if 1 <= number <= nr and content:
                items.append(content)
        return items[:nr]

    def gen_sentences(nr, factor, prompt):
        """Generate sentences for each of the keywords produced by ``prompt``."""
        keywords = ask_ai(nr, prompt)
        sentences = []
        for i in keywords:
            print(i)
            requests = ask_ai(15*factor, f'Give me {15*factor} independent short requests about "{i}".\n\n1.')
            demands = ask_ai(15*factor, f'Give me {15*factor} independent short demands about "{i}".\n\n1.')
            questions = ask_ai(15*factor, f'Give me {15*factor} independent short questions about "{i}".\n\n1.')
            facts = ask_ai(5*factor, f'Give me {5*factor} independent short factual statements about "{i}".\n\n1.')
            sentences.extend(requests + demands + questions + facts)
        return sentences

    def clean(frame):
        """Drop missing, blank and repeated sentences; keep the row order."""
        frame = frame.dropna(subset=['sentences'])
        frame = frame.assign(sentences=frame['sentences'].astype(str).str.strip())
        frame = frame[frame['sentences'] != '']
        return frame.drop_duplicates(subset='sentences').reset_index(drop=True)

    if data is None:
        fac = 1
        print(f'Writing sentences about {topic}.')
        prompt = f'Give me {nr*2} independent keywords to the topic {topic}.\n\n1.'
        topic_sentences = gen_sentences(nr*2, fac, prompt)
        with open("save.p", "wb") as f:
            pickle.dump(topic_sentences, f)
        print(f'Writing sentences not about {topic}.')
        prompt = f'Give me {nr} topics fully unrelated to {topic}.\n\n1.'
        other_sentences = gen_sentences(nr, 2*fac, prompt)
        all_sentences = topic_sentences + other_sentences
        with open("save.p", "wb") as f:
            pickle.dump(all_sentences, f)
        print(all_sentences)
        print(len(all_sentences))
        print('Labelling sentences.')
        # Label by origin rather than by list position, so an API answer
        # with fewer items than requested cannot shift the class boundary.
        labels = [True] * len(topic_sentences) + [False] * len(other_sentences)
        frame = clean(pd.DataFrame({'sentences': all_sentences, 'labels': labels}))
        with open("save.p", "wb") as f:
            # Stored as a 2-D [sentence, label] array, the layout accepted
            # by the ``data`` argument.
            pickle.dump(frame.to_numpy(), f)
        print('full data has been saved to "save.p".')
    else:
        frame = clean(pd.DataFrame(np.asarray(data), columns=columns))

    csv_path = f"{topic.replace(' ', '_')}_generated_data.csv"
    frame.to_csv(csv_path, index=False)
    return pd.read_csv(csv_path)
    # TODO needs better quality filters; blank and repeated sentences are already removed.


# Tokenizer/model pairs that were already loaded in this process, keyed by
# checkpoint name, so repeated calls do not reload the weights from disk.
_ROBERTA_CACHE = {}


def long_roberta(sentences):
    """Return sentence embeddings from ``sentence-transformers/all-roberta-large-v1``.

    Token embeddings are mean-pooled per input, with padding positions
    excluded through the attention mask, and each pooled vector is then
    L2-normalised.

    Args:
        sentences: A string or a list of strings; passed to the tokenizer
            unchanged, with padding and truncation enabled.

    Returns:
        A 2-D tensor holding one normalised embedding per row.
    """
    def mean_pooling(model_output, attention_mask):
        # First element of model_output contains all token embeddings.
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp prevents a division by zero for an all-padding row.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    checkpoint = 'sentence-transformers/all-roberta-large-v1'
    if checkpoint not in _ROBERTA_CACHE:
        # This function is called once per chunk / per user input; loading
        # the weights each time would dominate the run time.
        _ROBERTA_CACHE[checkpoint] = (
            AutoTokenizer.from_pretrained(checkpoint),
            AutoModel.from_pretrained(checkpoint),
        )
    tokenizer, model = _ROBERTA_CACHE[checkpoint]

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings without tracking gradients
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool over tokens, then normalise each embedding to unit length
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return F.normalize(sentence_embeddings, p=2, dim=1)


def prepare_data_slowly(data, topic):
    """Embed all sentences of ``data`` in chunks and checkpoint the result.

    Args:
        data: DataFrame whose first column holds the sentences and whose
            second column holds the labels.
        topic: Used in the checkpoint file name ``embedded_data_{topic}.pt``.

    Returns:
        An object array of shape ``(2, n)``: row 0 holds one embedding
        tensor per sentence (each keeps a leading axis of length 1, as
        produced by ``torch.tensor_split``), row 1 holds the labels.

    The checkpoint file is rewritten after every chunk with all rows
    embedded so far, stored untransposed as ``(n, 2)``.
    """
    np_data = data.to_numpy().transpose()
    sentences, labels = np_data[0], np_data[1]
    k = 100  # sentences embedded per call to long_roberta
    total_chunks = math.ceil(len(sentences) / k)
    pairs = []
    embedded_data = np.empty((0, 2), dtype=object)
    for i in range(total_chunks):
        chunk = long_roberta(list(sentences[k*i:k*i+k]))
        pairs.extend(zip(torch.tensor_split(chunk, len(chunk)), labels[k*i:k*i+k]))
        # Fill an explicit object array cell by cell: letting NumPy infer a
        # shape from (tensors, labels) builds a ragged array, which recent
        # NumPy versions reject.
        embedded_data = np.empty((len(pairs), 2), dtype=object)
        for row, (vector, label) in enumerate(pairs):
            embedded_data[row, 0] = vector
            embedded_data[row, 1] = label
        torch.save(embedded_data, f'embedded_data_{topic}.pt')
        print(f'saved {i+1} / {total_chunks}')
    return embedded_data.transpose()


def check_length(data):
    """Warn about sentences whose token count exceeds 128 or 512.

    Rows are visited from the longest to the shortest sentence (by character
    count) and the scan stops at the first sentence with fewer than 80
    tokens.

    Args:
        data: DataFrame with a string column named ``sentences``.
    """
    # Load the tokenizer once instead of once per row.
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-roberta-large-v1')

    def tokenize(sentences):
        return tokenizer(sentences, padding=True, truncation=False, return_tensors='pt')

    sorted_data = data.reindex(data.sentences.str.len().sort_values().index[::-1]).reset_index(drop=True)

    for idx, row in sorted_data.iterrows():
        length = len(tokenize(row.sentences)['input_ids'][0])
        if length > 512:
            print('Warning: Paragraph longer than 512 tokens therefore to long.')
        elif length > 128:
            print('Warning: Paragraph longer than 128 tokens therefore longer than recommended.')
        elif length < 80:
            # Sentences are sorted by descending character length, so the
            # scan ends once a sentence is clearly short.
            break



def analyse_full_data(data):
    """Print summary statistics and draw exploratory plots for a labelled set.

    Printed: ``DataFrame.info``, per-label descriptive statistics, the
    number of unique sentences, duplicate rows and nulls, the first three
    rows and up to three consecutive rows starting at a random position.
    Drawn: a label count plot, one word cloud per label and two histograms
    of the per-sentence word count (overall and split by label).

    Args:
        data: DataFrame with the columns ``sentences`` and ``labels``.
            It is not modified.
    """
    print('INFO')
    data.info()
    # The statistics are printed explicitly; a bare expression inside a
    # function would be discarded.
    print(data.groupby(['labels']).describe())
    print(f'Number of unique sentences: {data["sentences"].nunique()}')
    duplicates = data[data.duplicated()]
    print(f'Number of duplicate rows:\n{len(duplicates)}')
    print(f'Check for nulls:\n{data.isnull().sum()}')
    sns.countplot(x=data['labels'])  # plot the label distribution
    print(data.head(3))

    print('A few random examples from the dataset:')
    # max(0, ...) keeps randint valid for data sets with fewer than 3 rows.
    random_index = random.randint(0, max(0, data.shape[0] - 3))
    for row in data[['sentences', 'labels']][random_index:random_index + 3].itertuples():
        _, text, label = row
        print(f'TEXT: {text}')
        print(f'LABEL: {label}')

    truedata = data[data['labels'] == 1]['sentences']
    falsedata = data[data['labels'] == 0]['sentences']

    def wordcloud_draw(data, color, s):
        """Draw a word cloud of ``data`` with title ``s`` on the current axes."""
        words = ' '.join(data)
        cleaned_word = " ".join([word for word in words.split() if (word != 'movie' and word != 'film')])
        wordcloud = WordCloud(stopwords=stopwords.words('english'), background_color=color, width=2500,
                              height=2000).generate(cleaned_word)
        plt.imshow(wordcloud)
        plt.title(s)
        plt.axis('off')

    plt.figure(figsize=[20, 10])
    plt.subplot(1, 2, 1)
    wordcloud_draw(truedata, 'white', 'Most-common words about the topic')

    plt.subplot(1, 2, 2)
    wordcloud_draw(falsedata, 'white', 'Most-common words not about the topic')
    plt.show()  # end wordcloud

    # assign() returns a new frame, so the caller's DataFrame does not gain
    # an extra column as a side effect of being analysed.
    data = data.assign(text_word_count=data['sentences'].apply(lambda x: len(x.split())))

    numerical_feature_cols = ['text_word_count']

    plt.figure(figsize=(20, 3))
    for i, col in enumerate(numerical_feature_cols):
        plt.subplot(1, 3, i + 1)
        sns.histplot(data=data, x=col, bins=50, color='#6495ED')
        plt.title(f"Distribution of Various word counts")
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(20, 3))
    for i, col in enumerate(numerical_feature_cols):
        plt.subplot(1, 3, i + 1)
        sns.histplot(data=data, x=col, hue='labels', bins=50)
        plt.title(f"Distribution of Various word counts with respect to target")
    plt.tight_layout()
    plt.show()


class TopicIdentifier:
    """Pipeline object: generate data, embed it and train a topic classifier.

    Typical order of calls: ``generate_training_data``, optionally
    ``analyse_training_data``, then ``embedd_data`` and finally ``train``.
    """

    def __init__(self):
        """Create an empty pipeline; every attribute is filled by a later step."""
        self.running_loss = None
        self.optimizer = None
        self.dataloader = None
        self.model = None
        self.loss = None
        self.dataframe = None
        self.val_set = None
        self.train_set = None
        self.labels = None
        self.sentences = None
        self.embedded_data = None
        self.raw_data = None
        self.dataset = None

    def generate_training_data(self, topic, real=True):
        """Fill ``raw_data`` with labelled sentences for ``topic``.

        Args:
            topic: Topic the classifier should recognise.
            real: If true, new data is generated; otherwise the pickled rows
                in ``save.p`` are loaded and only cleaned.
        """
        if real:
            self.raw_data = generate_data(topic, nr=15)
        else:
            data = load_data('save.p')
            self.raw_data = generate_data(topic, data=data, nr=15)

    def embedd_data(self, topic, real=True):
        """Build ``sentences``, ``labels`` and ``dataset`` from embeddings.

        Args:
            topic: Selects the checkpoint file ``embedded_data_{topic}.pt``.
            real: If true, ``raw_data`` is embedded now; otherwise the
                checkpoint file is loaded.
        """
        if real:
            self.embedded_data = prepare_data_slowly(self.raw_data, topic)
        else:
            # NOTE(review): the checkpoint is a pickled NumPy object array;
            # newer PyTorch releases may require weights_only=False here.
            # Confirm against the installed version.
            self.embedded_data = torch.load(f'embedded_data_{topic}.pt').transpose()
        # Row 0 holds one tensor per sentence; concatenate them along axis 0.
        self.sentences = torch.cat(tuple(self.embedded_data[0]))
        # Row 1 holds the labels; convert them to a float32 column (n, 1)
        # without modifying ``embedded_data`` in place.
        labels = np.asarray(self.embedded_data[1]).astype('float32')
        self.labels = torch.from_numpy(np.expand_dims(labels, axis=1))
        self.dataset = CustomTopicDataset(self.sentences, self.labels)

    def analyse_training_data(self):
        """Print statistics and draw plots for ``raw_data``."""
        # check_length(self.raw_data)
        analyse_full_data(self.raw_data)

    def train(self, epochs=10, lr=0.001, val_frac=0.1, batch_size=25, loss=nn.BCELoss(), saves_per_epoch=1):
        """Train a new ``NeuralNetwork`` on ``dataset`` and save it to disk.

        Args:
            epochs: Number of passes over the training split.
            lr: Learning rate of the Adam optimizer.
            val_frac: Fraction of ``dataset`` held out for validation.
            batch_size: Training batch size.
            loss: Loss module applied to ``(prediction, label)``.
            saves_per_epoch: How many times per epoch the running loss is
                printed and a history snapshot is taken.  The default of 1
                reports once, after the last step of each epoch.

        Returns:
            ``(history, model)``: the ``History`` with the recorded metrics
            and the trained model.

        A directory ``model_<timestamp>`` is created containing ``model.pt``
        and one plot per tracked metric.
        """
        def get_acc(pred, target):
            """Return the rounded percentage of rounded predictions equal to ``target``."""
            pred_tag = torch.round(pred)
            correct_results_sum = (pred_tag == target).sum().float()
            acc = correct_results_sum / target.shape[0]
            return torch.round(acc * 100)

        val_len = int(round(len(self.dataset)*val_frac))
        self.train_set, self.val_set = torch.utils.data.random_split(self.dataset, [len(self.dataset)-val_len, val_len])
        self.dataloader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=True)
        self.model = NeuralNetwork(self.dataset.shape)

        self.loss = loss
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        r2loss = R2Score()
        mseloss = nn.MSELoss()
        bceloss = nn.BCELoss()
        accuracy = get_acc

        history = History(self.val_set, self.train_set, self.model, r2loss=r2loss, mseloss=mseloss, accuracy=accuracy, bceloss=bceloss)

        steps_per_epoch = len(self.dataloader)
        # Number of steps between two reports; at least one step.
        save_every = max(1, steps_per_epoch // max(1, saves_per_epoch))

        # main training loop
        for epoch in range(epochs):
            self.running_loss = 0.
            steps_since_report = 0
            print(f'Starting new batch {epoch + 1}/{epochs}')
            for step, (inputs, labels) in enumerate(self.dataloader):
                y_pred = self.model(inputs)
                l = self.loss(y_pred, labels)
                l.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.running_loss += l.item()
                steps_since_report += 1
                if (step + 1) % save_every == 0:
                    # Average over the steps actually accumulated, not over
                    # a fixed constant.
                    print(f'current loss:\t\t{self.running_loss / steps_since_report}')
                    self.running_loss = 0
                    steps_since_report = 0
                    # Save the current state of the model to the history;
                    # the x position is the fraction of epochs completed.
                    history.save(epoch + (step + 1) / steps_per_epoch)
        now = str(d.now().isoformat()).replace(':', 'I').replace('.', 'i').replace('-', '_')
        os.mkdir(f"model_{now}")
        torch.save(self.model, f"model_{now}/model.pt")
        print(f'Model saved to "model_{now}/model.pt"')
        history.plot(f"model_{now}")
        return history, self.model

def load_data(filename):
    """Return the object unpickled from the file ``filename``.

    Unpickling can execute arbitrary code, so only files written by this
    program (or otherwise trusted) should be loaded.
    """
    with open(filename, "rb") as handle:
        return pickle.load(handle)

def try_model(model):
    """Ask for a sentence on stdin, embed it and print ``model``'s score."""
    embedding = long_roberta(input('Please enter your input sentence: '))
    prediction = model(embedding)
    print(prediction.item())
    print('Where 1 is about the topic and 0 is not.\n')


def prompt_engineering_acc(topic, dataframe):
    """Measure how well a plain yes/no prompt classifies the sentences.

    For every row the completion model is asked whether the sentence is
    about ``topic``.  An answer starting with "y" counts as label 1 and one
    starting with "n" as label 0 (case and surrounding whitespace are
    ignored).  Any other answer is reported and left out of the score.

    Args:
        topic: Topic named in the prompt.
        dataframe: DataFrame with the columns ``sentences`` and ``labels``.

    Returns:
        The percentage of scored rows whose answer matches the label, or
        NaN when no row could be scored.
    """
    hits = []
    for idx, row in dataframe.reset_index().iterrows():
        inp = row['sentences']
        out = row['labels']
        prompt = f'Answer with either "yes" or "no". Is the following sentence about {topic}?\n\n{inp}\n\nAnswer:'
        response = openai.Completion.create(model="text-davinci-003", prompt=prompt, temperature=0, max_tokens=2)
        response = response['choices'][0]['text']
        answer = response.strip().lower()
        if answer.startswith('y'):
            predicted = 1
        elif answer.startswith('n'):
            predicted = 0
        else:
            print(f'Bad response from openai: "{response}".')
            continue
        hits.append(1 if predicted == out else 0)
    if not hits:
        # Nothing was scored (empty input or only unusable answers).
        return float('nan')
    return 100*sum(hits)/len(hits)



if __name__ == "__main__":
    # End-to-end run for two topics: generate data, score the plain prompt
    # baseline, analyse and embed the data, train one classifier per topic
    # and finally query both classifiers interactively.
    topic = 'biology'
    topic2 = 'chemistry'
    ti = TopicIdentifier()
    t2 = TopicIdentifier()
    ti.generate_training_data(topic)
    t2.generate_training_data(topic2)
    acc = prompt_engineering_acc(topic, ti.raw_data)
    ac2 = prompt_engineering_acc(topic2, t2.raw_data)
    print(f'{topic} prompt engineering: {acc}')
    print(f'{topic2} prompt engineering: {ac2}')
    print(f'{topic} analyse ----------------------------')
    ti.analyse_training_data()
    print(f'{topic2} analyse ----------------------------')
    t2.analyse_training_data()
    ti.embedd_data(topic)
    t2.embedd_data(topic2)
    history, biology = ti.train(epochs=10, lr=0.0001, val_frac=0.1, batch_size=10, loss=nn.BCELoss())
    # The chemistry model must be trained by t2, which holds the chemistry
    # data; ti holds the biology data.
    history2, chemistry = t2.train(epochs=10, lr=0.0001, val_frac=0.1, batch_size=10, loss=nn.BCELoss())
    while True:
        print('BIOLOGY:')
        try_model(biology)
        print('CHEMISTRY:')
        try_model(chemistry)



# Far higher diversity is needed in the samples that are not topic-related: normal conversation, random sequences of letters, etc.


This is the movie review code

In [None]:
# TODO: add a comment to everything explaining its purpose


# These are the start commands when running this as a Jupyter notebook on Colab:

print('starting.')
!pip install transformers
!pip install sentence_transformers
!pip install torchmetrics
# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import math

# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html

import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import pandas as pd
from torchmetrics import R2Score


class CustomMovieDataset(Dataset):
    """Map-style dataset that pairs each review embedding with its sentiment."""

    def __init__(self, reviews, sentiments):
        """Store inputs and targets; the length is the first axis of ``reviews``."""
        self.x, self.y = reviews, sentiments
        self.length = self.x.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, idx):
        """Return the ``(review, sentiment)`` pair stored at ``idx``."""
        review, sentiment = self.x[idx], self.y[idx]
        return review, sentiment


class NeuralNetwork(nn.Module):
    """Feed-forward sentiment classifier applied to review embeddings.

    Nine linear layers (``input_size`` -> 2048 -> 2048 -> 2048 -> 1024 ->
    512 -> 128 -> 64 -> 32 -> 1) are separated by ReLU and followed by a
    sigmoid, so every output value lies between 0 and 1.
    """

    def __init__(self, input_size):
        """Create the layers for inputs with ``input_size`` features."""
        super().__init__()
        hidden = [2048, 2048, 2048, 1024, 512, 128, 64, 32]
        blocks = []
        previous = input_size
        # Append Linear + ReLU for each hidden width, in order.
        for width in hidden:
            blocks.extend((nn.Linear(previous, width), nn.ReLU()))
            previous = width
        blocks.extend((nn.Linear(previous, 1), nn.Sigmoid()))
        self.linear_relu_stack = nn.Sequential(*blocks)

    def forward(self, x):
        """Return the sigmoid output of the stack for the batch ``x``."""
        return self.linear_relu_stack(x)


class History():
    """Record metric values on random samples of the train/validation sets.

    Every keyword argument passed to the constructor is a metric: a callable
    ``metric(prediction, target)`` whose result has an ``.item()`` method.
    For a metric named ``m`` the lists ``history['val_m']`` and
    ``history['tra_m']`` receive one averaged value per ``save`` call, and
    ``history['steps']`` receives the matching x-axis position.
    """

    # Upper bound on how many samples of each set are evaluated per snapshot.
    SAMPLE_SIZE = 500
    # Batch size and worker count of the evaluation loaders.
    BATCH_SIZE = 5
    NUM_WORKERS = 2

    def __init__(self, val_set, train_set, model, **kwargs):
        """Remember the datasets, the model and the metrics to track."""
        self.val_set = val_set
        self.train_set = train_set
        self.model = model
        self.kwargs = kwargs
        self.history = {'steps': []}
        for name in kwargs:
            self.history['val_' + name] = []
            self.history['tra_' + name] = []
        self.valloader = None
        self.trainloader = None

    def _sample_loader(self, dataset):
        """Return a shuffled loader over at most ``SAMPLE_SIZE`` random items."""
        # Clamp the sample size: a dataset smaller than SAMPLE_SIZE would
        # otherwise hand a negative split length to random_split.
        k = min(self.SAMPLE_SIZE, len(dataset))
        subset, _ = torch.utils.data.random_split(dataset, [k, len(dataset) - k])
        return DataLoader(dataset=subset, batch_size=self.BATCH_SIZE, shuffle=True,
                          num_workers=self.NUM_WORKERS)

    def save(self, step):
        """Evaluate every metric on fresh random samples and store the means.

        Args:
            step: Value appended to ``history['steps']`` for this snapshot.

        Validation and training batches are consumed pairwise, so the
        shorter of the two loaders determines how many batches are scored.
        The model is switched to eval mode for the evaluation and is left in
        train mode afterwards.  A metric that could not be evaluated on any
        batch is recorded as NaN.
        """
        short_history = {prefix + name: [] for name in self.kwargs for prefix in ('val_', 'tra_')}
        self.trainloader = self._sample_loader(self.train_set)
        self.valloader = self._sample_loader(self.val_set)
        self.model.eval()
        try:
            with torch.no_grad():
                for (val_in, val_label), (tra_in, tra_label) in zip(self.valloader, self.trainloader):
                    # Single-sample batches are skipped on either side;
                    # some metrics (e.g. R2) are undefined for one sample.
                    if len(val_in) < 2 or len(tra_in) < 2:
                        continue
                    val_pred = self.model(val_in)
                    tra_pred = self.model(tra_in)
                    for name, metric in self.kwargs.items():
                        short_history['val_' + name].append(metric(val_pred, val_label).item())
                        short_history['tra_' + name].append(metric(tra_pred, tra_label).item())
        finally:
            self.model.train()
        for key, values in short_history.items():
            # NaN instead of a ZeroDivisionError when nothing was scored.
            self.history[key].append(sum(values) / len(values) if values else float('nan'))
        self.history['steps'].append(step)

    def plot(self, path=None):
        """Draw one figure per metric and return the list of figures.

        Validation values are drawn in blue and training values in red; the
        metric name is printed before each figure.

        Args:
            path: If ``None`` (the default) each figure is shown and then
                cleared.  Otherwise each figure is saved as
                ``{path}/{metric_name}``.

        Returns:
            The created matplotlib figures, one per metric.
        """
        figures = []
        for name in self.kwargs:
            fig, ax = plt.subplots()
            ax.plot(self.history['steps'], self.history['val_' + name], 'b')
            ax.plot(self.history['steps'], self.history['tra_' + name], 'r')
            print(f'{name}:')
            figures.append(fig)
            if path is None:
                plt.show()
                plt.clf()
            else:
                fig.savefig(f"{path}/{name}")
        return figures


def main(epochs=10, learning_rate=0.01, test_size=1000, train_batch_size=10, validation_batch_size=512, num_workers=2,
         loss=None, data_factor=1):
    """Train the sentiment classifier on the pre-embedded movie reviews.

    Args:
        epochs: Number of passes over the training split.
        learning_rate: Learning rate of the SGD optimizer.
        test_size: Number of samples held out for validation.
        train_batch_size: Training batch size.
        validation_batch_size: Accepted for compatibility; not used here
            (``History`` builds its own evaluation loaders).
        num_workers: Accepted for compatibility; not used here.
        loss: Loss module applied to ``(prediction, label)``; defaults to
            ``nn.MSELoss()``.
        data_factor: Fraction of the embedded samples to train on; values
            below 1 select a random subset.
    """
    if loss is None:
        loss = nn.MSELoss()
    url = 'https://raw.githubusercontent.com/Lokisfeuer/diamond/master/imdbdataset.csv'
    data = pd.read_csv(url)
    # Keep the CSV in file order here: prepare_data pairs the i-th stored
    # embedding with the i-th row, so shuffling the rows first would attach
    # the wrong sentiment to each review.
    data, sentiments = prepare_data(data)
    if data_factor < 1:
        # Subsample embeddings and labels together, after they are paired.
        keep = torch.randperm(len(data))[:int(round(len(data) * data_factor))]
        data, sentiments = data[keep], sentiments[keep]

    dataset = CustomMovieDataset(data, sentiments)
    train_set, val_set = torch.utils.data.random_split(dataset, [len(data) - test_size, test_size])
    print(len(val_set))
    print(len(train_set))

    dataloader = DataLoader(dataset=train_set, batch_size=train_batch_size, shuffle=True)
    print(data)
    model = NeuralNetwork(len(data[0]))
    r2loss = R2Score()
    mseloss = nn.MSELoss()
    bceloss = nn.BCELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    history = History(val_set, train_set, model, r2loss=r2loss, mseloss=mseloss, accuracy=get_acc, bceloss=bceloss)

    for epoch in range(epochs):
        running_l = 0
        print(f'Starting new batch {epoch + 1}/{epochs}')
        for step, (inputs, labels) in enumerate(dataloader):
            y_pred = model(inputs)
            # Optimise the loss the caller asked for.
            l = loss(y_pred, labels)
            running_l += l.item()
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            if (step + 1) % 50 == 0:
                # Snapshot the metrics and report the mean loss of the last
                # 50 steps.
                history.save(epoch * len(dataloader) + step)
                print(f'training loss: {running_l / 50}')
                running_l = 0
    history.plot()


def get_acc(pred, target):
    """Return the accuracy of ``pred`` against ``target`` as a rounded percentage.

    Predictions are rounded to the nearest integer before the comparison;
    the number of matching entries is divided by ``target.shape[0]``.
    """
    hits = (torch.round(pred) == target).sum().float()
    fraction = hits / target.shape[0]
    return torch.round(fraction * 100)


def prepare_data(data):
    """Pair the stored review embeddings with the sentiments in ``data``.

    The embeddings are read from ``embedded_reviews.pt`` (a sequence of
    chunks, each a NumPy array or a tensor) and concatenated.  The i-th
    embedding is paired with the i-th row of ``data``, so ``data`` must be
    in the order in which the embeddings were produced.  If the two lengths
    differ, both are cut to the shorter one.

    Args:
        data: DataFrame whose second column holds ``'positive'`` or
            ``'negative'``.

    Returns:
        ``(reviews, sentiments)``: float32 tensors; ``sentiments`` has one
        column with 1.0 for positive and 0.0 for negative.

    Raises:
        ValueError: If a sentiment is neither ``'positive'`` nor
            ``'negative'``.
    """
    np_data = data.to_numpy().transpose()
    file_data = torch.load('embedded_reviews.pt')
    print(type(file_data))
    print(f'length of file_data: {len(file_data)}')
    x = [torch.as_tensor(chunk) for chunk in file_data]
    print(f'length of x: {len(x)}')
    reviews = torch.cat(x).to(torch.float32)
    # Use as many labels as there are embeddings rather than a fixed count.
    count = min(len(reviews), len(np_data[1]))
    reviews = reviews[:count]
    raw_sentiments = np_data[1][:count]
    print(f'length of reviews: {len(reviews)}')
    print(f'length of sentiments: {len(raw_sentiments)}')
    encoding = {'positive': 1., 'negative': 0.}
    try:
        encoded = [[encoding[s]] for s in raw_sentiments]
    except KeyError as err:
        raise ValueError(f'unknown sentiment label: {err}') from err
    sentiments = torch.tensor(encoded, dtype=torch.float32)
    return reviews, sentiments

def prepare_data_slowly():
    """Embed every review of the IMDB CSV and checkpoint the result.

    The reviews (first CSV column) are encoded in chunks of 100 with the
    ``all-roberta-large-v1`` sentence transformer.  After each chunk the
    list of all chunks encoded so far is written to ``embedded_reviews.pt``.
    """
    url = 'https://raw.githubusercontent.com/Lokisfeuer/diamond/master/imdbdataset.csv'
    data = pd.read_csv(url)
    np_data = data.to_numpy().transpose()
    # use sentence embedding to encode the reviews
    model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
    all_reviews = []
    k = 100
    total = len(np_data[0])
    chunk_count = -(-total // k)  # ceiling division
    # Stepping through range(0, total, k) also covers a final partial
    # chunk, which rounding total / k could drop.
    for i, start in enumerate(range(0, total, k)):
        all_reviews.append(model.encode(np_data[0][start:start + k]))
        torch.save(all_reviews, 'embedded_reviews.pt')
        print(f'saved {i+1} / {chunk_count}')

if __name__ == '__main__':
    # prepare_data_slowly()
    # Training configuration, forwarded to main() as keyword arguments.
    kwargs = dict(
        epochs=10,
        learning_rate=0.01,
        test_size=500,  # 1000  # 10% of full dataset
        train_batch_size=25,
        validation_batch_size=512,
        num_workers=2,
        loss=nn.BCELoss(),
        data_factor=1,
    )
    main(**kwargs)
    # for jupyter:
    #   change reading of csv
    #   adjust start command from jupyter.


Short version of the movie review code to test.

In [None]:
# TODO: add a comment to everything explaining its purpose


# These are the start commands when running this as a Jupyter notebook on Colab:

print('starting.')
!pip install transformers
!pip install sentence_transformers
!pip install torchmetrics
# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class CustomMovieDataset(Dataset):
    """Dataset of ``(review, sentiment)`` pairs addressed by integer index."""

    def __init__(self, reviews, sentiments):
        """Keep both inputs and take the length from the first axis of ``reviews``."""
        self.x = reviews
        self.y = sentiments
        (self.length, *_) = self.x.shape

    def __len__(self):
        """Return the number of stored pairs."""
        return self.length

    def __getitem__(self, idx):
        """Return the pair stored at position ``idx``."""
        return (self.x[idx], self.y[idx])


class NeuralNetwork(nn.Module):
    """Fully connected stack that maps an input vector to one raw output value.

    The stack ends in a plain ``nn.Linear`` (no sigmoid), so the output is a
    logit.
    """

    def __init__(self, input_size):
        super().__init__()
        # Width of every layer boundary, from the input to the last hidden layer.
        widths = [input_size, 2048, 2048, 2048, 1024, 512, 128, 64, 32]
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output layer: a single unit without an activation.
        layers.append(nn.Linear(widths[-1], 1))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the raw outputs."""
        return self.linear_relu_stack(x)



def main(epochs=10, learning_rate=0.01, train_batch_size=10, loss=None):
    """Train the movie-review network on the cached review embeddings.

    Downloads the IMDB CSV, turns it into tensors with ``prepare_data`` and
    optimises a fresh ``NeuralNetwork`` with Adam, printing the mean training
    loss every 50 batches.

    Args:
        epochs: number of passes over the data.
        learning_rate: learning rate handed to Adam.
        train_batch_size: batch size of the training loader.
        loss: callable ``loss(predictions, labels)``; ``nn.MSELoss()`` is used
            when omitted. The network returns raw outputs (no sigmoid), so a
            logit-aware loss such as ``nn.BCEWithLogitsLoss()`` can be passed.
    """
    if loss is None:
        loss = nn.MSELoss()
    url = 'https://raw.githubusercontent.com/Lokisfeuer/diamond/master/imdbdataset.csv'
    data = pd.read_csv(url)
    data, sentiments = prepare_data(data)

    dataset = CustomMovieDataset(data, sentiments)

    dataloader = DataLoader(dataset=dataset, batch_size=train_batch_size, shuffle=True)
    print(data)
    model = NeuralNetwork(len(data[0]))
    # The unused R2Score / MSELoss / BCELoss instances were dropped: nothing
    # read them, and R2Score is not imported by this cell.

    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    log_every = 50  # batches between two printed training-loss averages
    for epoch in range(epochs):
        running_l = 0
        print(f'Starting new batch {epoch + 1}/{epochs}')
        for step, (inputs, labels) in enumerate(dataloader):
            y_pred = model(inputs)
            l = loss(y_pred, labels)
            running_l += l.item()
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            if (step + 1) % log_every == 0:
                print(f'training loss: {running_l / log_every}')
                running_l = 0


def prepare_data(data, embeddings_path='embedded_reviews.pt'):
    """Load the cached review embeddings and build the matching label tensor.

    Args:
        data: DataFrame read from the IMDB CSV. Its second column is read as
            the sentiment label ('positive' / 'negative'). The frame is not
            modified.
        embeddings_path: file holding a list of NumPy embedding batches as
            written by ``torch.save`` (default ``'embedded_reviews.pt'``).

    Returns:
        ``(reviews, sentiments)`` as float32 tensors. ``reviews`` is the
        concatenation of all cached batches; ``sentiments`` has one row per
        review with 1.0 for 'positive' and 0.0 for 'negative'.

    Raises:
        ValueError: if the CSV has fewer rows than there are cached
            embeddings, or a label cannot be converted to a float.
    """
    # The cache stores plain NumPy arrays, which the weights_only=True default
    # of recent torch releases refuses to unpickle. Only load trusted files.
    try:
        file_data = torch.load(embeddings_path, weights_only=False)
    except TypeError:
        # Older torch releases do not know the weights_only keyword.
        file_data = torch.load(embeddings_path)
    print(type(file_data))
    print(f'length of file_data: {len(file_data)}')
    x = [torch.from_numpy(batch) for batch in file_data]
    print(f'length of x: {len(x)}')
    # .to() converts without the copy-construct warning torch.tensor(tensor) emits.
    reviews = torch.cat(x).to(torch.float32)

    # Take exactly as many labels as there are embeddings instead of a
    # hard-coded count, so a partially filled cache still lines up.
    raw_labels = data.to_numpy().transpose()[1][:len(reviews)]
    print(f'length of reviews: {len(reviews)}')
    print(f'length of sentiments: {len(raw_labels)}')
    if len(raw_labels) != len(reviews):
        raise ValueError(
            f'{len(reviews)} embedded reviews but only {len(raw_labels)} labels')

    # Build a new list rather than rewriting the array in place: the array
    # returned by to_numpy() may share memory with the caller's DataFrame.
    label_values = {'positive': 1., 'negative': 0.}
    encoded = [[label_values.get(label, label)] for label in raw_labels]
    sentiments = torch.from_numpy(np.array(encoded, dtype=np.float32))
    return reviews, sentiments


if __name__ == '__main__':
    # prepare_data_slowly()
    # Settings forwarded unchanged to main() as keyword arguments.
    kwargs = dict(
        epochs=10,
        learning_rate=0.01,
        train_batch_size=25,
        loss=nn.BCEWithLogitsLoss(),
    )
    main(**kwargs)
    # for jupyter:
    #   change reading of csv
    #   adjust start command from jupyter.


This is the full diamond price estimator code:

In [None]:
print('starting.')
!pip install transformers
!pip install sentence_transformers
!pip install torchmetrics
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html

import numpy as np
import os
import math
import random
from datetime import datetime as d
import jupyter
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch.nn.functional as f
# from kaggle.api.kaggle_api_extended import KaggleApi
# api = KaggleApi()
# api.authenticate() # comment out for jupyter
import pandas as pd
from torchvision.io import read_image
from sklearn.preprocessing import MinMaxScaler
from torch.utils.tensorboard import SummaryWriter
from torchmetrics import R2Score
# from torchmetrics.functional import r2_score

class CustomDiamondDataset(Dataset):
    """Map-style dataset that pairs row ``i`` of ``data`` with row ``i`` of ``prices``."""

    def __init__(self, data, prices):
        # Both containers are stored as given; nothing is copied or converted.
        self.x, self.y = data, prices
        # The sample count is taken from the first dimension of the inputs.
        self.length = self.x.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, idx):
        """Return the ``(features, price)`` pair stored at position ``idx``."""
        features = self.x[idx]
        target = self.y[idx]
        return features, target


class NeuralNetwork(nn.Module):
    """Small regression network: flatten, two 512-unit ReLU layers, one output."""

    def __init__(self, input_size):
        super().__init__()
        hidden = 512
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, x):
        """Flatten each sample of ``x`` and return the stack's raw output."""
        flat = self.flatten(x)
        return self.linear_relu_stack(flat)


def main(epochs=10, learning_rate=0.01, test_size=1000, train_batch_size=10, validation_batch_size=512, num_workers=2, loss=None, optimizer=None, data_factor=1):
    """Train the diamond price network, log its metrics and save the result.

    The diamonds CSV is downloaded, normalised with ``normalize_data`` and
    split randomly into a training and a validation set. Every 100 training
    batches the model is evaluated on the validation set and the averaged
    training metrics are recorded (also to TensorBoard under
    ``runs/diamond``). A checkpoint is written to ``checkpoint.pth`` after
    each epoch; at the end the curves are plotted with ``graph`` and the
    weights are saved to ``2model.pth``.

    Args:
        epochs: number of passes over the training set.
        learning_rate: learning rate for the optimizer.
        test_size: number of samples held out for validation.
        train_batch_size: batch size of the training loader.
        validation_batch_size: batch size of the validation loader.
        num_workers: worker processes used by both loaders.
        loss: a callable ``loss(predictions, labels)`` such as
            ``nn.MSELoss()``, or (legacy) a string with a Python expression
            producing one. Defaults to ``nn.MSELoss()``.
        optimizer: a factory called as
            ``optimizer(model.parameters(), lr=learning_rate)`` (for example
            ``torch.optim.Adam``), or (legacy) a string with a Python
            expression that may refer to ``model`` and ``learning_rate``.
            Defaults to SGD.
        data_factor: fraction of the rows to keep (``DataFrame.sample(frac=...)``).

    Raises:
        ValueError: if ``test_size`` does not leave at least one training sample.
    """
    log_every = 100  # training batches between two validation / logging points

    url = 'https://raw.githubusercontent.com/Lokisfeuer/diamond/master/diamonds.csv'
    data = pd.read_csv(url)
    data = data.sample(frac=data_factor)
    data, prices, maxi, mini = normalize_data(data)
    if not 0 <= test_size < len(data):
        raise ValueError(
            f'test_size must be in [0, {len(data)}) for {len(data)} samples, got {test_size}')
    data = torch.tensor(data, dtype=torch.float32)
    prices = torch.tensor(prices, dtype=torch.float32)
    dataset = CustomDiamondDataset(data, prices)
    train_set, val_set = torch.utils.data.random_split(dataset, [len(data)-test_size, test_size])

    valloader = DataLoader(dataset=val_set, batch_size=validation_batch_size, shuffle=True, num_workers=num_workers)
    dataloader = DataLoader(dataset=train_set, batch_size=train_batch_size, shuffle=True, num_workers=num_workers)
    model = NeuralNetwork(len(data[0]))

    # NOTE(security): string specifications are still run through eval() for
    # backward compatibility. Never feed them from untrusted input; prefer
    # passing objects / factories. The eval calls must stay in this scope so
    # the expressions can see ``model`` and ``learning_rate``.
    if loss is None:
        loss = nn.MSELoss()
    elif isinstance(loss, str):
        loss = eval(loss)
    r2loss = R2Score()
    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif isinstance(optimizer, str):
        optimizer = eval(optimizer)
    elif callable(optimizer):
        optimizer = optimizer(model.parameters(), lr=learning_rate)

    val_mse_loss = []
    val_r2loss = []
    val_real_price_percentage_loss = []
    percentages = []
    training_mse_loss = []
    training_r2loss = []
    training_real_price_percentage_loss = []
    x_axis = []

    writer = SummaryWriter('runs/diamond')
    try:
        for epoch in range(epochs):
            running_loss = 0.
            running_r2loss = 0.
            print(f'Starting new batch {epoch+1}/{epochs}')
            for step, (inputs, labels) in enumerate(dataloader):
                y_pred = model(inputs)
                # Relative error of each sample, measured in real (de-normalised) prices.
                for pred, label in zip(y_pred, labels):
                    pr = get_real_price(pred, maxi, mini)
                    la = get_real_price(label, maxi, mini)
                    percentages.append(abs(pr-la)/la)
                l = loss(y_pred, labels)
                l.backward()
                optimizer.step()
                optimizer.zero_grad()
                running_loss += l.item()
                # The R2 metric is only monitored, so keep it out of autograd.
                with torch.no_grad():
                    batch_r2 = r2loss(y_pred, labels).item()
                if batch_r2 >= 1:
                    print('x >= 1')
                running_r2loss += batch_r2
                if (step+1) % log_every == 0:
                    global_step = epoch*len(dataloader) + step
                    mean_percentage = sum(percentages)/len(percentages)*100
                    mse_l, r2_l, percent_l = evaluate_model(model, valloader, maxi, mini, loss, r2loss)
                    val_mse_loss.append(mse_l)
                    val_r2loss.append(r2_l)
                    val_real_price_percentage_loss.append(percent_l)
                    training_mse_loss.append(running_loss / log_every)
                    training_r2loss.append(running_r2loss / log_every)
                    training_real_price_percentage_loss.append(mean_percentage)
                    x_axis.append(global_step)
                    writer.add_scalar('training_mse_loss', running_loss / log_every, global_step)
                    writer.add_scalar('training_real_price_percentage_loss', mean_percentage, global_step)
                    running_loss = 0.
                    running_r2loss = 0.
                    percentages = []
            checkpoint = {
                'epoch': epoch,
                'model_state': model.state_dict(),
                'optim_state': optimizer.state_dict(),
            }
            torch.save(checkpoint, 'checkpoint.pth')
            # to load:
            # loaded_checkpoint = torch.load('checkpoint.pth')
    finally:
        # Flush and close the TensorBoard log even if training is interrupted.
        writer.close()
    args = [x_axis, val_mse_loss, training_mse_loss, val_r2loss, training_r2loss, val_real_price_percentage_loss, training_real_price_percentage_loss]
    print(args)
    graph(*args)
    file = '2model.pth'
    torch.save(model.state_dict(), file)

def graph(x_axis, val_mse_loss, training_mse_loss, val_r2loss, training_r2loss, val_real_price_percentage_loss, training_real_price_percentage_loss):
    """Show one figure per metric with the validation and training curves.

    In every figure the validation series is drawn in blue and the training
    series in red, both against ``x_axis``. A heading naming the metric is
    printed before each figure is shown.

    Args:
        x_axis: step values shared by all curves.
        val_mse_loss, training_mse_loss: loss curves.
        val_r2loss, training_r2loss: R2 curves.
        val_real_price_percentage_loss, training_real_price_percentage_loss:
            relative real-price error curves.
    """
    panels = (
        ('mse loss:', val_mse_loss, training_mse_loss),
        ('r2 loss:', val_r2loss, training_r2loss),
        ('real price percentage loss:', val_real_price_percentage_loss, training_real_price_percentage_loss),
    )
    for heading, val_curve, train_curve in panels:
        fig, ax = plt.subplots()
        ax.plot(x_axis, val_curve, 'b')
        ax.plot(x_axis, train_curve, 'r')
        print(heading)
        plt.show()
        # Outside Jupyter, call fig.savefig(...) before plt.show() to keep a copy.
        # Close instead of merely clearing, so figures do not pile up in memory.
        plt.close(fig)


def load_model(data, file='1model.pth'):
    """Rebuild a ``NeuralNetwork`` and load saved weights into it.

    Args:
        data: feature rows; the length of ``data[0]`` sets the input size of
            the network.
        file: path of a state dict written with ``torch.save`` (default
            ``'1model.pth'``).

    Returns:
        The network with the stored weights loaded.
    """
    # The validation loader that used to be built here referenced the
    # undefined name ``val_set`` (NameError on every call) and was never used.
    loaded_model = NeuralNetwork(len(data[0]))
    loaded_model.load_state_dict(torch.load(file))
    return loaded_model


def evaluate_model(model, valloader, maxi, mini, loss, r2loss):
    """Evaluate ``model`` on every batch of ``valloader``.

    Args:
        model: network to evaluate; switched to eval mode for the duration and
            put back into train mode afterwards.
        valloader: iterable of ``(inputs, labels)`` batches.
        maxi, mini: bounds passed to ``get_real_price`` to undo the price
            normalisation.
        loss: callable ``loss(predictions, labels)`` returning a scalar tensor.
        r2loss: callable with the same signature, used as second metric.

    Returns:
        A tuple ``(mean loss per batch, mean r2loss per batch, mean relative
        real-price error per sample in percent)``.
    """
    percentages = []
    batch_losses = []
    batch_r2 = []
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in valloader:
                y_pred = model(inputs)
                for pred, label in zip(y_pred, labels):
                    pr = get_real_price(pred, maxi, mini)
                    la = get_real_price(label, maxi, mini)
                    percentages.append(abs(pr-la)/la)
                batch_losses.append(loss(y_pred, labels).item())
                batch_r2.append(r2loss(y_pred, labels).item())
    finally:
        # Restore training mode even when a batch raises.
        model.train()
    return sum(batch_losses)/len(batch_losses), sum(batch_r2)/len(batch_r2), sum(percentages)/len(percentages)*100

    # Graph test over training !
    # plot everything on the graph, accuracy, MSEloss, R^2loss, percentage_price%



# Not called by main(); kept as a quick stand-alone check of the real-price error.
def check_accuracy(loader, model, maxi, mini):
    """Print and return the mean absolute real-price error over ``loader``.

    Args:
        loader: iterable of ``(inputs, labels)`` batches of any batch size.
        model: network to evaluate; switched to eval mode for the duration and
            put back into train mode afterwards.
        maxi, mini: bounds passed to ``get_real_price`` to undo the price
            normalisation.

    Returns:
        The average of ``abs(true price - predicted price)`` over all samples.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    aver = []
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                resp = model(x)
                # Go sample by sample: calling .item() on a whole batch only
                # works for batch size 1 and raised for anything larger.
                for pred, label in zip(resp, y):
                    correct = get_real_price(label.item(), maxi, mini)
                    price = get_real_price(pred.item(), maxi, mini)
                    aver.append(abs(correct - price))
    finally:
        model.train()
    if not aver:
        raise ValueError('loader yielded no samples')
    mean_error = sum(aver)/len(aver)
    print(mean_error)
    return mean_error


# both robertas fully copied from https://huggingface.co/sentence-transformers/all-roberta-large-v1
def short_roberta(sentences):
    """Encode ``sentences`` with the all-roberta-large-v1 sentence transformer.

    Example input: ``["This is an example sentence", "Each sentence is converted"]``.
    The embeddings are printed and then returned.
    """
    encoder = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
    vectors = encoder.encode(sentences)
    print(vectors)
    return vectors


def long_roberta(sentences):
    """Embed ``sentences`` with all-roberta-large-v1 using the raw transformers API.

    Steps: tokenize, run the model without gradients, average the token
    embeddings under the attention mask, then L2-normalise each sentence vector.
    Example input: ``['This is an example sentence', 'Each sentence is converted']``.
    """
    checkpoint = 'sentence-transformers/all-roberta-large-v1'

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Mean pooling: weight every token by its attention mask so padding does
    # not contribute to the average.
    token_embeddings = model_output[0]  # first element holds all token embeddings
    mask = encoded_input['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)  # guards against division by zero
    pooled = masked_sum / token_counts

    # Normalize embeddings
    return f.normalize(pooled, p=2, dim=1)


def get_real_price(val, maxi, mini):
    """Map a min-max scaled log value back to the original scale.

    ``val`` is stretched from the unit range onto ``[mini, maxi]`` and the
    result is exponentiated.
    """
    span = maxi - mini
    log_value = span * val + mini
    return math.exp(log_value)


def normalize_data(data):
    """Turn the raw diamonds table into min-max scaled features and prices.

    Every row is read by position after its first column is dropped: carat,
    cut, colour, clarity, depth, table, price, x, y. Columns after y are not
    used.

    Feature layout of one output row (25 values):
        carat
        cut      one-hot, 5 classes (Fair .. Ideal)
        colour   one-hot, 7 classes (J .. D)
        clarity  one-hot, 8 classes (I1 .. IF)
        depth, table, x, y

    The price is not a feature; its natural logarithm is the target.

    Args:
        data: DataFrame with the column order described above.

    Returns:
        ``(normalized_data, normalized_prices, maxi, mini)``: the features and
        the log prices, each min-max scaled per column, followed by the
        largest and smallest log price (needed to undo the price scaling).

    Raises:
        KeyError: if a cut, colour or clarity value is not one of the known
            categories.
    """
    cut_index = {'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, 'Ideal':4}
    colour_index = {'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D':6}
    # clarity: (I1(worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF(best))
    clarity_index = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1':6, 'IF': 7}
    # The categorical columns sit at positions 1, 2 and 3 of a row, in this order.
    category_maps = (cut_index, colour_index, clarity_index)

    new_array = []
    prices = []
    for diamond in data.to_numpy():
        diamond = diamond[1:]  # drop the first column
        new_diamond = [diamond[0]]  # carat
        for position, mapping in enumerate(category_maps, start=1):
            one_hot = [0.] * len(mapping)
            one_hot[mapping[diamond[position]]] = 1.
            new_diamond.extend(one_hot)
        # depth, table, x, y (position 6 is the price and is handled below)
        new_diamond.extend(diamond[j] for j in (4, 5, 7, 8))
        new_array.append(new_diamond)
        prices.append(math.log(diamond[6]))  # price on a logarithmic scale

    maxi = max(prices)
    mini = min(prices)
    scaler = MinMaxScaler()
    data = pd.DataFrame(new_array)
    prices = pd.DataFrame(prices)
    normalized_data = scaler.fit_transform(data)
    normalized_prices = scaler.fit_transform(prices)

    return normalized_data, normalized_prices, maxi, mini


if __name__ == '__main__':
    # Settings forwarded unchanged to main() as keyword arguments.
    kwargs = dict(
        epochs=10,
        learning_rate=0.01,
        test_size=1000,  # 1000
        train_batch_size=10,
        validation_batch_size=512,
        num_workers=2,
        loss='nn.MSELoss()',
        optimizer='torch.optim.SGD(model.parameters(), lr=learning_rate)',
        data_factor=1,
    )
    main(**kwargs)
    #args = [[29, 59, 89], [0.04725663047283888, 0.04288289994001389, 0.03785799648612738], [0.03140254817903042, 0.01449822638183832, 0.013357452619820832], [0.21161172389984131, 0.3071813404560089, 0.3442371547222137], [-0.1830847430229187, 0.05015255331993103, 0.08432324945926667], [86.34664962859036, 78.04877220648744, 74.22272207896643], [67.15759899285841, 91.4195255019661, 84.78499436740307]]
    #graph(*args)

    # for jupyter:
    #   comment out saving of graph (and not model?).
    #   change reading of csv
    #   adjust start command from jupyter.


starting.


Prediction beforehand: 306.19880845178545		correct was: 2821.999999999999
Starting new batch 1/1
