In [None]:
import torch
import random
import os
import numpy as np
import pandas as pd

def setup_seed(seed):
    """Seed Python's `random`, NumPy's legacy global RNG, and PyTorch with `seed`.

    Args:
        seed: Integer seed handed unchanged to every seeding function.
    """
    # The seeders are independent of one another, so the call order is irrelevant.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Seed every RNG once, up front: the stratified split later in the notebook
# shuffles with the global `random` state, so it is only repeatable when the
# notebook is run top to bottom from a fresh kernel.
setup_seed(42)

In [None]:
import pandas as pd
import torch
def find_text(composition, description_dir='../description/', encoding='utf-8'):
    """Return the full contents of the description file for `composition`.

    The file is looked up as `<description_dir>/<composition>.txt`.

    Args:
        composition: File stem of the description to load.
        description_dir: Directory holding the `.txt` files. Defaults to the
            previously hard-coded '../description/'.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's locale default;
            pass another codec if the files are not UTF-8.

    Returns:
        The file's text as a single string.

    Raises:
        FileNotFoundError: if no description file exists for `composition`.
    """
    file_path = os.path.join(description_dir, composition + '.txt')
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()


# Integer class id assigned to each glass-forming category name.
labels = {'BMG': 0, 'Ribbon': 1, 'NR': 2}

# Composition table; the path is relative to the notebook's working directory.
# Later cells read its 'composition' and 'glass_forming_category' columns.
df = pd.read_csv('../unique_compositions.csv')

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each composition's tokenized description with its class id.

    Relies on module-level names that must exist when the class is
    instantiated: `labels` (category name -> int id), `find_text`, and
    `tokenizer` / `normalize`.
    NOTE(review): `tokenizer` and `normalize` are not defined in the visible
    cells -- confirm they are created before this class is used.
    """

    def __init__(self, df, max_length=900):
        """Load and tokenize the description of every row in `df` eagerly.

        Args:
            df: DataFrame with 'composition' and 'glass_forming_category' columns.
            max_length: Value forwarded to the tokenizer's `max_length`
                (used with padding='max_length' and truncation=True).
                Defaults to 900, the previously hard-coded value.

        Raises:
            KeyError: if a category value is not a key of `labels`.
        """
        self.labels = [torch.tensor(labels[label]) for label in df['glass_forming_category']]
        encodings = (
            tokenizer(normalize(find_text(composition)),
                      padding='max_length',
                      max_length=max_length,
                      truncation=True,
                      return_tensors="pt")
            for composition in df['composition']
        )
        # Convert straight to int64. This avoids the legacy float32
        # `torch.Tensor(...)` constructor, whose handling of integer input
        # varies by PyTorch version and which rounds ids above 2**24 when
        # given plain lists.
        self.texts = [
            {k: torch.as_tensor(v, dtype=torch.long) for k, v in enc.items()}
            for enc in encodings
        ]

    def classes(self):
        """Return the list of per-sample label tensors."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label stored at `idx` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the dict of tokenizer outputs (as long tensors) stored at `idx`."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the `(inputs, label)` pair for sample `idx`."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [None]:
from torch.utils.data import DataLoader, Subset

# get dataset
# dataset = Dataset(df)

# Group row *positions* by class id. Positions (not index labels) are collected
# because the split is consumed positionally -- by `df.iloc[...]` and by
# `Subset` if the commented-out lines are revived -- and labels only coincide
# with positions while `df` keeps its default RangeIndex.
# Rows whose category is not a key of `labels` end up in neither split.
class_indices = {
    label: np.flatnonzero((df['glass_forming_category'] == category).to_numpy()).tolist()
    for category, label in labels.items()
}

# ratio of train set and test set
train_ratio = 0.8  # fraction of each class assigned to the training set

train_indices = []
test_indices = []

# Stratified sampling for each category
for class_label, indices in class_indices.items():
    # Number of samples of the current category that go to the training set
    # (rounded down, so very small classes may contribute only test samples).
    class_size = len(indices)
    train_size = int(train_ratio * class_size)
    # Shuffle in place with the global `random` state, then cut at train_size.
    # Re-running this cell without re-seeding produces a different split.
    random.shuffle(indices)
    train_indices.extend(indices[:train_size])
    test_indices.extend(indices[train_size:])

# create train dataset and test dataset
# train_dataset = Subset(dataset, train_indices)
# test_dataset = Subset(dataset, test_indices)

In [None]:
# Materialize both splits as DataFrames by passing the index lists to `.iloc`,
# then report their sizes (train first, test second, one per line).
train_dataset, test_dataset = (
    df.iloc[rows] for rows in (train_indices, test_indices)
)
print(len(train_dataset), len(test_dataset), sep='\n')

In [None]:
# Persist both splits next to the notebook; the DataFrame index is not written.
train_dataset.to_csv(path_or_buf='train_dataset.csv', index=False)
test_dataset.to_csv(path_or_buf='test_dataset.csv', index=False)
