In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data.dataset import Dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, Trainer

In [None]:
# Load the full dataset from the Excel workbook in the Kaggle input directory.
# Later cells read the 'Text' column as model input and treat every other
# column of this frame as a label.
df = pd.read_excel('/kaggle/input/spsyhtdt/manipulated_data.xlsx')

In [None]:
# Columns holding model input rather than targets; these are never used as labels
not_chosen_columns = ['Text']

# Every other column, in its original order, is treated as a label column
label_columns = list(
    filter(lambda name: name not in not_chosen_columns, df.columns)
)

In [None]:
test_split = 0.2

# Hold out 20% of the rows; the remaining 80% form the training set.
# A fixed random_state makes the partition identical on every run, so results
# stay comparable after a kernel restart.
train_df, test_df = train_test_split(
    df,
    test_size=test_split,
    random_state=42,
)
print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in test set: {len(test_df)}")

In [None]:
test_split = 0.5

# Split the held-out rows in half: one half stays as the test set, the other
# becomes the evaluation set that is passed to the Trainer.
# A fixed random_state keeps the partition reproducible.
# NOTE: this rebinds test_df, so re-running this cell without re-running the
# previous split halves the held-out data again.
test_df, eval_df = train_test_split(
    test_df,
    test_size=test_split,
    random_state=42,
)
print(f"Number of rows in test set: {len(test_df)}")
print(f"Number of rows in evaluation set: {len(eval_df)}")

In [None]:
# Narrow the training and evaluation splits down to their label columns only
df_labels_train = train_df.loc[:, label_columns]
df_labels_eval = eval_df.loc[:, label_columns]

In [None]:
# Turn each label frame into a list of rows, one inner list of label values per example
labels_list_train = df_labels_train.to_numpy().tolist()
labels_list_eval = df_labels_eval.to_numpy().tolist()

In [None]:
# Cast every label value to a Python float, row by row
labels_list_train = [list(map(float, row)) for row in labels_list_train]
labels_list_eval = [list(map(float, row)) for row in labels_list_eval]

In [None]:
# Raw input strings for each split
train_texts = train_df.loc[:, 'Text'].tolist()
eval_texts = eval_df.loc[:, 'Text'].tolist()

# Matching label vectors prepared in the earlier cells
train_labels = labels_list_train
eval_labels = labels_list_eval

# Tokenizer published with the San-ALBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained('sampathlonka/San-ALBERT')

In [None]:
# Tokenize both splits the same way: truncate at 512 tokens and pad every
# sequence up to that same length.
train_encodings = tokenizer(
    train_texts,
    max_length=512,
    truncation=True,
    padding="max_length",
)
eval_encodings = tokenizer(
    eval_texts,
    max_length=512,
    truncation=True,
    padding="max_length",
)

In [None]:
class TextClassifierDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from feature name to a sequence indexable by
            example position.
        labels: Sequence of per-example labels; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with a "labels" entry."""
        sample = {}
        for feature_name, values in self.encodings.items():
            sample[feature_name] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and labels in a dataset the Trainer can index
train_dataset = TextClassifierDataset(
    encodings=train_encodings,
    labels=train_labels,
)
eval_dataset = TextClassifierDataset(
    encodings=eval_encodings,
    labels=eval_labels,
)

In [None]:
# Load the San-ALBERT checkpoint with a multi-label classification head.
# The head width is taken from label_columns so it always matches the length
# of the label vectors built from the DataFrame, instead of a hard-coded count
# that silently goes stale when the spreadsheet's columns change.
model = AutoModelForSequenceClassification.from_pretrained(
    "sampathlonka/San-ALBERT",
    problem_type="multi_label_classification",
    num_labels=len(label_columns),
)

In [None]:
# Training configuration: outputs go to the "Trail" directory, evaluation and
# checkpointing both run once per epoch, and at most two checkpoints are kept.
training_arguments = TrainingArguments(
    output_dir="Trail",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)

# Bundle the model, its tokenizer and both datasets into a Trainer
trainer = Trainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
from huggingface_hub import login

# Never store an access token in the notebook itself: read it from the
# HF_TOKEN environment variable instead. When the variable is unset, token is
# None and login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"), write_permission=True)

In [None]:
# Upload the trainer's model to the Hugging Face Hub; requires the login above.
trainer.push_to_hub()

In [None]:
# Load model directly
# NOTE(review): these assignments rebind the `tokenizer` and `model` names used
# by the training cells above. "Hemanth-Sai/Trail" is presumably the repository
# written by trainer.push_to_hub() — confirm.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Hemanth-Sai/Trail")
model = AutoModelForSequenceClassification.from_pretrained("Hemanth-Sai/Trail")

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# The classifier trained above has a multi-label head, but a text-classification
# pipeline returns only the single highest-scoring label by default.
# top_k=None makes it return the score of every label instead.
pipe = pipeline("text-classification", model="Hemanth-Sai/Trail", top_k=None)

results = pipe("नमो नमः")

results