In [49]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AdamW, get_scheduler

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [61]:
# importing panda library 
import pandas as pd 
import os
import fnmatch
directory = "C:\\Users\\larsl\\OneDrive\\Desktop\\DataScience\\FaPraNLP\\Testdaten\\"
style = ['APA', 'MLA', 'Havard', 'ACM', 'IEEE']
dfStyleAll = pd.DataFrame(columns=['text', 'label', 'typ'])
dfStyleAll_complemantary = pd.DataFrame(columns=['text', 'label', 'typ'])
dfStyleBuffer = pd.DataFrame()

for styleItem in style:
    bufferList = [n for n in style if n != styleItem]
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if fnmatch.fnmatch(filename, '*' + styleItem + '.txt'):
            dfStyleBuffer = pd.read_csv(f, sep=';') 
            dfStyleBuffer = dfStyleBuffer.rename(columns={"style": "label"})
            dfStyleAll = pd.concat([dfStyleAll, dfStyleBuffer])
        else:
            for bufferStyle in bufferList:
                if fnmatch.fnmatch(filename, '*' + bufferStyle + '.txt'):
                    dfStyleBuffer = pd.read_csv(f, sep=';') 
                    dfStyleBuffer = dfStyleBuffer.rename(columns={"style": "label"})
                    dfStyleAll_complemantary = pd.concat([dfStyleAll_complemantary, dfStyleBuffer])
    dfStyleAll['label'] = 1
    dfStyleAll.to_csv(styleItem + '_all.csv', index=False) 
    dfStyleAll_complemantary['label'] = 0
    dfStyleAll_complemantary.to_csv('not' + styleItem + '_all.csv', index=False) 
    dfStyleAll = pd.DataFrame(columns=['text', 'label', 'typ'])
    dfStyleAll_complemantary = pd.DataFrame(columns=['text', 'label', 'typ'])

    

dfBookAll = pd.concat([pd.read_csv('Havard_all.csv'), pd.read_csv('notHavard_all.csv')]) 
print(dfBookAll)


                                                  text  label      typ
0    Yann LeCun 2010. Compilers: Principles, Techni...      1  article
1    Grace Hopper, Alan Turing, Douglas Engelbart, ...      1  article
2    Douglas Engelbart, Michael Jordan, Bjarne Stro...      1  article
3    Donald Davies, John Backus, and Donald E. Knut...      1  article
4    Andrew Ng, and Claude Shannon 2022. Bayesian D...      1  article
..                                                 ...    ...      ...
525  Grace Hopper, , Edsger W. Dijkstra. Algorithm ...      0     book
526  Sebastian Thrun, , Yukihiro Matsumoto, James G...      0     book
527  Alan Turing, , Yann LeCun, Niklaus Wirth, Vint...      0     book
528  John von Neumann, , Niklaus Wirth, Donald E. K...      0     book
529  Jane Smith, , Dennis Ritchie, Bjarne Stroustru...      0     book

[665 rows x 3 columns]


In [62]:
data = {
    'label': [],
    'text': []
}

data['label'] = dfBookAll['label'].tolist()
data['text'] = dfBookAll['text'].tolist()

In [63]:
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)



dfData = pd.DataFrame(data)
dataset = Dataset.from_pandas(dfData)

split_dataset = dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

#Die Funktion map wendet die preprocess_function auf jedes Element des Datasets an. 
#Deine preprocess_function führt die Tokenisierung der text-Spalte durch. 
#Da die map-Funktion das resultierende Objekt der Funktion (return der preprocess_function) zu den bestehenden Daten hinzufügt, 
#bleiben die ursprünglichen Spalten (label und text) erhalten. 
#Zusätzlich werden neue Spalten für die tokenisierten Daten hinzugefügt, wie z.B. input_ids und attention_mask.

tokenized_train_data = train_dataset.map(preprocess_function, batched=True)
tokenized_val_data = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/532 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

In [64]:
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    pre = precision_score(labels, preds)
    rec = recall_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "recall": rec, "precession": pre}

In [65]:
id2label = {0: "NONHAVARD", 1: "HAVARD"}
label2id = {"NONHAVARD": 0, "HAVARD": 1}

In [66]:
from transformers import TrainingArguments, Trainer, AutoConfig

config = AutoConfig.from_pretrained(model_ckpt, num_labels=2, label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

training_args = TrainingArguments(
    output_dir="HAVARD_recognizer",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precession
1,No log,0.255613,0.759398,0.655549,0.0,0.0
2,No log,0.018097,1.0,1.0,1.0,1.0
3,No log,0.003124,1.0,1.0,1.0,1.0
4,No log,0.00178,1.0,1.0,1.0,1.0
5,No log,0.00124,1.0,1.0,1.0,1.0
6,No log,0.000954,1.0,1.0,1.0,1.0
7,No log,0.000759,1.0,1.0,1.0,1.0
8,No log,0.000627,1.0,1.0,1.0,1.0
9,No log,0.000524,1.0,1.0,1.0,1.0
10,No log,0.000451,1.0,1.0,1.0,1.0


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=680, training_loss=0.027526181311730076, metrics={'train_runtime': 268.2068, 'train_samples_per_second': 39.671, 'train_steps_per_second': 2.535, 'total_flos': 140670028357152.0, 'train_loss': 0.027526181311730076, 'epoch': 20.0})

In [71]:
from transformers import pipeline
text="Chen, X., and Liang, J. (2024). Pair Programming with ChatGPT. In Proceedings of the 55th ACM Technical Symposium on Computer Science Education V. 2 (pp. 1600–1601). Association for Computing Machinery."
classifier = pipeline("text-classification", model="ACM_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="APA_recognizer")
print(classifier(text))
classifier = pipeline("text-classification", model="HAVARD_recognizer")
print(classifier(text))

[{'label': 'NONACM', 'score': 0.9988192915916443}]
[{'label': 'APA', 'score': 0.9991631507873535}]
[{'label': 'NONHAVARD', 'score': 0.9997310042381287}]
