In [6]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [7]:
# 1. Prepare the dataset
# 2. Load a pretrained tokenizer and call it on the dataset -> encodings
# 3. Build a PyTorch Dataset from the encodings
# 4. Load a pretrained model
# 5. a) Load a Trainer and train with it
#    b) or use a native PyTorch training pipeline

In [7]:
import os

# Send both HTTP and HTTPS traffic from this kernel through the same proxy;
# the model downloads in the following cells go over the network.
# NOTE(review): the proxy address is hard-coded — confirm it is valid wherever
# this notebook is run, or move it into configuration.
os.environ.update(
    dict.fromkeys(('http_proxy', 'https_proxy'), 'http://192.41.170.23:3128')
)

In [9]:
# Checkpoint identifier; as the name indicates, a DistilBERT model already
# fine-tuned for English sentiment classification (SST-2).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the sequence-classification model for that checkpoint.
# The first run downloads the files (see the progress bars in the output).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# classifier = pipeline("sentiment-analysis", model = model_name)
# results = classifier(["We are very happy to show tou the ☺️ face transformers library","We hope you dont hate it"])    
# for result in results:
#     print(result)

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

{'label': 'POSITIVE', 'score': 0.9996298551559448}
{'label': 'NEGATIVE', 'score': 0.9600156545639038}


In [10]:
# High-level route: a ready-made sentiment pipeline built from the checkpoint
# named in `model_name`.
classifier = pipeline(task="sentiment-analysis", model=model_name)

# Classify two sentences in a single call; one result is returned per sentence.
results = classifier(
    [
        "We are very happy to show tou the ☺️ face transformers library",
        "We hope you dont hate it",
    ]
)

# Each result is printed on its own line (label and score, per the output).
for result in results:
    print(result)

{'label': 'POSITIVE', 'score': 0.9996298551559448}
{'label': 'NEGATIVE', 'score': 0.9600156545639038}


In [11]:
# One example sentence, processed three ways so the results can be compared.
sentence = "We are very happy to show you the ☺️ Transformers library"

# Step 1: split the text into vocabulary tokens.
tokens = tokenizer.tokenize(sentence)
# Step 2: map every token to its integer vocabulary id.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Calling the tokenizer directly performs both steps and returns the full
# encoding: the ids wrapped in the special start/end ids plus an
# attention mask. The variable keeps its original name `input_ids`
# even though it holds the whole encoding, not just the id list.
input_ids = tokenizer(sentence)

# Each line is labelled with what it actually shows (previously all three
# lines were labelled "Tokens").
print(f'Tokens    :{tokens}')
print(f'Token IDs :{token_ids}')
print(f'Encoding  :{input_ids}')

Tokens :['we', 'are', 'very', 'happy', 'to', 'show', 'you', 'the', '[UNK]', 'transformers', 'library']
Tokens :[2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075]
Tokens :{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
# Two raw sentences used as a toy batch.
X_train = [
    "We are very happy to show tou the ☺️ face transformers library",
    "We hope you dont hate it",
]

# Encode both sentences in one call: the shorter one is padded to the length
# of the longer, anything beyond 512 tokens is truncated, and the result is
# returned as PyTorch tensors.
batch = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(batch)

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2000,  2226,  1996,
           100,  2227, 19081,  3075,   102],
        [  101,  2057,  3246,  2017,  2123,  2102,  5223,  2009,   102,     0,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}


In [15]:
# Inference only, so gradient tracking is disabled for the whole block.
with torch.no_grad():
    # Unpack the encoded batch into keyword arguments. Supplying reference
    # labels (1 for the first sentence, 0 for the second) makes the model
    # return a loss alongside the logits.
    outputs = model(**batch, labels=torch.tensor([1, 0]))
    print(outputs)

    # Normalise each row of logits into class probabilities.
    predictions = outputs.logits.softmax(dim=1)
    print(predictions)

    # Index of the most probable class for every sentence.
    label_ids = predictions.argmax(dim=1)
    print(label_ids)

    # Translate class indices into the label names stored in the model config.
    labels = [model.config.id2label[idx] for idx in label_ids.tolist()]
    print(labels)

SequenceClassifierOutput(loss=tensor(0.0206), logits=tensor([[-3.8563,  4.0451],
        [ 1.6841, -1.4944]]), hidden_states=None, attentions=None)
tensor([[3.7010e-04, 9.9963e-01],
        [9.6002e-01, 3.9984e-02]])
tensor([1, 0])
['POSITIVE', 'NEGATIVE']


In [None]:
# After fine-tuning, save the tokenizer and model, then reload them from disk
# save_directory = "saved"
# tokenizer.save_pretrained(save_directory)
# model.save_pretrained(save_directory)

# tokenizer = AutoTokenizer.from_pretrained(save_directory)
# model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [23]:
# Switch to a German sentiment checkpoint. This rebinds `model_name`,
# `tokenizer` and `model`, replacing the English ones loaded earlier.
model_name = "oliverguhr/german-sentiment-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Sample German sentences to classify.
X_train_german = [
    "Mit keinem guten Ergebniss",
    "Das ist gar nicht mal so gut",
    "Total awesome!",
    "nicht so schlecht wie erwartet",
    "Der Test verlief positiv.",
    "Sie fährt ein grünes Auto.",
]

# Encode all sentences as one padded batch of PyTorch tensors.
batch = tokenizer(
    X_train_german,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(batch)

# Forward pass without gradient tracking; no reference labels are passed here.
with torch.no_grad():
    outputs = model(**batch)

    # Highest-scoring class index for each sentence.
    label_ids = outputs.logits.argmax(dim=1)
    print(label_ids)

    # Class indices -> label names from the model config.
    labels = [model.config.id2label[idx] for idx in label_ids.tolist()]
    print(labels)

{'input_ids': tensor([[    3,   304,  8524,  5569,  2011, 26902,     4,     0,     0],
        [    3,   295,   127,  2523,   149,  2723,   181,  1522,     4],
        [    3, 19990,    18,  7117,  4741, 26982,     4,     0,     0],
        [    3,   149,   181,  6975,   246,  6303,     4,     0,     0],
        [    3,   233,  4496, 14332,  8453, 26914,     4,     0,     0],
        [    3,   371,  9755,    39, 19044, 26902,  3512, 26914,     4]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([1, 1, 0, 0, 2, 2])
['negative', 'negative', 'positive', 'positive', 'neutral', 'neutral']

#### Manual

In [3]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import Trainer, TrainingArguments

In [4]:
# 1. Prepare the dataset
# 2. Load a pretrained tokenizer and call it on the dataset -> encodings
# 3. Build a PyTorch Dataset from the encodings
# 4. Load a pretrained model
# 5. a) Load a Trainer and train with it
#    b) or use a native PyTorch training pipeline

In [6]:
model_name = "distilbert-base-uncased"

def read_imdb_split(split_dir):
    """Read one split of a sentiment dataset laid out as ``pos/`` and ``neg/`` folders.

    Args:
        split_dir: Path (``str`` or ``Path``) of the split directory. It must
            contain the sub-directories ``pos`` and ``neg``, each holding one
            text file per review.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``texts`` holds the
        file contents; ``labels`` holds ``1`` for files under ``pos`` and ``0``
        for files under ``neg``. All ``pos`` entries come before ``neg`` ones.

    Raises:
        FileNotFoundError: if ``pos`` or ``neg`` is missing under ``split_dir``.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        label = 0 if label_dir == "neg" else 1
        # Join the two path components with `/`. The previous code built a
        # tuple `(split_dir, label_dir)` and called `.iterdir()` on it, which
        # raises AttributeError.
        # Sorting gives a stable order; `iterdir()` yields in arbitrary order.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default encoding.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)

    return texts, labels

# Large Movie Review Dataset (IMDb)
# http://ai.stanford.edu/~amaas/data/sentiment/

# train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size = 0.2)

# #class IMDbDataset():
# #.
# #........



# tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)


# train_encodings = tokenizer(train_texts, truncation = True, padding = True)
# val_encodings = tokenizer(val_texts, truncation = True, padding = True)
# test_encodings = tokenizer(test_texts, truncation = True, padding = True)


# train_dataset = IMDbDataset(train_encodings, train_labels)
# val_dataset = IMDbDataset(val_encodings, val_labels)
# test_dataset = IMDbDataset(test_encodings, test_labels)

# training_args = TrainingArguments (
    

    
# )

In [5]:
# from torch.utils.data import DataLoader
# from transformers import AdamW

# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
# model.to(device)
# model.train()

# train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)

# optim = AdamW(model.parameters(), lr = 5e-5)

# num.........




# outputs = model(input_ids, attention_mask = attention_mask, labels = labels)

#https://www.youtube.com/watch?v=GSt00_-0ncQ