In [77]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [78]:
from datasets import load_dataset

# Hub identifier of the Turkish sentiment-analysis dataset used throughout the notebook.
DATASET_ID = "winvoker/turkish-sentiment-analysis-dataset"
dataset = load_dataset(DATASET_ID)

# Display a single training example to see which fields a row contains.
train_split = dataset["train"]
train_split[100]

Using custom data configuration winvoker--turkish-sentiment-analysis-dataset-dbdc52d492aad425
Reusing dataset csv (/root/.cache/huggingface/datasets/winvoker___csv/winvoker--turkish-sentiment-analysis-dataset-dbdc52d492aad425/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

{'dataset': 'urun_yorumlari',
 'label': 'Positive',
 'text': 'telefonu alalı 1 yıl oldu nerdeyse sorunsuz kullanıyorum,hic bir şekilde yavaşlama olmadi,zaten saf android olduğu için aldım,saf andoid kullanıcıları için gayet ucuz ve kaliteli telefon,kesinlik general mobile olarak düşünmeyin bu telefon google ile birlikte yapıldı,ayrıca bu fiyata bu özellikte baska telefon yok ,tabsiye ederim'}

As you now know, you need a tokenizer to process the text, along with a padding and truncation strategy to handle variable sequence lengths. To process your dataset in one step, use the 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset:

In [79]:
# Build 1000-example subsets of the train and test splits.
# Each split is shuffled with its own fixed seed before the first 1000 rows are taken.
shuffled_train = dataset["train"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(1000))

shuffled_eval = dataset["test"].shuffle(seed=41)
small_eval_dataset = shuffled_eval.select(range(1000))

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/winvoker___csv/winvoker--turkish-sentiment-analysis-dataset-dbdc52d492aad425/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-c1ac4143b5a7c71a.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/winvoker___csv/winvoker--turkish-sentiment-analysis-dataset-dbdc52d492aad425/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-a583bcb13675f6f2.arrow


In [80]:
from transformers import AutoTokenizer
from datasets import ClassLabel

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

# Converts the dataset's string labels to integer ids (position in `names`).
labels = ClassLabel(names=["Positive","Notr","Negative"])


def preprocess_function(batch):
    """Tokenize a batch of examples and attach integer class ids.

    Args:
        batch: A batch of rows as passed by `Dataset.map(..., batched=True)`.
            Its "text" and "label" columns are read.

    Returns:
        The tokenizer output for the batch (truncated/padded to 256 tokens),
        with an added "label" entry holding the ids produced by `labels.str2int`.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    encoded["label"] = labels.str2int(batch["label"])
    return encoded


# Apply the preprocessing to both subsets, one batch of rows at a time.
preprocessed_small_train_dataset = small_train_dataset.map(
    preprocess_function,
    batched=True,
)
preprocessed_small_eval_dataset = small_eval_dataset.map(
    preprocess_function,
    batched=True,
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/winvoker___csv/winvoker--turkish-sentiment-analysis-dataset-dbdc52d492aad425/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-da9bf934f7a82a65.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

<a id='trainer'></a>

In [81]:
from transformers import DefaultDataCollator

# Collator used when batching the datasets below; configured to return TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")

In [82]:
def _to_tf_batches(split, shuffle):
    """Convert a preprocessed split into a batched TensorFlow dataset.

    Args:
        split: A preprocessed dataset exposing `to_tf_dataset`.
        shuffle: Whether the resulting dataset should be shuffled.

    Returns:
        The result of `split.to_tf_dataset(...)` with the model input columns,
        the label column, the shared `data_collator`, and a batch size of 12.
    """
    # NOTE(review): preprocessing stores the ids under "label" while "labels" is
    # requested here; presumably the collator exposes them as "labels" — confirm.
    return split.to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=12,
    )


# Only the training data is shuffled; validation keeps its order.
tf_train_dataset = _to_tf_batches(preprocessed_small_train_dataset, shuffle=True)

tf_validation_dataset = _to_tf_batches(preprocessed_small_eval_dataset, shuffle=False)

In [83]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Class id -> class name, in the same order as the names given to `ClassLabel`
# during preprocessing.
id2label = {0: "Positive", 1: "Notr", 2: "Negative"}
# Inverse mapping, passed as well so the model config's name -> id lookup
# agrees with `id2label`.
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained Turkish BERT checkpoint with a 3-way classification head.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-cased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
# Training configuration: Adam optimizer, sparse categorical cross-entropy computed
# from logits (labels are integer class ids), and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=accuracy)

# Train for 3 epochs, evaluating on the validation subset during training.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fbe78a97950>

<a id='pytorch_native'></a>

In [85]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline
# so raw strings can be classified directly.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

In [86]:
# Turkish for: "A very nice product, I liked it a lot"
classifier("Çok güzel bir ürün çok beğendim")

[{'label': 'Positive', 'score': 0.996821403503418}]

In [87]:
# Turkish for: "A terrible product, I did not like it at all"
classifier("Berbat bir ürün hiç beğenmedim")

[{'label': 'Negative', 'score': 0.9532924890518188}]

In [90]:
# Turkish for: "a random sentence" — an input with no obvious sentiment.
classifier("rastgele bir cümle")

[{'label': 'Positive', 'score': 0.861530601978302}]