In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score, roc_curve, auc)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Dense, Dropout, BatchNormalization, Embedding, Flatten, Input, Concatenate)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau





In [10]:
from transformers import TFDistilBertModel, DistilBertTokenizer

# Načtení modelu a tokenizéru
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = TFDistilBertModel.from_pretrained(model_name)

# Funkce pro získání embeddingů v dávkách
def get_distilbert_embeddings(texts, batch_size=16, max_length=128):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='tf', max_length=max_length)
        outputs = model(**inputs)
        batch_embeddings = np.mean(outputs.last_hidden_state.numpy(), axis=1)  # Průměr posledních skrytých stavů
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

# Funkce pro načtení a předzpracování dat
def preprocess_data(df):
    titles = df['title'].tolist()
    X_title_distilbert = get_distilbert_embeddings(titles, batch_size=8)
    return X_title_distilbert

# Načtení a sloučení dat
def load_and_merge_data(train_file, test_file, eval_file):
    train_df = pd.read_csv(train_file, sep=';')
    test_df = pd.read_csv(test_file, sep=';')
    eval_df = pd.read_csv(eval_file, sep=';')
    df_all = pd.concat([train_df, test_df, eval_df], ignore_index=True)
    return df_all

# Vytvoření modelu s fine-tuningem
def create_model_with_finetuning(input_shape, num_classes):
    input_layer = tf.keras.layers.Input(shape=(input_shape,), dtype=tf.float32, name="input_layer")
    x = tf.keras.layers.Dense(128, activation='relu')(input_layer)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    
    model = tf.keras.models.Model(inputs=input_layer, outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Trénování modelu
def train_model(df_all):
    X = preprocess_data(df_all)
    Y = df_all['label'].astype(int).values  # Převedení labelů na čísla
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    
    model = create_model_with_finetuning(X_train.shape[1], len(set(Y)))
    
    early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=3, verbose=1)
    
    model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2, callbacks=[early_stopping, reduce_lr])
    
    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy on test data: {accuracy:.4f}')

# Načtení a trénování modelu
df_all = load_and_merge_data('train (2).csv', 'test (1).csv', 'evaluation.csv')
train_model(df_all)


Epoch 1/10
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 13ms/step - accuracy: 0.8351 - loss: 0.3883 - val_accuracy: 0.9215 - val_loss: 0.2012 - learning_rate: 5.0000e-05
Epoch 2/10
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9270 - loss: 0.1917 - val_accuracy: 0.9293 - val_loss: 0.1781 - learning_rate: 5.0000e-05
Epoch 3/10
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9369 - loss: 0.1670 - val_accuracy: 0.9356 - val_loss: 0.1656 - learning_rate: 5.0000e-05
Epoch 4/10
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9408 - loss: 0.1557 - val_accuracy: 0.9415 - val_loss: 0.1568 - learning_rate: 5.0000e-05
Epoch 5/10
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9441 - loss: 0.1468 - val_accuracy: 0.9413 - val_loss: 0.1525 - learning_rate: 5.0000e-05
Epoch 6/10
[1m1624/1624[0m [32m━━━━