In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
# Wczytanie danych
data = pd.read_csv("/kaggle/input/reddit-mental-health-data/data_to_be_cleansed.csv")

# Zakładamy, że kolumny to: "#", "title", "text", "target"
# Kolumna "#" jest zbędna, więc ją ignorujemy
print(data.columns)
data = data.rename(columns={"Unnamed: 0":"#"})
data = data.drop(columns=['#'])
print(data.columns)

Index(['Unnamed: 0', 'text', 'title', 'target'], dtype='object')
Index(['text', 'title', 'target'], dtype='object')


In [None]:
# Łączenie 'title' i 'text' w jeden ciąg wejściowy
data["combined"] = data["title"].fillna("") + " " + data["text"].fillna("")

# Podział na zbiory (train, val, test)
print("[INFO] Dzielenie danych na zbiory...")
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['target'])
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42, stratify=train_data['target'])

print(f"Rozmiar zbioru treningowego: {len(train_data)}")
print(f"Rozmiar zbioru walidacyjnego: {len(val_data)}")
print(f"Rozmiar zbioru testowego: {len(test_data)}")

# Przygotowanie wektorów TF-IDF
print("[INFO] Dopasowywanie TfidfVectorizer na zbiorze treningowym...")
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))  # można dostosować parametry

X_train = vectorizer.fit_transform(train_data["combined"])
y_train = train_data["target"]

X_val = vectorizer.transform(val_data["combined"])
y_val = val_data["target"]

X_test = vectorizer.transform(test_data["combined"])
y_test = test_data["target"]

# Trening modelu LogisticRegression
print("[INFO] Trening modelu LogisticRegression...")
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Ewaluacja na zbiorze walidacyjnym
print("[INFO] Ewaluacja na zbiorze walidacyjnym...")
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
val_f1 = f1_score(y_val, val_preds, average='weighted')
print(f"Walidacja - Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")

# Ewaluacja na zbiorze testowym
print("[INFO] Ewaluacja na zbiorze testowym...")
test_preds = model.predict(X_test)
test_acc = accuracy_score(y_test, test_preds)
test_f1 = f1_score(y_test, test_preds, average='weighted')
print(f"Test - Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")

print("[TEST] Szczegółowy raport klasyfikacji:")
print(classification_report(y_test, test_preds, target_names=["Stress","Depression","Bipolar","Personality","Anxiety"]))

print("[INFO] Proces zakończony pomyślnie.")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

# ********************** ETAP 1: WCZYTANIE DANYCH **********************

print("[INFO] Wczytywanie danych EmoBank...")
data = pd.read_csv("/kaggle/input/emobank/emobank.csv")  # Zakładamy, że plik jest TSV

# Filtrujemy wiersze z pustym tekstem
data = data.dropna(subset=["text"])

# Podział wg split
train_data = data[data["split"] == "train"]
val_data = data[data["split"] == "dev"]
test_data = data[data["split"] == "test"]

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

# ********************** ETAP 2: WEKTORYZACJA TEKSTU **********************

print("[INFO] Wektoryzacja tekstu za pomocą TF-IDF...")
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_data["text"])
X_val = vectorizer.transform(val_data["text"])
X_test = vectorizer.transform(test_data["text"])

y_train = train_data[["V", "A", "D"]].values
y_val = val_data[["V", "A", "D"]].values
y_test = test_data[["V", "A", "D"]].values

# Konwersja do formatu odpowiedniego dla PyTorch
X_train = X_train.toarray()
X_val = X_val.toarray()
X_test = X_test.toarray()

y_train = y_train.astype(np.float32)
y_val = y_val.astype(np.float32)
y_test = y_test.astype(np.float32)

# ********************** ETAP 3: DEFINICJA DATASET I DATALOADER **********************

class EmoBankDataset(Dataset):
    def __init__(self, features, targets):
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Tworzenie zestawów danych
train_dataset = EmoBankDataset(X_train, y_train)
val_dataset = EmoBankDataset(X_val, y_val)
test_dataset = EmoBankDataset(X_test, y_test)

# Tworzenie DataLoaderów
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("[INFO] Zestawy danych przygotowane.")

# ********************** ETAP 4: DEFINICJA MODEL **********************

class EmotionRegressor(nn.Module):
    def __init__(self, input_dim):
        super(EmotionRegressor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 3)  # Wyjście: V, A, D
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

input_dim = X_train.shape[1]
model = EmotionRegressor(input_dim)

# Sprawdzenie, czy GPU jest dostępne
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"[INFO] Używane urządzenie: {device}")

# ********************** ETAP 5: DEFINICJA STRATY I OPTYMIZATORA **********************

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ********************** ETAP 6: TRENING MODEL **********************

num_epochs = 20

print("[INFO] Rozpoczynanie treningu...")
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if (batch_idx + 1) % 100 == 0 or (batch_idx + 1) == len(train_loader):
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")
    
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Średnia strata: {avg_loss:.4f}")
    
    # Walidacja po każdej epoce
    model.eval()
    val_losses = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_losses.append(loss.item())
    avg_val_loss = np.mean(val_losses)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Strata walidacyjna: {avg_val_loss:.4f}\n")

print("[INFO] Trening zakończony.")

# ********************** ETAP 7: EWALUACJA NA ZBIORZE TESTOWYM **********************

print("[INFO] Ewaluacja na zbiorze testowym...")
model.eval()
test_predictions = []
test_targets = []
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        test_predictions.append(outputs.cpu().numpy())
        test_targets.append(targets.numpy())

test_predictions = np.vstack(test_predictions)
test_targets = np.vstack(test_targets)

# Obliczanie MSE dla każdego wymiaru
mse_v = mean_squared_error(test_targets[:, 0], test_predictions[:, 0])
mse_a = mean_squared_error(test_targets[:, 1], test_predictions[:, 1])
mse_d = mean_squared_error(test_targets[:, 2], test_predictions[:, 2])

print(f"Test MSE - Valence (V): {mse_v:.4f}")
print(f"Test MSE - Arousal (A): {mse_a:.4f}")
print(f"Test MSE - Dominance (D): {mse_d:.4f}")

# ********************** ETAP 8: ZAPIS MODELU I WEKTORYZERU **********************

import joblib
import os

model_dir = "trained_emobank_model"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Zapis modelu
torch.save(model.state_dict(), os.path.join(model_dir, "emotion_regressor.pth"))
print(f"[INFO] Model zapisany w {os.path.join(model_dir, 'emotion_regressor.pth')}")

# Zapis wektoryzera
joblib.dump(vectorizer, os.path.join(model_dir, "tfidf_vectorizer.pkl"))
print(f"[INFO] Wektoryzer zapisany w {os.path.join(model_dir, 'tfidf_vectorizer.pkl')}")

print("[INFO] Proces zakończony pomyślnie.")


[INFO] Wczytywanie danych EmoBank...
Train: 8062, Val: 999, Test: 1000
[INFO] Wektoryzacja tekstu za pomocą TF-IDF...
[INFO] Zestawy danych przygotowane.
[INFO] Używane urządzenie: cuda
[INFO] Rozpoczynanie treningu...
Epoch [1/20], Batch [100/252], Loss: 0.3551
Epoch [1/20], Batch [200/252], Loss: 0.1128
Epoch [1/20], Batch [252/252], Loss: 0.1106
Epoch [1/20] - Średnia strata: 1.7269
Epoch [1/20] - Strata walidacyjna: 0.1052

Epoch [2/20], Batch [100/252], Loss: 0.0968
Epoch [2/20], Batch [200/252], Loss: 0.1172
Epoch [2/20], Batch [252/252], Loss: 0.0638
Epoch [2/20] - Średnia strata: 0.0829
Epoch [2/20] - Strata walidacyjna: 0.0888

Epoch [3/20], Batch [100/252], Loss: 0.0615
Epoch [3/20], Batch [200/252], Loss: 0.0819
Epoch [3/20], Batch [252/252], Loss: 0.0599
Epoch [3/20] - Średnia strata: 0.0663
Epoch [3/20] - Strata walidacyjna: 0.0835

Epoch [4/20], Batch [100/252], Loss: 0.0448
Epoch [4/20], Batch [200/252], Loss: 0.0606
Epoch [4/20], Batch [252/252], Loss: 0.0888
Epoch [4/2