In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
!pip install keras



In [3]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.optimizers import Adam


def clean_text(text):
    text = re.sub(' ', '', text)  
    text = re.sub(r'http\S+', '', text)  
    stop_words = set(stopwords.words('english'))  
    punctuation = list(string.punctuation)
    stop_words.update(punctuation)  
    text = " ".join(word for word in text.split() if word.lower() not in stop_words)
    return text


data_path = '/kaggle/input/fake-or-real-news/fake_or_real_news.csv'  
data = pd.read_csv(data_path)


data['text'] = data['text'].apply(clean_text)


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(sequences, maxlen=500)


label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])
y = y.reshape(-1, 1)


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


vocab_size = 5000
embedding_dim = 400
hidden_dim = 256
output_dim = 1

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))  
model.add(LSTM(hidden_dim, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(hidden_dim, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(output_dim, activation='sigmoid'))

2024-03-25 22:48:48.278494: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-25 22:48:48.278622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-25 22:48:48.406870: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
from keras.callbacks import ModelCheckpoint

model_checkpoint_path = '/content/drive/MyDrive/lab1_mlops_weights/model.keras'
checkpoint = ModelCheckpoint(model_checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val), callbacks=[checkpoint])

Epoch 1/3
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.6875 - loss: 0.5568
Epoch 1: val_loss improved from inf to 0.27781, saving model to /content/drive/MyDrive/lab1_mlops_weights/model.keras
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 1s/step - accuracy: 0.6881 - loss: 0.5561 - val_accuracy: 0.8745 - val_loss: 0.2778
Epoch 2/3
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9162 - loss: 0.2116
Epoch 2: val_loss improved from 0.27781 to 0.24355, saving model to /content/drive/MyDrive/lab1_mlops_weights/model.keras
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 1s/step - accuracy: 0.9162 - loss: 0.2115 - val_accuracy: 0.8998 - val_loss: 0.2435
Epoch 3/3
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9443 - loss: 0.1369
Epoch 3: val_loss did not improve from 0.24355
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x78491d647580>

In [5]:
loss, accuracy = model.evaluate(X_val, y_val)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 316ms/step - accuracy: 0.8936 - loss: 0.2629
Validation Loss: 0.26683464646339417
Validation Accuracy: 0.8887134790420532


In [6]:
!pip install optuna



In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import optuna
import numpy as np
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 500  
(X_train, y_train), (X_test, y_test) = reuters.load_data(path="reuters.npz")
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, n_hidden, n_units, dropout_rate):
        super(NeuralNetwork, self).__init__()
        layers = [nn.Linear(input_dim, n_units), nn.ReLU(), nn.Dropout(dropout_rate)]

        for _ in range(n_hidden):
            layers += [nn.Linear(n_units, n_units), nn.ReLU(), nn.Dropout(dropout_rate)]

        layers += [nn.Linear(n_units, output_dim)]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        x = x.view(x.size(0), -1)  
        logits = self.network(x)
        return logits

def create_model(trial, input_dim, output_dim):
    n_hidden = trial.suggest_int('n_hidden', 2, 5)  
    n_units = trial.suggest_int('n_units', 64, 256)  
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)  
    model = NeuralNetwork(input_dim, output_dim, n_hidden, n_units, dropout_rate)
    return model

def objective(trial):
    model = create_model(trial, X_train.shape[1], len(np.unique(y_train)))
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(10):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            output = model(X_batch)
            _, predicted = torch.max(output.data, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    accuracy = correct / total
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, n_jobs=-1)  

print(study.best_params)
best_model = create_model(study.best_trial, X_train.shape[1], len(np.unique(y_train)))
best_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        output = best_model(X_batch)
        _, predicted = torch.max(output.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

accuracy = correct / total
print(f"Accuracy of the best model: {accuracy}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


[I 2024-03-25 23:00:45,065] A new study created in memory with name: no-name-d881c0b4-e52e-49e4-9c64-015bb7abc659
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
[I 2024-03-25 23:01:34,875] Trial 2 finished with value: 0.3619768477292965 and parameters: {'n_hidden': 2, 'n_units': 185, 'dropout_rate': 0.1629614248640175, 'learning_rate': 0.005055637749419858}. Best is trial 2 with value: 0.3619768477292965.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
[I 2024-03-25 23:01:45,734] Trial 3 finished with value: 0.3619768477292965 and parameters: {'n_hidden': 3, 'n_units': 162, 'dropout_rate': 0.2834695340624026, 'learning_rate': 0.011412376447596868}. Best is trial 2 with value: 0.3619768477292965.
[I 2024-03-25 23:02:02,487] Trial 0 finished with value: 0.37934105075690117 and parameters: {'n_hidden': 5, 'n_units': 210, 'dropout_rate': 0.3320325732628582, 'learning_rate': 3.583670240374301e-05}. Best is trial 0 with value: 0.37934105075690

{'n_hidden': 3, 'n_units': 243, 'dropout_rate': 0.1557973179987776, 'learning_rate': 0.0004761334972841304}
Accuracy of the best model: 0.020480854853072127
