In [2]:
import sys
sys.path.append('../scripts')  
from models import *
from df_handling import *
from search import *
import pandas as pd
from harvester import *
from text_embedding import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
import random
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
writer  = SummaryWriter(log_dir='new_runs_4.9')


In [3]:
#setting seeds for the classification task
seed=42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

# torch.backends.cudnn.deterministic = True


In [4]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float64)  # Convert arrays to PyTorch tensors
        self.y = torch.tensor(y, dtype=torch.float64)    # Assuming y contains integer labels

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def count_layers(model):
    """Count the number of layers in a PyTorch model."""
    return sum(1 for _ in model.children())

In [114]:
#neural network for n=3, model 5
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(6144+768*3,4608).double()
        self.dropout1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(4608,3072).double()
        self.dropout2 = nn.Dropout(p=0.2)
        self.fc3 = nn.Linear(3072,1536).double()
        self.dropout3 = nn.Dropout(p=0.2)
        self.fc4 = nn.Linear(1536,768).double()
        self.dropout4 = nn.Dropout(p=0.2)
        self.fc5 = nn.Linear(768,384).double()
        self.dropout5 = nn.Dropout(p=0.2)
        self.fc6 = nn.Linear(384,2).double()
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout1(x)
        x = self.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.relu(self.fc3(x))
        x = self.dropout3(x)
        x = self.relu(self.fc4(x))
        x = self.dropout4(x)
        x = self.relu(self.fc5(x))
        x = self.dropout5(x)
        x = self.fc6(x)
        return x

device  = ("cuda" if torch.cuda.is_available() else "cpu")   
model = NeuralNetwork()
model.to(device)

#store the number of parameters for the model
num_parameters = count_parameters(model)
print(num_parameters)

59288450


In [119]:
# n=1 feature classification
num_epochs = 10
batch_size = 128 
num_folds = 10
best_val_loss = np.Inf
min_delta = 0.0001
patience = 5
num_folds=5
kf = KFold(n_splits=num_folds)
pca1 = PCA(n_components=1)
greek2 = pd.read_csv('../data/sen_greek3.csv')
greek2 = unravel_df(greek2, ['text_embedding_claim','text_embedding_par', 'text_embedding_sen'])
cyprus2 = pd.read_csv('../data/sen_cyprus3.csv')
cyprus2 = unravel_df(cyprus2, ['text_embedding_claim','text_embedding_par', 'text_embedding_sen'])
test_df2 = pd.read_csv('../data/sen_check4facts3.csv')
test_df2 = unravel_df(test_df2, ['text_embedding_claim','text_embedding_par', 'text_embedding_sen'])


#we proceed to create the new dataframe
test_df2['claim_id'] += int(greek2['claim_id'].iloc[-1]) +1
cyprus2['claim_id'] += int(test_df2['claim_id'].iloc[-1]) +1
df_train = pd.concat([greek2,test_df2,cyprus2], ignore_index=True)
df_train = df_train[df_train.label<=1]
df_train.reset_index(drop=True, inplace=True)


combined_array = list()
label = list()
for i in df_train.claim_id.unique():
    df = df_train[df_train.claim_id==i]

    claim=np.array(df.iloc[0,8])
    label.append(df.iloc[0,9])

    par= []
    for j in range(len(df)):
        par = np.concatenate([par,df.iloc[j,10]])
        
        sen = np.array(df.iloc[j,13], dtype=np.float64)
        sen = sen.reshape(768,2)
        sen = pca1.fit_transform(sen)
        sen = np.squeeze(sen.reshape(1,-1))
        par = np.concatenate([par,sen])


    claim = np.concatenate([claim, par])
    combined_array.append(np.ravel(claim))

#creating training, testing and validation datasets
arr = [combined_array[i] for i in range(len(combined_array))]

#making a 80/10/10 train, test and validation split
X_train, X_test, y_train, y_test= train_test_split(arr  ,np.array(label).astype('int'), test_size=0.2, random_state=42)


#oversampling method to add extra data for class balance
smote = SMOTE(random_state=42, k_neighbors=4)
X_train, y_train = smote.fit_resample(X_train, y_train)


#pca feature reduction 
# X_train_scaled, X_test_scaled, X_val_scaled = list(),list(),list()

# for x_train in X_train:
#     x_train = np.array(x_train, dtype=np.float64)
#     x_train = x_train.reshape(768, 11)
#     pca = PCA(n_components = 1) 
#     X_train_scaled.append(pca.fit_transform(x_train))
    
# for x_test in X_test:
#     x_test = np.array(x_test, dtype=np.float64)
#     x_test = x_test.reshape(768, 11)
#     pca = PCA(n_components = 1) 
#     X_test_scaled.append(pca.fit_transform(x_test))


# train_dataset = CustomDataset(np.squeeze(X_train_scaled), y_train)
# test_dataset = CustomDataset(np.squeeze(X_test_scaled), y_test)



train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)



# # Create data loaders
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker)




for learning_rate in [0.01]:
    for w in  [10]:
        print(f"lr={learning_rate}, weight_decay={w}: ")
        #loss and test loader
        criterion = nn.CrossEntropyLoss()
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker)

        #lists to save the best val/train loss on each fold
        kfold_accs_val = list()
        kfold_accs_train = list()
        #storing the best val loss across all folds
        best_val_loss_folds=np.Inf
        for fold, (train_indices, val_indices) in enumerate(kf.split(train_dataset)):
            # print(f"Fold {fold + 1}, lr={learning_rate}, weight_decay={w}: ")

            #optimizer, and data loader initialization
            model = NeuralNetwork()
            model.to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr= learning_rate, weight_decay=w) 
            train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, sampler=torch.utils.data.SubsetRandomSampler(train_indices))
            val_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, sampler=torch.utils.data.SubsetRandomSampler(val_indices))
            

            #best_val_loss variable keeps the best validation score for each fold
            best_val_loss = np.Inf
          

            #training
            total_steps = len(train_loader)
            for i in range(num_epochs):
                model.train()
                train_loss = 0.0
                for batch_idx, (data, target) in enumerate(train_loader):

                    #reshaping the training data passing the them to the gpu
                    data = data.to(device)
                    target = target.type(torch.LongTensor)
                    target = target.to(device)
                    

                    #forward pass
                    outputs = model(data)
                    loss = criterion(outputs, target)

                    #backward pass and optimize 
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    
                    train_loss += loss.item()

                # Compute average training loss for the epoch
                avg_train_loss = train_loss / len(train_loader)  
                
            

                #validation
                model.eval()
                val_loss = 0.0

                with torch.no_grad():
                    avg_val_loss=0
                    for data, target in val_loader:
                        target = target.type(torch.LongTensor)
                        data, target = data.to(device), target.to(device)

                        #compute validation loss
                        outputs = model(data)
                        loss_val = criterion(outputs,target)
                        val_loss += loss_val.item()
                        
            
                    #print validation and training loss     
                    avg_val_loss = val_loss / len(val_loader)
                    

                    #early stopping condition
                    if avg_val_loss<best_val_loss - min_delta:
                        best_val_loss = avg_val_loss
                        best_train_loss = avg_train_loss
                        current_patience = 0
                        #update best validation loss across all folds
                        if best_val_loss< best_val_loss_folds-min_delta:
                            best_val_loss_folds = best_val_loss
                            torch.save(model.state_dict(), 'model.pth')
                        
                    else:
                        current_patience +=1
                        if current_patience>= patience:
                            # print (f'''Epoch [{i+1}/{num_epochs}],  Training_loss: {avg_train_loss:.4f}, Validation_loss: {avg_val_loss:.4f},''')
                            # print(f'Early stopping afther {i+1} epochs')
                            break
                    

                
                #print (f'''Epoch [{i+1}/{num_epochs}],  Training_loss: {avg_train_loss:.4f}, Validation_loss: {avg_val_loss:.4f},''')

            #save the best validation loss and the corresponding training loss to this 
            kfold_accs_val.append(round(best_val_loss,3))
            kfold_accs_train.append(round(best_train_loss,3))

            #if(fold==num_folds-1):
                # print(f"KFold cross validation results: {kfold_accs_val}")
                # print(f"KFold cross validation training results: {kfold_accs_train}")
                # print(f"Validation loss mean {np.mean(kfold_accs_val)}" )
                # print(f"Train loss mean {np.mean(kfold_accs_train)}" )

        #Log training and validation loss to tensorboard

        # writer.add_scalars(f'Model Complexity (n=3), weight_decay={w}, for learning rate = {learning_rate}', {
        #                                 'train': np.mean(kfold_accs_train),
        #                                 'val': np.mean(kfold_accs_val),
        #                             }, num_parameters)
              
        #testing
        with torch.no_grad():
            n_correct=0
            n_samples=0
            model.load_state_dict(torch.load('model.pth'))
            model.eval()
            for data, target in test_loader:

                #reshaping the testing data passing the them to the gpu
                data = data.to(device)
                target = target.type(torch.LongTensor)
                target = target.to(device)
                outputs = model(data)

                # max returns (value ,index)
                _, predicted = torch.max(outputs.data, 1)
                n_samples += target.size(0)
                n_correct += (predicted == target).sum().item()

            acc = 100.0 * n_correct / n_samples
            print(f'Accuracy of the network on the n=3: {acc} %')

            writer.close()

lr=0.01, weight_decay=10: 
Accuracy of the network on the n=3: 70.49180327868852 %
