In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

import pickle
import numpy as np
import pandas as pd
import nltk
from transformers import BertTokenizer, BertModel

import time
import torch.multiprocessing as mp
from tqdm import tqdm
import os
import ast
import re

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [2]:
from temp import do_somthing, simple_worker, worker_get_bert_embedding_sentence_based

In [3]:
# Turn on cuDNN's benchmark mode for the rest of the session.
torch.backends.cudnn.benchmark = True

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [4]:
# Initialize the tokenizer and model.
# The model is placed with .to(device) instead of .cuda() so that this cell
# honours the CPU fallback chosen when `device` was created and does not crash
# on machines without a GPU.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [5]:
def parallel_embedding_computation(df, num_processes=4):
    """Compute text embeddings for ``df['clean_text']`` with several worker processes.

    The rows are split into ``num_processes`` contiguous chunks (the last chunk
    also takes the remainder). Each chunk is handed to
    ``worker_get_bert_embedding_sentence_based`` in its own process together
    with a shared result dict, the chunk index, a shared progress counter, the
    total row count and a lock. The worker is presumably expected to store one
    embedding per input row under ``return_dict[chunk_index]`` -- verify
    against its implementation.

    Args:
        df: DataFrame with a ``clean_text`` column. It is modified in place: a
            ``text_embedding`` column is added.
        num_processes: Number of worker processes to start (must be >= 1).

    Returns:
        The same ``df`` object with the new ``text_embedding`` column.

    Raises:
        ValueError: If ``num_processes`` is smaller than 1, or if the workers
            returned a different number of embeddings than there are rows
            (for example because a worker crashed).
    """
    if num_processes < 1:
        raise ValueError(f"num_processes must be >= 1, got {num_processes}")

    print("In parallel_embedding_computation")

    total_rows = len(df)
    print(f"{total_rows} rows in df")

    # Split data into chunks for each process
    chunk_size = total_rows // num_processes
    processes = []
    all_embeddings = []

    # The context manager shuts the manager's server process down on exit,
    # so it does not linger after the computation has finished.
    with mp.Manager() as manager:
        return_dict = manager.dict()
        progress_counter = manager.Value('i', 0)
        lock = manager.Lock()

        print("Start create process")
        for i in range(num_processes):
            start_idx = i * chunk_size
            end_idx = total_rows if i == num_processes - 1 else (i + 1) * chunk_size
            # .iloc makes the positional slicing explicit, independent of df's index.
            data_chunk = df['clean_text'].iloc[start_idx:end_idx].tolist()
            return_dict[i] = []
            p = mp.Process(
                target=worker_get_bert_embedding_sentence_based,
                args=(data_chunk, return_dict, i, progress_counter, total_rows, lock),
            )
            processes.append(p)
            p.start()
            print(f"Process {i} started")

        for p in processes:
            p.join()
            print(f"Process {p.name} joined")

        # Combine results in chunk order while the manager is still alive;
        # the proxy objects become unusable once the manager has shut down.
        for i in range(num_processes):
            all_embeddings.extend(return_dict[i])

    if len(all_embeddings) != total_rows:
        exit_codes = {p.name: p.exitcode for p in processes}
        raise ValueError(
            f"Expected {total_rows} embeddings but the workers returned "
            f"{len(all_embeddings)}; worker exit codes: {exit_codes}"
        )

    df['text_embedding'] = all_embeddings
    return df

In [6]:
def preprocess_embedding(embedding_str):
    """Turn a whitespace-separated vector string into a comma-separated list literal.

    Example: ``"[-2.6e-01  9.5e-02 -3.8e-02]"`` becomes
    ``"[-2.6e-01,9.5e-02,-3.8e-02]"``, which ``ast.literal_eval`` can parse.

    Any run of whitespace (several spaces, tabs, line breaks) and/or commas
    counts as a single separator, so multi-line or column-aligned strings and
    strings that already contain commas do not produce empty items.

    Only flat (one-dimensional) vectors are supported, because all leading and
    trailing brackets are stripped.

    Args:
        embedding_str: String such as ``"[1.0  2.0 3.0]"``.

    Returns:
        A string of the form ``"[v1,v2,...]"`` (``"[]"`` for an empty vector).
    """
    # Drop surrounding whitespace first so the brackets are really at the ends.
    inner = embedding_str.strip().strip('[]')
    # Splitting on separator runs and discarding empty tokens avoids ",," or a
    # leading comma when values are padded with extra spaces.
    tokens = [token for token in re.split(r'[\s,]+', inner) if token]
    return f"[{','.join(tokens)}]"

In [7]:
# a = "[-2.63857126e-01  9.52238515e-02 -3.86462845e-02 -1.23622872e-01 4.53184135e-02  4.09205258e-02  3.81095201e-01  5.09688675e-01 9.10583660e-02 -1.22776248e-01  4.08156626e-02 -1.97499439e-01]"
# df['text_embedding'].iloc[0] = np.array(ast.literal_eval(preprocess_embedding(a)))

# print(df.dtypes['text_embedding'])
# print(df['text_embedding'].iloc[0])

In [8]:
def convert_to_numpy_array(embedding_str):
    """Parse a Python-literal string into a ``float32`` NumPy array.

    Args:
        embedding_str: Text holding a list literal, e.g. ``"[0.1, 0.2]"``.

    Returns:
        The parsed ``np.ndarray`` with dtype ``float32``. If parsing or the
        array conversion raises ``SyntaxError`` or ``ValueError``, the problem
        is printed and ``np.nan`` is returned instead.
    """
    try:
        literal_value = ast.literal_eval(embedding_str)
        converted = np.array(literal_value, dtype=np.float32)
    except (SyntaxError, ValueError) as e:
        # Report the offending input and fall back to NaN instead of raising.
        print(f"Error parsing embedding: {embedding_str}, Error: {e}")
        converted = np.nan
    return converted

In [9]:
class LSTMClassifier(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output values (e.g. class logits).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear layer's output for the last time step of ``x``.

        Args:
            x: Batch-first input of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # No explicit (h_0, c_0): nn.LSTM starts from zero states when none are
        # given, created on the input's device and dtype. This removes the
        # dependency on the module-level names `num_layers`, `hidden_dim` and
        # `device`, which may be undefined or out of sync with this instance.
        out, _ = self.lstm(x)

        # Fully connected layer on the output of the last time step
        return self.fc(out[:, -1, :])

In [10]:
# Define the LSTM model with packed sequences
class ArticleClassifier(nn.Module):
    """LSTM classifier for padded, variable-length sequences.

    The padded batch is packed using the true sequence lengths before it is
    fed to the LSTM; the final hidden state of the top LSTM layer is then
    passed through a linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, lengths):
        """Run the packed batch through the LSTM and the linear layer.

        Args:
            x: Padded batch-first input, ``(batch, max_seq_len, input_dim)``.
            lengths: Tensor with the true length of every sequence in ``x``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # The lengths are moved to the CPU before packing; the batch does not
        # have to be sorted by length (enforce_sorted=False).
        packed_batch = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (final_hidden, _) = self.lstm(packed_batch)
        # final_hidden[-1] is the last layer's hidden state for each sequence.
        return self.fc(final_hidden[-1])

In [11]:
def string_to_numpy_array(input_str):
    """Convert the text form of a list of NumPy arrays into one ``float32`` array.

    The input is expected to look like the ``repr`` of a list of arrays, e.g.
    ``"[array([1., 2.], dtype=float32), array([3., 4.], dtype=float32)]"``.
    Every ``array(...)`` wrapper is reduced to its data, with or without a
    ``dtype=...`` suffix and regardless of the spacing or line breaks around
    it. The remaining nested-list literal is parsed with ``ast.literal_eval``.

    Args:
        input_str: String representation of the array(s).

    Returns:
        ``np.ndarray`` with dtype ``float32``.

    Raises:
        SyntaxError, ValueError: If the cleaned string is not a valid literal
            or cannot be converted to a regular ``float32`` array.
    """
    # Collapse line breaks and runs of spaces so the wrapper pattern below
    # only has to deal with single-line text.
    flattened = re.sub(r'\s+', ' ', input_str)

    # "array(<data>, dtype=<name>)" or "array(<data>)"  ->  "<data>"
    literal_text = re.sub(
        r'array\((.*?)\s*(?:,\s*dtype=[\w.]+\s*)?\)',
        r'\1',
        flattened,
    )

    # Use ast.literal_eval to safely evaluate the string to a Python list
    list_of_arrays = ast.literal_eval(literal_text)

    # Convert the list of lists into a numpy array
    return np.array(list_of_arrays, dtype=np.float32)

In [12]:
# numerical_features = df[['reference', 'external_link', 'internal_link', 'table', 'formula', 
#                          'images', 'section', 'subsection', 'subsubsection', 'paragraph', 'sentence',
#                          'flesch', 'flesch_kincaid', 'smog_index', 'coleman_liau', 'automated_readability', 
#                          'difficult_words', 'dale_chall', 'linsear', 'gunning_fog']]

# classes = {'High': 1, 'Low': 0}
# df['2_classes'] = df['2_classes'].map(classes)

# # Normalize the numerical features
# scaler = StandardScaler()
# numerical_features_scaled = scaler.fit_transform(numerical_features)

# # Combine numerical features with text embeddings
# X = np.hstack((np.vstack(df['text_embedding'].values), numerical_features_scaled))
# y = df['2_classes'].values

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Convert data to PyTorch tensors
# X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
# X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train, dtype=torch.long)
# y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
# test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# input_dim = X_train_tensor.shape[1]
# hidden_dim = 64
# output_dim = 2
# num_layers = 2

# model = LSTMClassifier(input_dim, hidden_dim, output_dim, num_layers)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Loss and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training the model
# num_epochs = 20

# for epoch in range(num_epochs):
#     model.train()
#     for X_batch, y_batch in train_loader:
#         X_batch, y_batch = X_batch.to(device), y_batch.to(device)

#         optimizer.zero_grad()
#         outputs = model(X_batch.unsqueeze(1))
#         loss = criterion(outputs, y_batch)
#         loss.backward()
#         optimizer.step()

#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# # Evaluate the model
# model.eval()
# with torch.no_grad():
#     correct = 0
#     total = 0
#     for X_batch, y_batch in test_loader:
#         X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#         outputs = model(X_batch.unsqueeze(1))
#         _, predicted = torch.max(outputs.data, 1)
#         total += y_batch.size(0)
#         correct += (predicted == y_batch).sum().item()

#     print(f'Accuracy: {100 * correct / total}%')

In [13]:
def convert_embedding(dataset):
    """Return the grouped dataset with a ``text_embedding`` column.

    Reads ``Data/grouped_dataset_({dataset}).csv``. If the pickle cache
    ``Data/dataset_text_embedding_NO_overlap_({dataset}).pkl`` does not exist
    yet, the embeddings are computed with ``parallel_embedding_computation``
    (8 processes) and the resulting column is written to the cache; otherwise
    the column is loaded from the cache and attached to the frame.

    Args:
        dataset: Dataset name used inside both file names (e.g. ``"Balance"``).

    Returns:
        The DataFrame including ``text_embedding``, or ``None`` when this code
        does not run as the ``__main__`` module.
    """
    # Nothing happens outside of the main module; the None return is explicit.
    if __name__ != '__main__':
        return None

    csv_path = f'Data/grouped_dataset_({dataset}).csv'
    cache_path = f'Data/dataset_text_embedding_NO_overlap_({dataset}).pkl'

    df = pd.read_csv(csv_path, keep_default_na=False)

    if not os.path.exists(cache_path):
        start_time = time.time()

        # set_start_method() raises RuntimeError once a start method has been
        # fixed, which made every second call in the same kernel fail.
        # Only switch when 'spawn' is not already active.
        if mp.get_start_method(allow_none=True) != 'spawn':
            mp.set_start_method('spawn', force=True)

        df = parallel_embedding_computation(df, num_processes=8)

        with open(cache_path, 'wb') as f:
            pickle.dump(df['text_embedding'], f)

        # df.to_csv(f'Data/dataset_text_embedding_NO_overlap_({dataset}).csv', index=False)
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time/60:.2f} minutes")
    else:
        # Load the column from the .pkl file.
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(cache_path, 'rb') as f:
            text_embedding = pickle.load(f)

        text_embedding = pd.Series(text_embedding, index=df.index)
        df = pd.concat([df, text_embedding.rename('text_embedding')], axis=1)

    return df

In [14]:
# Build the "Balance" dataset with its text embeddings (computed on the first
# run, read from the pickle cache afterwards).
df = convert_embedding("Balance")
# Last expression of the cell, so the notebook displays the frame's shape.
df.shape

In parallel_embedding_computation
15998 rows in df
Start create process
Process 0 started
Process 1 started
Process 2 started
Process 3 started
Process 4 started
Process 5 started
Process 6 started
Process 7 started
Process Process-2 joined
Process Process-3 joined
Process Process-4 joined
Process Process-5 joined
Process Process-6 joined
Process Process-7 joined
Process Process-8 joined
Process Process-9 joined


NameError: name 'pickle' is not defined

In [None]:
# s0 = df['2_classes'][df['2_classes'] == "High"].sample(1051).index
# s1 = df['2_classes'][df['2_classes'] == "Low"].sample(8000 - 1051).index
# df = df.loc[s0.union(s1)]

# df['text_embedding'] = df['text_embedding'].apply(string_to_numpy_array)
 
# print(df.dtypes['text_embedding'])  # Should be 'object'
# print(type(df['text_embedding'].iloc[0]))  # Should be <class 'numpy.ndarray'>
# print(df['text_embedding'].iloc[0].shape)
    
# numerical_features = df[['reference', 'external_link', 'internal_link', 'table', 'formula', 
#                          'images', 'section', 'subsection', 'subsubsection', 'paragraph', 'sentence',
#                          'flesch', 'flesch_kincaid', 'smog_index', 'coleman_liau', 'automated_readability', 
#                          'difficult_words', 'dale_chall', 'linsear', 'gunning_fog']]

# #     classes = {'High': 1, 'Low': 0}
# #     df['2_classes'] = df['2_classes'].map(classes)
    
# classes = {'FA': 0, 'GA': 1, 'B': 2, 'C': 3, 'Start': 4, 'Stub': 5}
# df['rate'] = df['rate'].map(classes)
    
# # Normalize the numerical features
# scaler = StandardScaler()
# numerical_features_scaled = scaler.fit_transform(numerical_features)

# text_embeddings = [torch.tensor(embedding) for embedding in df['text_embedding']]
# lengths = [embedding.size(0) for embedding in text_embeddings]
# padded_embeddings = pad_sequence(text_embeddings, batch_first=True)  # Shape: (batch_size, max_seq_length, 768)
# print(f"Embedding Shape: {padded_embeddings.shape}")

# num_features = numerical_features_scaled.shape[1]
# numerical_features_tensor = torch.tensor(numerical_features_scaled, dtype=torch.float32)
# numerical_features_expanded = numerical_features_tensor.unsqueeze(1).expand(-1, padded_embeddings.size(1), -1)  # Shape: (batch_size, max_seq_length, num_features)
# print(f"Numerical Features Shape: {numerical_features_expanded.shape}")

# combined_input = torch.cat((padded_embeddings, numerical_features_expanded), dim=2)  # Shape: (batch_size, max_seq_length, 768 + num_features)
# print(f"Concate Shape: {combined_input.shape}")

# #     labels = torch.tensor(df['2_classes'].values)
# labels = torch.tensor(df['rate'].values)
    
# # Split the data into training and test sets
# X_train, X_test, y_train, y_test, lengths_train, lengths_test = train_test_split(combined_input, labels, lengths, test_size=0.2, random_state=42)

# # Convert data to PyTorch tensors and create DataLoader
# train_dataset = TensorDataset(X_train, torch.tensor(lengths_train), y_train)
# test_dataset = TensorDataset(X_test, torch.tensor(lengths_test), y_test)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
# # Define model parameters
# embedding_dim = 768
# num_features = numerical_features_scaled.shape[1]
# input_dim = embedding_dim + num_features
# hidden_dim = 512
# output_dim = 6
# num_layers = 3

# # Initialize the model, loss function, and optimizer
# model = ArticleClassifier(input_dim, hidden_dim, output_dim, num_layers)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Loss and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training the model
# num_epochs = 50

# for epoch in range(num_epochs):
#     model.train()
#     for X_batch, lengths_batch, y_batch in train_loader:
#         X_batch, lengths_batch, y_batch = X_batch.to(device), lengths_batch.to(device), y_batch.to(device)

#         optimizer.zero_grad()
#         outputs = model(X_batch, lengths_batch)
#         loss = criterion(outputs, y_batch)
#         loss.backward()
#         optimizer.step()

#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        
# # Evaluate the model
# model.eval()
# with torch.no_grad():
#     correct = 0
#     total = 0
#     for X_batch, lengths_batch, y_batch in test_loader:
#         X_batch, lengths_batch, y_batch = X_batch.to(device), lengths_batch.to(device), y_batch.to(device)
#         outputs = model(X_batch, lengths_batch)
#         _, predicted = torch.max(outputs.data, 1)
#         total += y_batch.size(0)
#         correct += (predicted == y_batch).sum().item()

#     print(f'Accuracy: {100 * correct / total}%')

In [None]:
# from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# print(df.dtypes['text_embedding'])  # Should be 'object'
# print(type(df['text_embedding'].iloc[0]))  # Should be <class 'numpy.ndarray'>
# print(df['text_embedding'].iloc[0].shape)

# numerical_features = df[['reference', 'external_link', 'internal_link', 'table', 'formula', 
#                          'images', 'section', 'subsection', 'subsubsection', 'paragraph', 'sentence',
#                          'flesch', 'flesch_kincaid', 'smog_index', 'coleman_liau', 'automated_readability', 
#                          'difficult_words', 'dale_chall', 'linsear', 'gunning_fog']]

# # Normalize the numerical features
# scaler = StandardScaler()
# numerical_features_scaled = scaler.fit_transform(numerical_features)

# text_embeddings = [torch.tensor(embedding) for embedding in df['text_embedding']]
# lengths = [embedding.size(0) for embedding in text_embeddings]
# padded_embeddings = pad_sequence(text_embeddings, batch_first=True)  # Shape: (batch_size, max_seq_length, 768)
# print(f"Embedding Shape: {padded_embeddings.shape}")

# num_features = numerical_features_scaled.shape[1]
# numerical_features_tensor = torch.tensor(numerical_features_scaled, dtype=torch.float32)
# numerical_features_expanded = numerical_features_tensor.unsqueeze(1).expand(-1, padded_embeddings.size(1), -1)  # Shape: (batch_size, max_seq_length, num_features)
# print(f"Numerical Features Shape: {numerical_features_expanded.shape}")

# combined_input = torch.cat((padded_embeddings, numerical_features_expanded), dim=2)  # Shape: (batch_size, max_seq_length, 768 + num_features)
# print(f"Concate Shape: {combined_input.shape}")

In [None]:
# [array([-1.47300020e-01,  2.19794288e-01,  3.63137573e-02, 3.15326490e-02, -8.03771734e-01,
#         3.10615003e-01, -2.80626625e-01, -1.62438020e-01,  4.79527861e-01], dtype=float32), 
#  array([-4.09086078e-01, -1.04075804e-01,  2.62742979e-03, -2.40705684e-01, 8.03771734e-01,
#         3.10615003e-01, -2.80626625e-01, -1.62438020e-01,  4.79527861e-01], dtype=float32)]

In [None]:
# import numpy as np
# import ast

# # Your input string
# input_str = "[array([-1.47300020e-01,  2.19794288e-01,  3.63137573e-02,  7.73137435e-02, 8.94427113e-03, -3.19040328e-01,  4.53024775e-01,  6.59894586e-01],dtype=float32), array([-4.09086078e-01, -1.04075804e-01,  2.62742979e-03, -2.40705684e-01,-4.44055535e-02, -1.48875922e-01,  2.35171378e-01,  6.17567778e-01],dtype=float32)]"

# # Function to convert string to list of numpy arrays
# def string_to_numpy_array(input_str):
#     # Replace 'array(' with '[' and '), dtype=float32)' with ']'
#     input_str = input_str.replace('array(', '').replace(',dtype=float32)', '')
    
#     # Use ast.literal_eval to safely evaluate the string to a Python list
#     list_of_arrays = ast.literal_eval(input_str)
    
#     # Convert the list of lists into a numpy array
#     numpy_array = np.array(list_of_arrays, dtype=np.float32)
    
#     return numpy_array

# # Convert the string to numpy array
# result_array = string_to_numpy_array(input_str)

# # Print the result
# print(result_array.shape)