In [4]:
import os
import re
import glob

def tokenize_cpp_code_with_comments(code):
    # Define a regex pattern to match tokens and comments
    token_pattern = r'//.*?$|/\*.*?\*/|[\w]+|[^\w\s]'
    tokens = re.findall(token_pattern, code, re.DOTALL | re.MULTILINE)
    # Remove empty strings and strip whitespace
    tokens = [token.strip() for token in tokens if token.strip()]
    return tokens

def tokenize_cpp_files_in_directory(directory):
    tokenized_codes = {}
    
    # Traverse through all subdirectories and files
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.cpp'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    code = f.read()
                    tokens = tokenize_cpp_code_with_comments(code)
                    tokenized_codes[file_path] = tokens
                    
    return tokenized_codes

# Example usage
directory_path = './HUMAN'  # Replace with your directory path
tokenized_data = tokenize_cpp_files_in_directory(directory_path)


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Step 1: Create a vocabulary mapping
# Flatten the tokenized data and create a unique list of tokens
all_tokens = set(token for tokens in tokenized_data.values() for token in tokens)
token_to_id = {token: idx for idx, token in enumerate(all_tokens)}
id_to_token = {idx: token for token, idx in token_to_id.items()}

# Step 2: Convert tokenized lists to numerical format
def convert_tokens_to_ids(tokenized_list):
    return [token_to_id[token] for token in tokenized_list]

numerical_data = {file_path: convert_tokens_to_ids(tokens) for file_path, tokens in tokenized_data.items()}

# Step 3: Pad sequences to ensure uniform length
def pad_sequences(sequences, max_length):
    padded_sequences = []
    for seq in sequences:
        padded_seq = seq + [0] * (max_length - len(seq))  # 0 can represent padding
        padded_sequences.append(padded_seq)
    return padded_sequences

# Determine maximum sequence length for padding
max_length = max(len(seq) for seq in numerical_data.values())
padded_data = pad_sequences(list(numerical_data.values()), max_length)

# Step 4: Create a PyTorch Dataset and DataLoader
class CodeDataset(Dataset):
    def __init__(self, padded_data):
        self.data = padded_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.long)

# Create a dataset and dataloader
dataset = CodeDataset(padded_data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Example usage of DataLoader
for batch in dataloader:
    print(batch)  # Each batch will contain padded sequences ready for the neural network


tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
        [1956, 2172, 1323,  ...,    0,    0,    0]])
tensor([[1956, 2172, 1323,  ...,    0,    0,    0],
   

In [1]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [3]:
import torch
print(torch.version.cuda)  # Should show the installed CUDA version
print(torch.backends.cudnn.is_available())  # Should return True if cuDNN is available

None
False
