<a href="https://colab.research.google.com/github/LorenzoCorbinelli/MLSA-project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading dataset

In [1]:
!pip install transformers datasets



In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Step 1: Download the dataset
!wget http://files.srl.inf.ethz.ch/data/py150_files.tar.gz

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--2025-01-10 14:32:28--  http://files.srl.inf.ethz.ch/data/py150_files.tar.gz
Resolving files.srl.inf.ethz.ch (files.srl.inf.ethz.ch)... 129.132.114.90
Connecting to files.srl.inf.ethz.ch (files.srl.inf.ethz.ch)|129.132.114.90|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.sri.inf.ethz.ch/data/py150_files.tar.gz [following]
--2025-01-10 14:32:29--  https://files.sri.inf.ethz.ch/data/py150_files.tar.gz
Resolving files.sri.inf.ethz.ch (files.sri.inf.ethz.ch)... 129.132.114.90
Connecting to files.sri.inf.ethz.ch (files.sri.inf.ethz.ch)|129.132.114.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 199067128 (190M) [application/gzip]
Saving to: ‘py150_files.tar.gz.2’


2025-01-10 14:32:45 (12.9 MB/s) - ‘py150_files.tar.gz.2’ saved [199067128/199067128]



In [3]:
# Unpack the downloaded py150 archive into the current working directory.
!tar -xvzf py150_files.tar.gz

data.tar.gz
github_repos.txt
python100k_train.txt
python50k_eval.txt
README.md


In [4]:
!tar -xvzf data.tar.gz

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
data/mathcamp/pypicloud/git_hooks/hook.py
data/mathcamp/pypicloud/tests/
data/mathcamp/pypicloud/tests/test_simple.py
data/mathcamp/pypicloud/tests/test_packages.py
data/mathcamp/pypicloud/tests/test_access_backends.py
data/mathcamp/pypicloud/tests/test_security.py
data/mathcamp/pypicloud/tests/test_api.py
data/mathcamp/pypicloud/tests/test_storage.py
data/mathcamp/pypicloud/tests/test_util.py
data/mathcamp/pypicloud/tests/test_auth.py
data/mathcamp/pypicloud/pypicloud/
data/mathcamp/pypicloud/pypicloud/scripts.py
data/mathcamp/pypicloud/pypicloud/storage/
data/mathcamp/pypicloud/pypicloud/storage/files.py
data/mathcamp/pypicloud/pypicloud/views/
data/mathcamp/pypicloud/pypicloud/views/packages.py
data/mathcamp/pypicloud/pypicloud/views/simple.py
data/mathcamp/pypicloud/pypicloud/views/api.py
data/mathcamp/pypicloud/pypicloud/models.py
data/mathcamp/pypicloud/pypicloud/cache/
data/mathcamp/pypicloud/pypicloud/cache/__init_

After extracting the `data.tar.gz` archive we obtain the `data` folder, which contains the source files of the dataset. We keep only the files whose names end in `.py`.

In [5]:
import os
import pandas as pd

def _filter_source_lines(lines):
    """Return ``lines`` without full-line ``#`` comments and triple-double-quoted blocks.

    This is a line-based heuristic, not a parser:

    * every line that contains the triple-double-quote delimiter is dropped;
    * a line containing the delimiter exactly twice is treated as a
      self-contained one-liner and does not change the state;
    * any other count toggles the "inside a block" state, and every line seen
      while inside a block is dropped;
    * triple-single-quoted strings are not recognised.
    """
    kept = []
    inside_triple_quotes = False
    for line in lines:
        stripped_line = line.strip()
        if '"""' in stripped_line:
            if stripped_line.count('"""') != 2:
                inside_triple_quotes = not inside_triple_quotes
            continue
        if inside_triple_quotes or stripped_line.startswith('#'):
            continue
        kept.append(line)
    return kept


def load_python_files(root_dir):
    """Load every ``.py`` file below ``root_dir`` into a DataFrame.

    Args:
        root_dir: Directory that is walked recursively.

    Returns:
        A ``pandas.DataFrame`` with one row per readable file and the columns
        ``filepath`` (path of the file) and ``snippet_of_code`` (file contents
        after ``_filter_source_lines``). The columns are present even when no
        file was found, and rows follow a sorted, repeatable traversal order.

    Files that cannot be read (for example because they are not valid UTF-8)
    are reported on stdout and skipped.
    """
    all_files = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting in place makes os.walk descend into sub-directories in a
        # stable order, so positional slices of the result are reproducible.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith(".py"):
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.readlines()
            except Exception as e:
                # Best effort: a single unreadable file must not abort the load.
                print(f"Error reading file {filepath}: {e}")
                continue
            all_files.append({
                'filepath': filepath,
                'snippet_of_code': ''.join(_filter_source_lines(content)),
            })
    # Explicit columns keep the schema stable when all_files is empty.
    return pd.DataFrame(all_files, columns=['filepath', 'snippet_of_code'])


Load the source code of all Python files into a DataFrame. Each file becomes a single row that holds the path of the file and its filtered contents.

In [6]:
# Build the DataFrame of code snippets from the extracted dataset folder.
data_dir = "/content/data"  # Replace with the actual path to your data folder
df_python_files = load_python_files(data_dir)

Error reading file /content/data/cloudera/hue/desktop/core/ext-py/pysqlite/pysqlite2/test/dbapi.py: 'utf-8' codec can't decode byte 0xe4 in position 120: invalid continuation byte
Error reading file /content/data/cloudera/hue/desktop/core/ext-py/pysqlite/pysqlite2/test/__init__.py: 'utf-8' codec can't decode byte 0xe4 in position 133: invalid continuation byte
Error reading file /content/data/cloudera/hue/desktop/core/ext-py/pysqlite/pysqlite2/test/userfunctions.py: 'utf-8' codec can't decode byte 0xe4 in position 184: invalid continuation byte
Error reading file /content/data/cloudera/hue/desktop/core/ext-py/tablib-0.10.0/tablib/packages/xlrd/biffh.py: 'utf-8' codec can't decode byte 0xa9 in position 94: invalid start byte
Error reading file /content/data/cloudera/hue/desktop/core/ext-py/tablib-0.10.0/tablib/packages/xlrd/formatting.py: 'utf-8' codec can't decode byte 0xa9 in position 82: invalid start byte
Error reading file /content/data/cloudera/hue/desktop/core/ext-py/tablib-0.10.

In [7]:
import torch
import numpy as np
from transformers import RobertaTokenizer, RobertaConfig, AutoModel

In [8]:
# Positional split of the snippet column: rows 0-99 for training, rows 100-149 for evaluation.
code_column = df_python_files["snippet_of_code"]
df_train = code_column.iloc[:100]
df_eval = code_column.iloc[100:150]

# Utility functions

In [9]:
def chunk_tokens(tokens_ids, size):
  """Lazily yield consecutive slices of at most ``size`` items from each sequence.

  Args:
    tokens_ids: Iterable of sliceable sequences (one per snippet).
    size: Maximum length of every yielded slice.

  Yields:
    The slices of the first sequence in order, then those of the second, and
    so on. The last slice of a sequence may be shorter than ``size``; an empty
    sequence yields nothing.
  """
  for sequence in tokens_ids:
    starts = range(0, len(sequence), size)
    yield from (sequence[start:start + size] for start in starts)

In [10]:
def padding_tokens(tokens_ids, pad_element_id, size):
  """Right-pad every snippet to ``size`` elements with ``pad_element_id``.

  Args:
    tokens_ids: Iterable of token-id sequences.
    pad_element_id: Value appended to snippets shorter than ``size``.
    size: Target length of each snippet.

  Returns:
    A list with one new list per input snippet, in input order. Snippets
    shorter than ``size`` are padded on the right; snippets that already have
    ``size`` or more elements are kept unchanged instead of being dropped.
  """
  padded_tokens = []
  for snippet in tokens_ids:
    missing = size - len(snippet)
    # Multiplying a list by a non-positive count gives [], so snippets that
    # are already long enough pass through without padding.
    padded_tokens.append(list(snippet) + [pad_element_id] * missing)
  return padded_tokens

# Tokenization

In [11]:
def tokenize_dataset(hf_dataset, tokenizer, **kwargs):
    """Call ``tokenizer`` on ``hf_dataset`` and return whatever it returns.

    ``return_tensors='pt'`` is always passed; every other keyword argument is
    forwarded to the tokenizer unchanged.
    """
    return tokenizer(hf_dataset, return_tensors='pt', **kwargs)

In [12]:
from transformers import AutoTokenizer
# Pretrained CodeBERT tokenizer.
auto_tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
# Keyword arguments forwarded to the tokenizer for every split.
tokenizer_kwargs = {
    "truncation": True,
    "padding": True,
    "max_length": 500,
    "add_special_tokens": True,
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
import torch
from datasets import Dataset as ds
from torch.utils.data import DataLoader, TensorDataset, Dataset

# Tokenize both splits with the shared tokenizer settings.
train_tensor_dataset = tokenize_dataset(df_train.to_list(), auto_tokenizer, **tokenizer_kwargs)
eval_tensor_dataset = tokenize_dataset(df_eval.to_list(), auto_tokenizer, **tokenizer_kwargs)

# Wrap each split in a datasets.Dataset that returns torch tensors for the two model inputs.
tensor_columns = ['input_ids', 'attention_mask']

datasetTrain = ds.from_dict(train_tensor_dataset)
datasetTrain.set_format(type='torch', columns=tensor_columns)

datasetEval = ds.from_dict(eval_tensor_dataset)
datasetEval.set_format(type='torch', columns=tensor_columns)

# Batches of two; only the training loader shuffles, driven by its own generator.
generator = torch.Generator()
train_loader = DataLoader(datasetTrain, shuffle=True, batch_size=2, generator=generator)
eval_loader = DataLoader(datasetEval, batch_size=2)

# Model

In [14]:
import torch
from torch.optim import Adam
from transformers import AutoModel

# Optimizer and device
model = AutoModel.from_pretrained("microsoft/codebert-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = Adam(model.parameters(), lr=1e-5)

# Training loop
# NOTE(review): no loss is computed and backward() is never called (the two
# lines are commented out below), so optimizer.step() has no gradients to apply
# and the weights are not updated. A training objective still has to be added.
for epoch in range(1):  # Adjust the number of epochs
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        # Nothing earlier in the notebook places the batch tensors on `device`,
        # so move them here; otherwise the forward pass fails when the model
        # lives on the GPU.
        inputs = {
            key: val.to(device)
            for key, val in batch.items()
            if key in auto_tokenizer.model_input_names
        }
        outputs = model(**inputs)
        # loss = outputs.loss
        # loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed")


Epoch 1 completed


In [15]:
# Display the model output of the last batch processed by the training loop.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.9161e-01, -1.3297e-01,  6.0081e-03,  ..., -1.8798e-02,
          -5.9706e-01,  5.4814e-01],
         [-1.9708e-01, -1.2227e-01,  1.4850e-01,  ..., -6.7107e-01,
          -1.6946e-01,  2.3771e-01],
         [-3.1314e-04, -1.4490e-01, -8.4258e-03,  ..., -3.4974e-01,
          -5.3183e-01,  6.5599e-01],
         ...,
         [-2.1057e-01, -2.0525e-01,  3.3944e-01,  ..., -1.4541e-01,
          -2.8466e-01, -1.0410e-01],
         [-4.8199e-01,  4.4165e-01,  2.9251e-01,  ..., -1.0385e+00,
          -6.4392e-01,  2.8290e-01],
         [-1.0847e-01,  3.5838e-02,  9.8714e-02,  ..., -1.6527e-01,
          -6.3000e-01,  6.2493e-01]],

        [[-1.2940e-01, -4.7839e-02, -1.1857e-01,  ...,  4.3101e-02,
          -4.1649e-01,  6.1408e-01],
         [-2.6899e-01,  8.3896e-02,  5.1084e-02,  ..., -4.0055e-01,
          -3.2857e-01,  1.9675e-01],
         [-2.5300e-01, -6.8958e-02,  2.1323e-02,  ..., -4.1052e-02,
          -3.