# Setup & Imports

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
from datasets import load_dataset
import pandas as pd

# NLP
from nltk import ngrams, FreqDist
from nltk.tokenize import word_tokenize

# Exploratory Data Analysis

Since the task is to benchmark `Refact-1.6B` using `HumanEvalPack` for Python, I will limit myself to the Python portion of the dataset for the EDA as well.

In [3]:
# Load the "python" configuration of `bigcode/commitpackft` (downloaded on first use).
dataset = load_dataset("bigcode/commitpackft", "python")

Downloading data:   0%|          | 0.00/59.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56025 [00:00<?, ? examples/s]

In [4]:
# Display the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['commit', 'old_file', 'new_file', 'old_contents', 'new_contents', 'subject', 'message', 'lang', 'license', 'repos'],
        num_rows: 56025
    })
})

In [5]:
# Materialise the `train` split as a pandas DataFrame for the analysis below.
df = pd.DataFrame(dataset['train'])

In [6]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56025 entries, 0 to 56024
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   commit        56025 non-null  object
 1   old_file      56025 non-null  object
 2   new_file      56025 non-null  object
 3   old_contents  56025 non-null  object
 4   new_contents  56025 non-null  object
 5   subject       56025 non-null  object
 6   message       56025 non-null  object
 7   lang          56025 non-null  object
 8   license       56025 non-null  object
 9   repos         56025 non-null  object
dtypes: object(10)
memory usage: 4.3+ MB


In [7]:
# Peek at the first rows to get a feel for the column contents.
df.head()

Unnamed: 0,commit,old_file,new_file,old_contents,new_contents,subject,message,lang,license,repos
0,e905334869af72025592de586b81650cb3468b8a,sentry/queue/client.py,sentry/queue/client.py,"""""""\nsentry.queue.client\n~~~~~~~~~~~~~~~~~~~\...","""""""\nsentry.queue.client\n~~~~~~~~~~~~~~~~~~~\...",Declare queues when broker is instantiated,Declare queues when broker is instantiated\n,Python,bsd-3-clause,"imankulov/sentry,BuildingLink/sentry,zenefits/..."
1,45fc612fdc5a354dbf0bacccd345b1aebcc73e59,tests/test_openweather.py,tests/test_openweather.py,# -*- coding: utf-8 -*-\nimport bot_mock\nfrom...,# -*- coding: utf-8 -*-\nimport bot_mock\nfrom...,"Revert ""Fix openweather unit tests""","Revert ""Fix openweather unit tests""\n\nThis re...",Python,bsd-3-clause,"rnyberg/pyfibot,EArmour/pyfibot,aapa/pyfibot,a..."
2,22faee82e1f070532c0dfe5777136e842233a1f0,src/dashboard/src/main/templatetags/percentage.py,src/dashboard/src/main/templatetags/percentage.py,"from django.template import Node, Library\n\nr...","from django.template import Node, Library\n\nr...","Fix % only showing 0 or 100%, everything betwe...","Fix % only showing 0 or 100%, everything betwe...",Python,agpl-3.0,"artefactual/archivematica-history,artefactual/..."
3,950ac9130bafe1fced578bf61d746b047830bfa0,automata/base/exceptions.py,automata/base/exceptions.py,"#!/usr/bin/env python3\n""""""Exception classes s...","#!/usr/bin/env python3\n""""""Exception classes s...","Remove ""validation"" from RejectionException do...","Remove ""validation"" from RejectionException do...",Python,mit,caleb531/automata
4,462ae981ed5b9cc9a8f46e97dfe7908c0827ea64,account_invoice_line_description/res_config.py,account_invoice_line_description/res_config.py,# -*- coding: utf-8 -*-\n#####################...,# -*- coding: utf-8 -*-\n#####################...,"Fix implied_group, it still refers to the old ...","Fix implied_group, it still refers to the old ...",Python,agpl-3.0,"Antiun/account-invoicing,hbrunn/account-invoic..."


## Python file types

Since I limited myself to Python, `lang` should only contain `Python`. Furthermore, I expect most of the filenames to end with `.py`. If they do not, they should be some sort of Python-related configuration file.

In [8]:
# Every row must be labelled `Python`. Compare the values directly: a length-based
# check on a `duplicated()` mask is vacuous, because the mask always has exactly
# one entry per row and therefore always matches `len(df)`.
assert df['lang'].eq('Python').all(), (
    f"`lang` contains values other than 'Python': {df['lang'].unique().tolist()}"
)

In [9]:
def verify_file_types_in(columns: list[str], df: pd.DataFrame) -> None:
  """Print, per column, how many filenames do and do not end in ".py".

  Args:
    columns: Names of the filename columns of `df` to inspect.
    df: Frame that holds the filename columns.

  Returns:
    None. The counts are reported via `print`, one line per column.
  """
  for col in columns:
    # Inspect the column that is being reported on (not a fixed one).
    # `na=False` treats missing filenames as not ending in ".py".
    is_python_file = df[col].str.endswith('.py', na=False)
    # Count the True entries; `len()` of the mask would only give the row count.
    python_files_in_col = int(is_python_file.sum())
    non_python_files_in_col = len(df) - python_files_in_col
    print(f'Column `{col}` contains {python_files_in_col} filenames ending on ".py" and {non_python_files_in_col} filenames that do not end on ".py".')

# Run the extension check on both filename columns of the frame.
verify_file_types_in(['old_file', 'new_file'], df)

Column `old_file` contains 56025 filenames ending on ".py" and 0 filenames that do not end on ".py".
Column `new_file` contains 56025 filenames ending on ".py" and 0 filenames that do not end on ".py".


Due to the fact that there are no files not ending on ".py" I conclude that the dataset does not include any auxiliary configuration files. This is a good sign for the use case of predicting method names during refactoring, as configuration files would not be able to contribute directly.

## Difference between `subject` and `message` columns

Next, I will investigate if and how the `subject` and `message` columns differ.

In [10]:
# What is left of each message once every occurrence of its subject is removed.
df['subject_message_diff'] = [
    msg.replace(subj, '')
    for subj, msg in zip(df['subject'], df['message'])
]
df['subject_message_diff']

0                                                       \n
1        \n\nThis reverts commit 36e100e649f0a337228a6d...
2           \n\n\nAutoconverted from SVN (revision:1548)\n
3                                                       \n
4                                                       \n
                               ...                        
56020                                                   \n
56021                                                   \n
56022                                                   \n
56023                                                   \n
56024                                                   \n
Name: subject_message_diff, Length: 56025, dtype: object

The key difference seems to be the explicit inclusion of line break delimiters. Let's verify this hunch.

In [11]:
# The pattern r"\n" is a regular expression that matches a real newline character
# (the output above merely displays it as `\n`). Count the rows whose remainder
# does / does not contain one.
df['subject_message_diff'].str.contains(r"\n").value_counts()

True     53951
False     2074
Name: subject_message_diff, dtype: int64

In most cases the only difference is the explicit inclusion of the line break delimiter. However, there are some cases where the difference is more pronounced. Let's take a look at these cases.

In [12]:
# Rows whose remainder holds no line break, restricted to the columns being compared.
non_linebreak_subject_message_diff_df = df.loc[
    ~df['subject_message_diff'].str.contains(r"\n"),
    ['subject', 'message', 'subject_message_diff'],
]
non_linebreak_subject_message_diff_df

Unnamed: 0,subject,message,subject_message_diff
9,Change the version of the package.,Change the version of the package.,
32,Update for compatibility with python 3,Update for compatibility with python 3,
47,Deal with MD and RST doc,[c] Deal with MD and RST doc,[c]
62,Add standard Ansible exception handling,Add standard Ansible exception handling,
77,Fix string formatting for NotRegistered exception,Fix string formatting for NotRegistered exception,
...,...,...,...
55909,Add configparser import to avoid windows packa...,Add configparser import to avoid windows packa...,
55943,Add a few subreddits to @r_wholesome,Add a few subreddits to @r_wholesome,
55957,Make dsamp a visible component of blimpy,Make dsamp a visible component of blimpy,
55979,Update key map to add 192,Update key map to add 192,


Most of the subject message differences **not** containing `\n` seem to be empty.

In [13]:
# Number of line-break-free remainders that still carry some text.
non_empty_subject_message_diffs = len(
    non_linebreak_subject_message_diff_df.loc[
        non_linebreak_subject_message_diff_df['subject_message_diff'].ne('')
    ]
)
# Report the complement: how many (and what share) of those remainders are empty.
print(
    f'Of the entries not containing `\\n` '
    f'{len(non_linebreak_subject_message_diff_df) - non_empty_subject_message_diffs} entries or '
    f'{(1-(non_empty_subject_message_diffs/len(non_linebreak_subject_message_diff_df)))*100} percent are empty.'
)

Of the entries not containing `\n` 1986 entries or 95.75699132111862 percent are empty.


I conclude that the difference, and thus the additional information encoded in the `message` column, is minor.

## N-grams of commit messages (`message` column)

Because of the above result, I will perform this part of the analysis only on the `message` column. While the data in this column does contain some noise in the form of explicit line break delimiters `\n`, some entries actually contain additional information.

In [39]:
def __remove_line_break_escape_sequence(messages: pd.Series) -> pd.Series:
  """Replace every run of line breaks with a single space.

  The line break is replaced rather than deleted so that the word before it
  and the word after it stay separate tokens ("bug\\n\\nThis" -> "bug This").
  """
  return messages.str.replace(r'\n+', ' ', regex=True)

def __reduce_to_alphanumeric_and_whitespace(messages: pd.Series) -> pd.Series:
  """Drop every character that is not an ASCII letter, a digit or whitespace."""
  return messages.str.replace(pat='[^a-zA-Z0-9\\s]', repl='', regex=True)

def clean_messages_in(messages: pd.Series) -> pd.Series:
  """Normalise commit messages for tokenisation.

  Punctuation and other symbols are removed, line breaks become single
  spaces, and leading/trailing whitespace is stripped.

  Args:
    messages: Series of raw commit message strings.

  Returns:
    A new Series with the cleaned messages; the input is not modified.
  """
  messages = __reduce_to_alphanumeric_and_whitespace(messages)
  messages = __remove_line_break_escape_sequence(messages)

  # A trailing line break would otherwise leave a dangling space behind.
  return messages.str.strip()

In [41]:
# Clean every commit message. Note that the result is a Series, despite the `_df` suffix.
messages_df = clean_messages_in(df['message'])

In [None]:
# Tokenise the cleaned commit messages held in `messages_df`: one token list per message.
# NOTE(review): `word_tokenize` needs NLTK's tokenizer data to be available
# (e.g. fetched via `nltk.download`) — confirm this for the target environment.
tokenized_text = messages_df.apply(word_tokenize)

# Function to extract and compute probabilities for n-grams
def extract_and_prob_ngrams(tokens, n, top_k=20):
    """Return the most frequent n-grams together with their relative frequency.

    Args:
        tokens: Either a flat sequence of tokens, or a collection of token
            lists with one list per message (for example the result of
            `Series.apply(word_tokenize)`).
        n: Size of the n-grams (1 = unigrams, 2 = bigrams, ...).
        top_k: Number of n-grams to return; defaults to 20.

    Returns:
        A list of `(ngram, probability)` tuples sorted by descending
        probability, where `probability` is the n-gram's count divided by the
        total number of n-grams. The list is empty if no n-gram can be built.
    """
    items = list(tokens)
    # Lists are unhashable and thus can never be tokens themselves, so a
    # collection made up of lists is read as "one token list per message".
    if items and all(isinstance(item, list) for item in items):
        documents = items
    else:
        documents = [items]

    # Build the n-grams per message so that none spans two unrelated messages.
    freq_dist = FreqDist(
        gram for document in documents for gram in ngrams(document, n)
    )
    total_ngrams = sum(freq_dist.values())
    if total_ngrams == 0:
        return []

    prob_ngrams = [(gram, freq / total_ngrams) for gram, freq in freq_dist.items()]
    return sorted(prob_ngrams, key=lambda x: x[1], reverse=True)[:top_k]

# Most frequent unigrams, bigrams and trigrams in `tokenized_text`, each paired
# with its relative frequency as computed by `extract_and_prob_ngrams`.
prob_unigrams = extract_and_prob_ngrams(tokenized_text, 1)
prob_bigrams = extract_and_prob_ngrams(tokenized_text, 2)
prob_trigrams = extract_and_prob_ngrams(tokenized_text, 3)