# Setup & Imports

In [2]:
# Use %pip so the package is installed into the kernel's own environment, and
# pin the version so a fresh kernel reproduces the setup this notebook ran with.
%pip install -q datasets==2.16.0

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [14]:
from datasets import load_dataset
import pandas as pd

# Exploratory Data Analysis

Since the task is to benchmark `Refact-1.6B` using `HumanEvalPack` for Python, I will limit myself to the Python portion of the dataset for the EDA as well.

In [5]:
# Load the "python" configuration of the `bigcode/commitpackft` dataset.
# NOTE(review): the markdown above refers to HumanEvalPack, while this call
# loads commitpackft — confirm which dataset the EDA is meant to cover.
dataset = load_dataset("bigcode/commitpackft", "python")

Downloading data:   0%|          | 0.00/59.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56025 [00:00<?, ? examples/s]

In [7]:
# Show the loaded object's repr to see which splits and features it contains.
dataset

DatasetDict({
    train: Dataset({
        features: ['commit', 'old_file', 'new_file', 'old_contents', 'new_contents', 'subject', 'message', 'lang', 'license', 'repos'],
        num_rows: 56025
    })
})

In [15]:
# Convert the train split to a DataFrame for the analysis below.
# `Dataset.to_pandas()` converts the underlying table directly, instead of
# having pandas iterate over the dataset one row (dict) at a time.
df = dataset['train'].to_pandas()

In [16]:
# Print column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56025 entries, 0 to 56024
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   commit        56025 non-null  object
 1   old_file      56025 non-null  object
 2   new_file      56025 non-null  object
 3   old_contents  56025 non-null  object
 4   new_contents  56025 non-null  object
 5   subject       56025 non-null  object
 6   message       56025 non-null  object
 7   lang          56025 non-null  object
 8   license       56025 non-null  object
 9   repos         56025 non-null  object
dtypes: object(10)
memory usage: 4.3+ MB


In [18]:
# Preview the first rows to get a feel for what each column holds.
df.head()

Unnamed: 0,commit,old_file,new_file,old_contents,new_contents,subject,message,lang,license,repos
0,e905334869af72025592de586b81650cb3468b8a,sentry/queue/client.py,sentry/queue/client.py,"""""""\nsentry.queue.client\n~~~~~~~~~~~~~~~~~~~\...","""""""\nsentry.queue.client\n~~~~~~~~~~~~~~~~~~~\...",Declare queues when broker is instantiated,Declare queues when broker is instantiated\n,Python,bsd-3-clause,"imankulov/sentry,BuildingLink/sentry,zenefits/..."
1,45fc612fdc5a354dbf0bacccd345b1aebcc73e59,tests/test_openweather.py,tests/test_openweather.py,# -*- coding: utf-8 -*-\nimport bot_mock\nfrom...,# -*- coding: utf-8 -*-\nimport bot_mock\nfrom...,"Revert ""Fix openweather unit tests""","Revert ""Fix openweather unit tests""\n\nThis re...",Python,bsd-3-clause,"rnyberg/pyfibot,EArmour/pyfibot,aapa/pyfibot,a..."
2,22faee82e1f070532c0dfe5777136e842233a1f0,src/dashboard/src/main/templatetags/percentage.py,src/dashboard/src/main/templatetags/percentage.py,"from django.template import Node, Library\n\nr...","from django.template import Node, Library\n\nr...","Fix % only showing 0 or 100%, everything betwe...","Fix % only showing 0 or 100%, everything betwe...",Python,agpl-3.0,"artefactual/archivematica-history,artefactual/..."
3,950ac9130bafe1fced578bf61d746b047830bfa0,automata/base/exceptions.py,automata/base/exceptions.py,"#!/usr/bin/env python3\n""""""Exception classes s...","#!/usr/bin/env python3\n""""""Exception classes s...","Remove ""validation"" from RejectionException do...","Remove ""validation"" from RejectionException do...",Python,mit,caleb531/automata
4,462ae981ed5b9cc9a8f46e97dfe7908c0827ea64,account_invoice_line_description/res_config.py,account_invoice_line_description/res_config.py,# -*- coding: utf-8 -*-\n#####################...,# -*- coding: utf-8 -*-\n#####################...,"Fix implied_group, it still refers to the old ...","Fix implied_group, it still refers to the old ...",Python,agpl-3.0,"Antiun/account-invoicing,hbrunn/account-invoic..."


Since I limited myself to Python, `lang` should only contain `Python`. Furthermore, I expect most of the filenames to end with `.py`. If they do not, they should be some sort of Python-related configuration file.

In [22]:
# Every row must be tagged as Python. Comparing the values directly is required:
# `duplicated()` returns one flag per row, so checking its length against
# `len(df)` would pass for any content of the column.
assert df['lang'].eq('Python').all(), f"Unexpected `lang` values: {df['lang'].unique().tolist()}"

In [33]:
def verify_file_types_in(columns: list[str], df: pd.DataFrame) -> None:
  """Print, for each given column, how many filenames do and do not end with ".py".

  Args:
    columns: Names of the columns in `df` that hold filenames.
    df: DataFrame containing the columns to inspect.

  Returns:
    None. One summary line per column is printed to stdout.

  Raises:
    KeyError: If a name in `columns` is not a column of `df`.
  """
  for col in columns:
    # Inspect the column currently being iterated over and count only the True
    # flags; `len()` of the boolean mask would always equal the number of rows.
    # Missing values are counted as not ending with ".py".
    python_files_in_col = int(df[col].str.endswith('.py', na=False).sum())
    non_python_files_in_col = len(df) - python_files_in_col
    print(f'Column {col} contains {python_files_in_col} filenames ending on ".py" and {non_python_files_in_col} filenames that do not end on ".py".')


In [34]:
# Report the ".py" / non-".py" filename counts for both filename columns.
verify_file_types_in(['old_file', 'new_file'], df)

Column old_file contains 56025 filenames ending on ".py" and 0 filenames that do not end on ".py".
Column new_file contains 56025 filenames ending on ".py" and 0 filenames that do not end on ".py".
