In [1]:
import pandas as pd
from omegaconf import OmegaConf
import os
import matplotlib.pyplot as plt
import datasets

In [2]:
def draw_dist(df, field, p, b=20):
    """Plot a histogram of an integer-valued column, optionally trimming the upper tail.

    Args:
        df: DataFrame that contains the column ``field``.
        field: Name of the column to plot; its values are cast to ``int64``.
        p: Quantile used as an inclusive upper cut-off (rows with a value above
            that quantile are dropped), or ``None`` to plot every row.
        b: ``bins`` argument forwarded to ``plt.hist``.

    Prints how many rows are plotted, then shows the figure. Returns ``None``.
    """
    # Cast once and reuse the result for the quantile, the filter and the plot.
    values = df[field].astype('int64')

    if p is None:
        # No cut-off requested: report that explicitly instead of printing a
        # placeholder threshold that was never applied.
        kept = values
        print(f"Plotting all {len(kept)}/{len(df)} repos for {field} (no quantile cut-off)")
    else:
        q = values.quantile(p)
        kept = values[values <= q]
        print(f"There are {len(kept)}/{len(df)} repos with {field} <= {q} (quantile {p})")

    plt.hist(kept, bins=b, edgecolor='black')

    plt.title(f'{field} distribution')
    plt.xlabel(field)
    plt.ylabel('frequency')
    plt.show()

In [3]:
# Column names summarised with `describe()` in each language section below,
# grouped here by name prefix.
metrics = (
    # repo_* counts
    [
        'repo_symbols_count',
        'repo_tokens_count',
        'repo_words_count',
        'repo_lines_count',
        'repo_files_count',
    ]
    # repo_code_* counts
    + [
        'repo_code_symbols_count',
        'repo_code_tokens_count',
        'repo_code_words_count',
        'repo_code_lines_count',
        'repo_code_files_count',
    ]
    # description_* counts
    + [
        'description_symbols_count',
        'description_tokens_count',
        'description_words_count',
        'description_lines_count',
    ]
    # NOTE(review): 'readme' is the only entry without a `_count` suffix;
    # presumably the raw README column rather than a metric. Confirm it belongs here.
    + ['readme']
    # readme_* counts
    + [
        'readme_symbols_count',
        'readme_tokens_count',
        'readme_words_count',
        'readme_lines_count',
    ]
)

# Java

In [4]:
# Load the `dev` split of the `java` configuration and convert it to a pandas DataFrame.
df = datasets.load_dataset(
    path='JetBrains-Research/template-generation',
    name='java',
    split='dev',
    cache_dir=None,
).to_pandas()

In [5]:
# Histogram of `size`, keeping only rows at or below the 0.9 quantile.
draw_dist(df, field='size', p=0.9)

In [6]:
# Histogram of `code_lines` over every row (p=None disables the quantile cut-off).
draw_dist(df, field='code_lines', p=None)

In [7]:
# Render floats without decimals in pandas output (e.g. the `describe()` tables).
pd.options.display.float_format = '{:.0f}'.format

In [8]:
# Summary statistics for the metric columns; left as the last expression so the table is displayed.
df.loc[:, metrics].describe()

# Python

In [9]:
# Load the `dev` split of the `py` configuration and convert it to a pandas DataFrame.
df = datasets.load_dataset(
    path='JetBrains-Research/template-generation',
    name='py',
    split='dev',
    cache_dir=None,
).to_pandas()

In [10]:
# Whitespace-delimited word count of each description, computed with vectorized
# string methods instead of a row-wise apply. Missing descriptions count as 0
# words rather than raising on `.split()`.
# NOTE(review): `metrics` lists 'description_words_count' and the Java section
# never computes it, so the dataset may already ship this column; if so, this
# assignment overwrites it. Confirm that is intended.
df['description_words_count'] = (
    df['description'].fillna('').str.split().str.len().astype('int64')
)

In [11]:
# Histogram of `description_words_count`, keeping only rows at or below the 0.99 quantile.
draw_dist(df, field='description_words_count', p=0.99)

In [12]:
# Histogram of `size`, keeping only rows at or below the 0.9 quantile.
draw_dist(df, field='size', p=0.9)

In [13]:
# Histogram of `code_lines` over every row (p=None disables the quantile cut-off).
draw_dist(df, field='code_lines', p=None)

In [14]:
# Summary statistics for the metric columns; left as the last expression so the table is displayed.
df.loc[:, metrics].describe()

# Kotlin

In [15]:
# Load the `dev` split of the `kt` configuration and convert it to a pandas DataFrame.
df = datasets.load_dataset(
    path='JetBrains-Research/template-generation',
    name='kt',
    split='dev',
    cache_dir=None,
).to_pandas()

In [16]:
# Whitespace-delimited word count of each description, computed with vectorized
# string methods instead of a row-wise apply. Missing descriptions count as 0
# words rather than raising on `.split()`.
# NOTE(review): `metrics` lists 'description_words_count' and the Java section
# never computes it, so the dataset may already ship this column; if so, this
# assignment overwrites it. Confirm that is intended.
df['description_words_count'] = (
    df['description'].fillna('').str.split().str.len().astype('int64')
)

In [17]:
# Histogram of `description_words_count`, keeping only rows at or below the 0.99 quantile.
draw_dist(df, field='description_words_count', p=0.99)

In [18]:
# Histogram of `size`, keeping only rows at or below the 0.75 quantile.
draw_dist(df, field='size', p=0.75)

In [19]:
# Histogram of `code_lines` over every row (p=None disables the quantile cut-off).
draw_dist(df, field='code_lines', p=None)

In [20]:
# Summary statistics for the metric columns; left as the last expression so the table is displayed.
df.loc[:, metrics].describe()