In [1]:
import datasets
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Resources needed by get_topics: the Punkt tokenizer models used by
# word_tokenize and the English stopword list.
# Older NLTK releases load 'punkt'; NLTK >= 3.8.2 loads 'punkt_tab' instead,
# so both are fetched to keep the notebook runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [21]:
def get_topics(df, field, k):
    """Plot the ``k`` most frequent non-stopword tokens found in ``df[field]``.

    Each value of the field is converted with ``str``, the texts are joined
    with spaces, and all ASCII punctuation is deleted (so ``spring-boot``
    becomes ``springboot``). The text is then tokenized with NLTK,
    lower-cased, and filtered against NLTK's English stopword list before
    the tokens are counted.

    Args:
        df: Any object for which ``df[field]`` yields the field's values.
        field: Name of the field to analyse; also used in the plot title.
        k: Number of most frequent tokens to show on the bar chart.

    Returns:
        None. The bar chart is rendered with ``plt.show()``.
    """
    translator = str.maketrans('', '', string.punctuation)

    # Skip missing values: str(None) / str(nan) would otherwise be counted
    # as the words "none" / "nan" and pollute the distribution.
    texts = [
        str(value)
        for value in df[field]
        if value is not None and not (isinstance(value, float) and value != value)
    ]
    text = " ".join(texts).translate(translator)

    stop_words = set(stopwords.words('english'))
    lowered = (token.lower() for token in word_tokenize(text))
    word_counts = Counter(token for token in lowered if token not in stop_words)

    # most_common() sorts by descending count and keeps ties in first-seen
    # order. It also handles an empty counter, where the previous
    # DataFrame.sort_values(by=0) raised KeyError.
    top_words = word_counts.most_common()[:k]
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    plt.figure(figsize=(14, 6))
    plt.bar(words, counts)
    plt.title(f'Word Distribution in {field} Field')
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.xticks(rotation='vertical')

    plt.show()

# Java

In [18]:
# Load the 'dev' split of the 'java' configuration.
df = datasets.load_dataset(
    'JetBrains-Research/template-generation',
    'java',
    split='dev',
    cache_dir=None,
)

In [22]:
# Top 100 tokens in the 'description' field.
get_topics(df, field='description', k=100)

In [23]:
# Top 100 tokens in the 'topics' field.
get_topics(df, field='topics', k=100)

Libraries: ['springboot', 'selenium', 'junit', 'mockito', 'okhttp', 'javafx']

Tech: ['apache', 'azure', 'sql', 'kubernetes', 'redisson', 'kafka', 'aws', 'oauth', 'mqtt', 'java', 'maven', 'gradle']

Topics: ['multithreading']

# Python

In [26]:
# Load the 'dev' split of the 'py' configuration.
df = datasets.load_dataset(
    'JetBrains-Research/template-generation',
    'py',
    split='dev',
    cache_dir=None,
)

In [27]:
# Top 100 tokens in the 'description' field.
get_topics(df, field='description', k=100)

In [29]:
# Top 75 tokens in the 'topics' field.
get_topics(df, field='topics', k=75)

Libraries: ['django', 'fastapi', 'streamlit', 'asyncio']

Tech: ['docker', 'postgresql', 'dockercompose', 'poetry', 'githubactions', 'mongodb', 'aws', 'kubernetes']

Topics: ['deeplearning', 'machinelearning', 'telegrambot', 'computervision', 'datascience', 'telegram', 'ai', 'chatgpt', 'discordbot', 'naturallanguageprocessing', 'llm', 'webapp']

# Kotlin

In [30]:
# Load the 'dev' split of the 'kt' configuration.
df = datasets.load_dataset(
    'JetBrains-Research/template-generation',
    'kt',
    split='dev',
    cache_dir=None,
)

In [31]:
# Top 100 tokens in the 'description' field.
get_topics(df, field='description', k=100)

In [32]:
# Top 75 tokens in the 'topics' field.
get_topics(df, field='topics', k=75)

Libraries: ['kotlincoroutines', 'axon', 'ktor', 'vaadin', 'springboot', 'kotlinmultiplatform', 'forge', 'kotlinnative']

Tech: ['jwt', 'junit']

Topics: ['functionalprogramming', 'auth', 'restapi', 'ast']