In [None]:
pip install -U sentence-transformers

In [29]:
import numpy  as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
import matplotlib.pyplot  as plt
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import os
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

## Convert Data to Dataframe

In [30]:
# Repository metadata; the rest of the notebook iterates it as a sequence of
# records and reads their 'dir_name', 'url' and 'language' keys.
with Path('repos_dict.json').open() as metadata_file:
    repos_dict = json.load(metadata_file)

In [31]:
# Column names of the repositories DataFrame built by get_valid_test().
REPO_NAME = 'repo name'
FILE_NAME = 'file name'  # NOTE: get_valid_test() fills this column with the repository URL
LANG = 'language'
CODE = 'code'
# Locations where the trained models are persisted.
CLASSIFICATION_PATH = 'Classification.pkl'
TRANSFORMER_PATH = './transformer'
# Languages kept for classification; a class id is the language's index in this list.
valid_languages = ['Java', 'Python', 'Shell', 'C++', 'Go']

In [32]:
def get_concatanated(repo, languages=None, root='preprocessed_data'):
    """Concatenate every preprocessed source file of one repository.

    Files are looked up under ``{root}/{repo}/{language}`` for each requested
    language. Each file's text is followed by a single space, so a non-empty
    result always ends with a space.

    Args:
        repo: Directory name of the repository below ``root``.
        languages: Languages to include, in order. Defaults to the
            module-level ``valid_languages``.
        root: Directory that holds the preprocessed repositories.

    Returns:
        The concatenated text, or ``''`` when no file is found.
    """
    if languages is None:
        languages = valid_languages

    pieces = []
    for language in languages:
        language_dir = Path(f'{root}/{repo}/{language}')
        if not language_dir.is_dir():
            continue
        # iterdir() yields entries in arbitrary order; sort so the resulting
        # document (and everything trained on it) is reproducible.
        for file_path in sorted(language_dir.iterdir()):
            if not file_path.is_file():
                # Nested directories cannot be read as text; skip them.
                continue
            # Decode leniently so one oddly-encoded file cannot abort the run.
            pieces.append(file_path.read_text(encoding='utf-8', errors='replace'))

    # Single join instead of repeated `+=`, which is quadratic on large repos.
    return ''.join(piece + ' ' for piece in pieces)

def get_valid_test():
    """Build the repositories DataFrame from ``repos_dict``.

    Only repositories whose ``'language'`` is in ``valid_languages`` are kept.
    Each row holds the repository directory name, its URL (stored in the
    ``FILE_NAME`` column), its language and its concatenated source text.
    """
    rows = [
        (
            entry['dir_name'],
            entry['url'],
            entry['language'],
            get_concatanated(entry['dir_name']),
        )
        for entry in repos_dict
        if entry['language'] in valid_languages
    ]
    column_names = [REPO_NAME, FILE_NAME, LANG, CODE]
    return pd.DataFrame(rows, columns=column_names)

In [33]:
repos_df = get_valid_test()

In [34]:
set(repos_df[LANG])

{'C++', 'Go', 'Java', 'Python', 'Shell'}

In [35]:
repos_df

Unnamed: 0,repo name,file name,language,code
0,gothinkster_realworld,https://github.com/gothinkster/realworld,Shell,"# set - x SCRIPTDIR = """" $ ( dirname """" ) """" A..."
1,nvm-sh_nvm,https://github.com/nvm-sh/nvm,Shell,"# find_name ( ) { find test - name """" <> \ | ]..."
2,jgraph_drawio-desktop,https://github.com/jgraph/drawio-desktop,Shell,# set - eo pipefail # SOURCE_FILE_PATH = '' # ...
3,mathiasbynens_dotfiles,https://github.com/mathiasbynens/dotfiles,Shell,"# cd """" $ { BASH_SOURCE } """" ; git pull origin..."
4,dwmkerr_hacker-laws,https://github.com/dwmkerr/hacker-laws,Shell,"# # # # version = $NUM date = $ ( date """" ) if..."
...,...,...,...,...
145,sanic-org_sanic,https://github.com/sanic-org/sanic,Python,from __future__ import annotations from inspec...
146,nginx-proxy_nginx-proxy,https://github.com/nginx-proxy/nginx-proxy,Python,import pytest def test_network_web1 ( docker_c...
147,binux_pyspider,https://github.com/binux/pyspider,Python,# # # # # # import json import time from pymon...
148,openai_gpt-2,https://github.com/openai/gpt-2,Python,import numpy as np import tensorflow as tf fro...


## Data

In [36]:
class Data():
    """Accessor around the repositories DataFrame.

    Exposes the ``CODE`` column as a list of documents and the ``LANG`` column
    as integer class ids (positions in ``valid_languages``).
    """

    def __init__(self, df):
        self.df = df

    def get_true_labels(self):
        """Return the class id of every row as a NumPy array."""
        label_ids = []
        for language in self.df[LANG].tolist():
            label_ids.append(valid_languages.index(language))
        return np.array(label_ids)

    def get_code(self):
        """Return the source text of every row as a list."""
        code_column = self.df[CODE]
        return code_column.tolist()

    def get_labeled_data(self):
        """Return ``(documents, class_ids)`` for the whole frame."""
        documents = self.get_code()
        class_ids = self.get_true_labels()
        return documents, class_ids

In [37]:
repos_data = Data(repos_df)

## Split Data

In [38]:
# Hold out 20% of the repositories, then cut that hold-out in half to get the
# validation and test sets. Fixed random_state keeps the splits repeatable.
all_code, all_labels = repos_data.get_labeled_data()
X_train, X_val_test, y_train, y_val_test = train_test_split(
    all_code, all_labels, test_size=0.2, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=1
)

## Classification

In [39]:
class Classification():
    """TF-IDF + logistic-regression language classifier.

    Documents are vectorized with unigram/bigram TF-IDF features and fed to a
    multinomial ``LogisticRegression``. Class ids are positions in
    ``valid_languages``.
    """

    def __init__(self):
        tfidf_options = dict(
            ngram_range=(1,2),
            max_df=0.9,
            min_df=0.1,
            stop_words=None,
            norm='l2',
        )
        self.vectorizer = TfidfVectorizer(**tfidf_options)

    def train(self, X_train, y_train):
        """Fit the vectorizer and the classifier on the training documents."""
        self.y_train = y_train
        self.doc_term_mat = self.vectorizer.fit_transform(X_train)
        estimator = LogisticRegression(random_state=0, multi_class='multinomial')
        estimator.fit(self.doc_term_mat, y_train)
        self.clf = estimator

    def predict(self, X):
        """Return the predicted class id for every document in ``X``."""
        features = self.get_vectorized(X)
        return self.clf.predict(features)

    def predict_code(self, code):
        """Return the predicted language name for one raw code string.

        NOTE(review): relies on a ``preprocess`` function that is not defined
        in the visible part of this notebook -- confirm it is in scope.
        """
        cleaned = preprocess(code)
        class_id = self.predict([cleaned])[0]
        return valid_languages[class_id]

    def get_vectorized(self, X):
        """Project documents into the fitted TF-IDF space."""
        return self.vectorizer.transform(X)

    def get_accuracy(self, X, y):
        """Return the classifier's mean accuracy on ``(X, y)``."""
        features = self.get_vectorized(X)
        return self.clf.score(features, y)

    def get_f1_score(self, X, y, average):
        """Return the F1 score on ``(X, y)`` using the given averaging mode."""
        y_pred = self.predict(X)
        return f1_score(y, y_pred, average=average)

    def get_confusion_matrix(self, X, y):
        """Return the confusion matrix with one row/column per valid language."""
        y_pred = self.predict(X)
        label_ids = [position for position, _ in enumerate(valid_languages)]
        return confusion_matrix(y, y_pred, labels=label_ids)

In [40]:
classification = Classification()

## Train

In [41]:
classification.train(X_train, y_train)

## Calculate f1 score

In [42]:
f1_macro = classification.get_f1_score(X_val_test, y_val_test, average='macro')
f1_macro

0.938095238095238

In [43]:
f1_micro = classification.get_f1_score(X_val_test, y_val_test, average='micro')
f1_micro

0.9333333333333333

As shown above, both the macro- and micro-averaged F1 scores exceed 0.9 (measured on the combined validation + test hold-out, `X_val_test`).

## Calculate Accuracy

In [44]:
accuracy = classification.get_accuracy(X_val_test, y_val_test)
accuracy

0.9333333333333333

## Confusion Matrix

In [45]:
classification_confusion_matrix = classification.get_confusion_matrix(X_val_test, y_val_test)
classification_confusion_matrix

array([[6, 0, 0, 0, 0],
       [0, 5, 0, 0, 0],
       [0, 0, 5, 0, 0],
       [0, 0, 2, 6, 0],
       [0, 0, 0, 0, 6]])

## Transformer

In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [47]:
# Pretrained DistilBERT with a new classification head. The head size is
# derived from valid_languages so it cannot drift from the label set.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(valid_languages),
    problem_type="single_label_classification",
)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [48]:
# Tokenize each split with identical settings: truncation=True cuts documents
# that exceed the tokenizer's maximum length, padding=True pads every document
# to the longest one in its split.
train_encoding, val_encoding, test_encoding = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (X_train, X_val, X_test)
)

## Dataset

In [49]:
class LanguageDataset(Dataset):
    """Torch dataset pairing tokenizer output with integer class labels.

    ``encoding`` is a mapping from field name to a per-sample sequence;
    ``labels`` is indexable by the same sample positions.
    """

    def __init__(self, encoding, labels):
        self.encoding = encoding
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, per_sample_values in self.encoding.items():
            sample[field] = torch.tensor(per_sample_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [50]:
train_dataset = LanguageDataset(train_encoding, y_train)
val_dataset = LanguageDataset(val_encoding, y_val)
test_dataset = LanguageDataset(test_encoding, y_test)

## Transformer Class

In [51]:
class Transformer():
    """Wrapper around a sequence-classification model and its tokenizer.

    Bundles fine-tuning (through a Hugging Face ``Trainer``), prediction and
    the evaluation helpers used in this notebook. Class ids are positions in
    ``valid_languages``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def _model_device(self):
        """Return the device holding the model's parameters (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    def set_trainer(
        self,
        train_dataset,
        val_dataset,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=5,
        learning_rate=3e-05,
        weight_decay=0.0,
        adam_beta1=0.93,
        adam_beta2=0.999,
        adam_epsilon=1e-08,
        num_train_epochs=100,
        warmup_steps=100,
        logging_steps=20,
        save_steps=2000,
        ):
        """Create ``self.trainer`` for fine-tuning the wrapped model.

        All keyword arguments are forwarded unchanged to ``TrainingArguments``.
        Checkpoints go to ``./results`` and logs to ``./logs``.

        NOTE(review): no evaluation strategy is configured here -- confirm
        whether ``val_dataset`` is actually evaluated during training.
        """
        training_args = TrainingArguments(
            output_dir='./results',
            do_train=True,
            do_eval=True,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            adam_beta1=adam_beta1,
            adam_beta2=adam_beta2,
            adam_epsilon=adam_epsilon,
            num_train_epochs=num_train_epochs,
            warmup_steps=warmup_steps,
            logging_dir='./logs',
            logging_steps=logging_steps,
            save_steps=save_steps,
        )

        # Train the model owned by this wrapper, not whichever object the
        # notebook-level name `model` happens to point at.
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

    def train(self):
        """Run fine-tuning; ``set_trainer`` must have been called first."""
        self.trainer.train()

    def predict(self, X, dataset=True, batch_size=5):
        """Predict classes for a dataset or for raw text.

        Args:
            X: With ``dataset=True``, a dataset whose items are dicts of
                tensors accepted by the model. With ``dataset=False``, a
                single string or a list of strings.
            dataset: Selects which of the two input kinds ``X`` is.
            batch_size: Batch size used when iterating a dataset.

        Returns:
            With ``dataset=True``, a NumPy array of integer class ids.
            With ``dataset=False``, the ``id2label`` name of the predicted
            class for a single string, or a list of names for a list input.
        """
        self.model.eval()
        # Follow the model's own device rather than a notebook-level global.
        target_device = self._model_device()

        if dataset:
            loader = DataLoader(dataset=X, batch_size=batch_size, shuffle=False)
            predicted_ids = []

            with torch.no_grad():
                for batch in loader:
                    batch = {key: value.to(target_device) for key, value in batch.items()}
                    logits = self.model(**batch).logits
                    # argmax already yields the class id; no need to go
                    # through the 'LABEL_n' strings and parse them back.
                    predicted_ids.extend(logits.argmax(dim=1).tolist())
            return np.array(predicted_ids, dtype=int)

        single_input = isinstance(X, str)
        # return_tensors='pt' is required: without it the tokenizer returns
        # plain Python lists, which have no `.to(device)`.
        encoded = self.tokenizer(X, truncation=True, padding=True, return_tensors='pt')

        with torch.no_grad():
            encoded = {key: value.to(target_device) for key, value in encoded.items()}
            logits = self.model(**encoded).logits

        # Per-row argmax so a list of inputs gets one prediction per input.
        class_ids = logits.argmax(dim=-1).tolist()
        label_names = [self.model.config.id2label[class_id] for class_id in class_ids]
        return label_names[0] if single_input else label_names

    def get_accuracy(self, X, y):
        """Return the fraction of samples in dataset ``X`` predicted as ``y``."""
        predicted = self.predict(X)
        return np.mean(predicted == y)

    def get_f1_score(self, X, y, average):
        """Return the F1 score on dataset ``X`` using the given averaging mode."""
        predicted = self.predict(X)
        return f1_score(y, predicted, average=average)

    def get_confusion_matrix(self, X, y):
        """Return the confusion matrix with one row/column per valid language."""
        predicted = self.predict(X)
        return confusion_matrix(y, predicted, labels=list(range(len(valid_languages))))

In [52]:
transformer = Transformer(model, tokenizer)

## Train

In [53]:
transformer.set_trainer(train_dataset, val_dataset)
transformer.train()

***** Running training *****
  Num examples = 120
  Num Epochs = 100
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 1200


Step,Training Loss
20,1.6107
40,1.5567
60,1.3453
80,0.9419
100,0.6092
120,0.4162
140,0.2756
160,0.1753
180,0.1628
200,0.1493




Training completed. Do not forget to share your model on huggingface.co/models =)




## Calculate f1 score

In [54]:
f1_macro = transformer.get_f1_score(test_dataset, y_test, average='macro')
f1_macro

0.7551515151515151

In [55]:
f1_micro = transformer.get_f1_score(test_dataset, y_test, average='micro')
f1_micro

0.8000000000000002

## Calculate Accuracy

In [56]:
accuracy = transformer.get_accuracy(test_dataset, y_test)
accuracy

0.8

## Confusion Matrix

In [57]:
transformer_confusion_matrix = transformer.get_confusion_matrix(test_dataset, y_test)
transformer_confusion_matrix

array([[5, 0, 0, 0, 0],
       [0, 2, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [1, 0, 1, 1, 0],
       [0, 0, 0, 0, 3]])

## Save and Load Object

In [60]:
def save_object(obj, file_name):
    """Pickle ``obj`` into ``file_name`` using the highest pickle protocol."""
    with open(file_name, mode='wb') as sink:
        pickler = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)
        
def load_object(file_name):
    """Return the object pickled in ``file_name``.

    Unpickling can execute arbitrary code, so only load files you created.
    """
    with open(file_name, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

## Save Model

In [None]:
save_object(classification, CLASSIFICATION_PATH)

In [61]:
transformer.tokenizer.save_pretrained(TRANSFORMER_PATH)
transformer.model.save_pretrained(TRANSFORMER_PATH)

tokenizer config file saved in ./transformer/tokenizer_config.json
Special tokens file saved in ./transformer/special_tokens_map.json
Configuration saved in ./transformer/config.json
Model weights saved in ./transformer/pytorch_model.bin


## Load Model

In [None]:
classification = load_object(CLASSIFICATION_PATH)

In [42]:
# Restore the fine-tuned tokenizer and model from disk. The label count is
# derived from valid_languages rather than repeated as a literal.
tokenizer = DistilBertTokenizer.from_pretrained(TRANSFORMER_PATH)
model = DistilBertForSequenceClassification.from_pretrained(
    TRANSFORMER_PATH,
    num_labels=len(valid_languages),
    problem_type="single_label_classification",
).to(device)
transformer = Transformer(model, tokenizer)

Didn't find file ./transformer/added_tokens.json. We won't load it.
loading file ./transformer/vocab.txt
loading file None
loading file ./transformer/special_tokens_map.json
loading file ./transformer/tokenizer_config.json
loading configuration file ./transformer/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout":

## Technical Report

<div dir = "rtl">
    ابتدا تمامی اطلاعات بدست آمده از repositoryها را در یک dataframe می‌ریزیم.
    سپس برای استفاده بهتر از این dataframe، آن را به صورت یک کلاس Data در می‌آوریم.
    سپس داده‌ها را به دسته‌های train، validation و test تقسیم می‌کنیم.
    در قسمت دسته‌بندی کلاس Classification را می‌سازیم. با استفاده از tf-idf کد ها را به فضای برداری می‌بریم. سپس با استفاده از logistic regression مدل را train می‌کنیم. 
    در قسمت transformer از مدل DistilBERT-base-uncased استفاده می‌کنیم. داکیومنتیشن مربوط به این transformer در این <a href="https://huggingface.co/distilbert-base-uncased">لینک</a> قرار دارد. 
    سپس با استفاده از tokenizer، encoding مربوط به داده‌های هر دسته را بدست می‌آوریم.
    سپس برای هر دسته یک Dataset می‌سازیم تا بتوانیم بهتر از آنها استفاده کنیم.
    در ادامه نیز کلاس Transformer را می‌سازیم. 
    سپس بعد از ساختن object کلاس Transformer، آن را در 100 epoch آموزش می‌دهیم.
</div>

<div dir = "rtl">
    تمام توضیحات چگونگی crawling و دیگر مواردی که در بخش clustering نیز انجام شده‌اند در notebook مخصوص clustering قابل مشاهده است.
</div>