In [52]:
# 1. Carregando as bibliotecas base
import numpy as np
import pandas as pd
import gensim

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import seaborn as sns
from importlib import reload
import multiprocessing
import joblib

import scripts.preprocess as pp
import scripts.embeddings as eb
import scripts.convert as cv

reload(cv)
reload(pp)
reload(eb)

<module 'scripts.embeddings' from '/mnt/c/Users/luanm/projects/pece-monografia/src/models/scripts/embeddings.py'>

In [53]:
# 2. Carregar base de commits que será tratada

# Transformar blocos de código em commits
cv.convert_to_jsonl('./code_snippets', './inputs/commits.jsonl')

# Definir dataframe
df = pd.read_json('./inputs/commits.jsonl', lines=True)
df_labels = pd.read_csv('./inputs/labels.csv', sep=',')

labels = df_labels['Label']

print(f"Trecho de código com maior tamanho: {df['new_contents'].str.len().min()}\n")
print(f"Trecho de código com maior tamanho: {df['new_contents'].str.len().max()}\n")
print(f"Tamanho dos labels: {len(labels)}\n")

df.describe()

['GeneratedClass_1.cs', 'GeneratedClass_2.cs', 'GeneratedClass_3.cs', 'GeneratedClass_4.cs', 'GeneratedClass_5.cs', 'GeneratedClass_6.cs', 'GeneratedClass_7.cs', 'GeneratedClass_8.cs', 'GeneratedClass_9.cs', 'GeneratedClass_10.cs', 'GeneratedClass_11.cs', 'GeneratedClass_12.cs', 'GeneratedClass_13.cs', 'GeneratedClass_14.cs', 'GeneratedClass_15.cs', 'GeneratedClass_16.cs', 'GeneratedClass_17.cs', 'GeneratedClass_18.cs', 'GeneratedClass_19.cs', 'GeneratedClass_20.cs', 'GeneratedClass_21.cs', 'GeneratedClass_22.cs', 'GeneratedClass_23.cs', 'GeneratedClass_24.cs', 'GeneratedClass_25.cs', 'GeneratedClass_26.cs', 'GeneratedClass_27.cs', 'GeneratedClass_28.cs', 'GeneratedClass_29.cs', 'GeneratedClass_30.cs', 'GeneratedClass_31.cs', 'GeneratedClass_32.cs', 'GeneratedClass_33.cs', 'GeneratedClass_34.cs', 'GeneratedClass_35.cs', 'GeneratedClass_36.cs', 'GeneratedClass_37.cs', 'GeneratedClass_38.cs', 'GeneratedClass_39.cs', 'GeneratedClass_40.cs', 'GeneratedClass_41.cs', 'GeneratedClass_42.cs', 

Unnamed: 0,new_contents
count,115.0
unique,108.0
top,
freq,5.0


In [54]:
# 3. Preprocessar commits
df['new_contents'] = df['new_contents'].apply(pp.clean_code)

df.head()

Unnamed: 0,new_contents
0,
1,<NEWLINE> using <LIBRARY> <NEWLINE>
2,
3,<ATTRIBUTE> <NEWLINE>
4,public <VARIABLENAME> <NEWLINE>


In [55]:
# 4. Escrever dataframe em csv para validação
df.to_csv('./outputs/data_preprocessed.csv', sep=";", index=False)

x_cleaned = df['new_contents']

In [56]:
# 5. Tokenização
x_tokenized = [[c for c in s.split(" ") if c != ""] for s in x_cleaned]

for x in x_tokenized:
    print(x)

[]
['<NEWLINE>', 'using', '<LIBRARY>', '<NEWLINE>']
[]
['<ATTRIBUTE>', '<NEWLINE>']
['public', '<VARIABLENAME>', '<NEWLINE>']
['public', 'IntPtr', 'FindControlHandle(IntPtr', 'windowsHandle,', '<VARIABLENAME>', ')', '<NEWLINE>']
['public', '<VARIABLENAME>', '{', 'get;', 'set;', '}']
['try', '<NEWLINE>', '{', '<NEWLINE>', 'await', '<PRIVATEVARIABLE>', '<METHOD_EXEC:', '()>;', '<NEWLINE>', '<VARIABLENAME>', '=', 'true;', '<NEWLINE>', '}', '<NEWLINE>', 'catch', '(', '<VARIABLENAME>', ')', '<NEWLINE>', '{', '<NEWLINE>', '<VARIABLENAME>', '=', 'false;', '<NEWLINE>', '}', '<NEWLINE>']
['<NEWLINE>', 'return', 'this;', '<NEWLINE>']
['protected', '<VARIABLENAME>', '<VARIABLENAME>', 'CreateContract(', '<VARIABLENAME>', '<VARIABLENAME>', ')']
['<VARIABLENAME>', '=', '<CLASSINIT>', '<UpdateInfo>();', '<NEWLINE>', '<VARIABLENAME>', '=', '<CLASSINIT>', '<string>();', '<NEWLINE>']
['static', 'void', 'Main(string[]', '<VARIABLENAME>', ')', '<NEWLINE>']
['<METHOD_DEF:LENGHT_GTT:True>', '{']
['<METHOD_D

In [57]:
# 5. Treinar gramática com Word2Vec

# Número de cores da máquina
cores = multiprocessing.cpu_count()

print(f"Número de cores: {cores}")

# Treinar o modelo
model = gensim.models.Word2Vec(
    vector_size=200, 
    sg=1,     
    workers=cores-3, 
    window=20, 
    seed=42,
    alpha=0.01, 
    min_alpha=0.005)

model.build_vocab(x_tokenized, progress_per=10000)

model.train(x_tokenized, total_examples=model.corpus_count, epochs=20)

# Verificar similaridade
similarity_by_key_sample = model.wv.similar_by_key("public")
print(f"Exemplo de similaridade: + {similarity_by_key_sample}")

most_similar_sample = model.wv.similar_by_key("<METHOD_DEF:LENGHT_GTT:True>")
print(f"Exemplo de similaridade: + {most_similar_sample}")

# Número total de palavras do corpus
print(f"Número total de palavra do corpus: {model.corpus_total_words}")

Número de cores: 22
Exemplo de similaridade: + [('static', 0.9710027575492859), ('readonly', 0.965536892414093), ('private', 0.960656464099884), ('class', 0.9553068280220032), ('<CLASSNAME>', 0.9479354023933411), ('get;', 0.9467548727989197), ('protected', 0.9380561113357544), ('<METHOD_DEF:LENGHT_GTT:False>', 0.934917688369751), (':', 0.9297769069671631), ('set;', 0.9275515079498291)]
Exemplo de similaridade: + [('return', 0.9902473092079163), ('get', 0.9789010882377625), ('<CLASSMETHODINVOCATION>', 0.9636496901512146), ('false;', 0.9599233865737915), ('$', 0.949687123298645), ('-', 0.9341035485267639), ('!=', 0.9328575134277344), ('true;', 0.9291867613792419), ('<OBJECTMETHODINVOCATION>', 0.9246307015419006), (');', 0.9141374826431274)]
Número total de palavra do corpus: 3618


In [1]:
# 6. Criação dos vetores a partir do vocabulário
sample_vec = eb.textToVector(model.wv, "<METHOD_DEF:LENGHT_GTT:True> a", 200, 20)

[print(x) for x in sample_vec] 

x_vecs = np.asarray([eb.textToVector(model.wv, " ".join(x), 200, 20) for x in x_tokenized])
print(x_vecs.shape)

# Reduzir dimensão para treinamento do modelo:
pca_model = PCA(n_components=40)
pca_model.fit(x_vecs)
print("Taxa de variância: ", sum(pca_model.explained_variance_ratio_))

x_vecs = pca_model.transform(x_vecs)
x_vecs.shape

NameError: name 'eb' is not defined

In [59]:
vocabulary = list(model.wv.index_to_key) 
print(vocabulary)

model.save("./outputs/word2vec_model.model")

['<NEWLINE>', '<VARIABLENAME>', ';', '{', '}', '=', 'using', '<LIBRARY>', '<OBJECTPROPERTYACCESS>', 'public', '<STRINGVALUE>', '<METHOD_EXEC:', '<NUMBERVALUE>', '(', ')', 'private', '<METHOD_DEF:LENGHT_GTT:True>', 'if', 'class', '<CLASSNAME>', 'return', '<CLASSMETHODINVOCATION>', '<METHOD_DEF:LENGHT_GTT:False>', '<OBJECTMETHODINVOCATION>', '(<PARAM>)>;', 'get;', '<ATTRIBUTE>', 'new', 'static', '()>;', '<PARAM>,', '+', '+=', '<CLASSINIT>', '<NAMESPACE>', 'namespace', '(<PARAM>,', '<PARAM>)>;', 'void', 'readonly', ',', 'set;', '=>', '<PRIVATEVARIABLE>', 'throw', 'internal', 'delegate', 'false;', 'for', 'protected', 'in', '==', 'true;', '<ARRAYACCESS>', '<', '-', '>', 'foreach', ':', '!=', ');', 'get', '$']


In [60]:
# 7. Treinar SVM com GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(
    x_vecs, 
    labels, 
    test_size=0.3, 
    random_state=72, 
    stratify=labels)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

clf = svm.SVC()

param_grid = {
    'C': [0.1, 1, 10],  # Regularização
    'kernel': ['linear', 'rbf'],  # Função de kernel
    'gamma': ['scale', 'auto'],  # Parâmetro do kernel
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=2, scoring='f1_macro')

# Treinando o modelo com Grid Search
grid_search.fit(X_train_scaled, y_train)

# Melhor combinação de hiperparâmetros encontrada
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

print(f"Melhores parâmetros: {grid_search.best_params_}")

print(classification_report(y_test, y_pred,zero_division=0))

Melhores parâmetros: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.86      0.92      0.89        26
           1       0.60      0.75      0.67         4
           2       1.00      0.50      0.67         2
           3       0.00      0.00      0.00         3

    accuracy                           0.80        35
   macro avg       0.61      0.54      0.56        35
weighted avg       0.76      0.80      0.77        35



In [61]:
# 8. Treinar RandomForest com GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(x_vecs, labels, test_size=0.3, random_state=72, stratify=labels)

# Instanciando o classificador Random Forest
rf = RandomForestClassifier(random_state=42)

# Definir os parâmetros para o GridSearch
param_grid = {
    'n_estimators': [10, 50, 100, 200],        # Número de árvores na floresta
    'max_depth': [None, 10, 20, 30],           # Profundidade máxima das árvores
    'min_samples_split': [2, 5, 10],           # Mínimo de amostras para dividir um nó
    'min_samples_leaf': [1, 2, 4],             # Mínimo de amostras por folha
    'bootstrap': [True, False],                # Usar amostragem bootstrap
}

# Configurar GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
    cv=2, scoring='f1_macro')

# Treinando com o GridSearch
grid_search.fit(X_train, y_train)

# Melhor combinação de hiperparâmetros encontrada
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Melhores parâmetros: {grid_search.best_params_}")

print(classification_report(y_test, y_pred, zero_division=0))

Melhores parâmetros: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        26
           1       0.00      0.00      0.00         4
           2       1.00      0.50      0.67         2
           3       1.00      0.33      0.50         3

    accuracy                           0.80        35
   macro avg       0.70      0.46      0.51        35
weighted avg       0.73      0.80      0.74        35



In [65]:
# 9. Treinar XGBoost com GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(x_vecs, labels, test_size=0.3, random_state=72, stratify=labels)

clf = xgb.XGBClassifier(eval_metric='mlogloss')

param_grid = {
    'n_estimators': [50, 100],              # Número de árvores
    'max_depth': [3, 9],                    # Profundidade máxima das árvores
    'learning_rate': [0.01],                # Taxa de aprendizado
    'subsample': [0.8],                     # Subamostragem
    'colsample_bytree': [0.8],              # Colunas usadas em cada árvore
    'gamma': [0.1],                         # Regularização para reduzir overfitting
}

# Configurar GridSearchCV
grid_search = GridSearchCV(
    estimator=clf, param_grid=param_grid, 
    scoring='f1_macro', cv=2)

# Treinando com o GridSearch
grid_search.fit(X_train, y_train)

# Melhor combinação de hiperparâmetros encontrada
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Melhores parâmetros: {grid_search.best_params_}")

print(classification_report(y_test, y_pred, zero_division=0))

Melhores parâmetros: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
              precision    recall  f1-score   support

           0       0.81      1.00      0.90        26
           1       1.00      0.25      0.40         4
           2       1.00      0.50      0.67         2
           3       1.00      0.33      0.50         3

    accuracy                           0.83        35
   macro avg       0.95      0.52      0.62        35
weighted avg       0.86      0.83      0.79        35

