In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import random
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import RobustScaler

# Seed the stdlib `random` module and NumPy's global RNG with the same fixed
# value. `random.sample` further down draws from the former.
random.seed(42)
np.random.seed(42) 

# Feature Selection


In [None]:
# Column names that can be assigned to `selected_features` in the config cell.
# NOTE(review): presumably the list meant for the combined CSV (`allx`) - confirm.
selected_features_all = [
'attack_in_file_content',
'attack_in_message',
'attack_in_patch',
'authent_in_file_content',
'authent_in_message',
'authent_in_patch',
'authent_in_title',
'author_in_top_100',
'author_to_commiter_date_diff',
'authored_by_bot',
'avg_method_complexity_avg',
'avg_method_parameter_count_avg',
'avg_method_token_count_avg',
'certificat_in_file_content',
'certificat_in_message',
'certificat_in_patch',
'changed_files',
'changed_lines_count_avg',
'changed_lines_count_max',
'changed_methods_count_avg',
'changed_methods_count_max',
'changes_to_file_in_next_50_commits_avg',
'changes_to_file_in_prev_50_commits_avg',
'commits_next_30_days',
'commits_next_7_days',
'commits_prev_7_days',
'commits_since_last_merge',
'commits_to_next_merge',
'committed_by_bot',
'corrupt_in_file_content',
'corrupt_in_message',
'corrupt_in_patch',
'crash_in_file_content',
'crash_in_message',
'crash_in_patch',
'crash_in_title',
'deadlock_in_file_content',
'deadlock_in_message',
'dmm_unit_complexity',
'dmm_unit_interfacing',
'dmm_unit_size',
'exploit_in_file_content',
'exploit_in_message',
'exploit_in_patch',
'file_complexity_avg',
'file_size_avg',
'file_size_max',
'file_token_count_avg',
'file_token_count_max',
'has_mvn_code',
'has_mvn_like_code',
'has_npm_code',
'has_pypi_code',
'is_add',
'is_delete',
'is_file_recently_added',
'is_file_recently_removed',
'is_modify',
'is_rename',
'leak_in_file_content',
'leak_in_message',
'leak_in_patch',
'malicious_in_file_content',
'malicious_in_message',
'malicious_in_patch',
'max_method_complexity_max',
'max_method_parameter_count_max',
'max_method_token_count_max',
'methods_with_authent_count_avg',
'methods_with_authent_count_max',
'methods_with_certificat_count_avg',
'methods_with_certificat_count_max',
'methods_with_secur_count_avg',
'methods_with_secur_count_max',
'modified_lines_count_avg',
'modified_lines_count_max',
'modified_lines_ratio_max',
'removed_lines_count_avg',
'removed_lines_count_max',
'removed_lines_ratio_max',
'same_author_as_commiter',
'secur_in_file_content',
'secur_in_message',
'secur_in_patch',
'segfault_in_file_content',
'segfault_in_message',
'segfault_in_patch',
'sensit_in_file_content',
'sensit_in_message',
'sensit_in_patch',
'test_in_path',
'time_to_next_commit',
'time_to_next_merge',
'time_to_prev_commit',
'total_methods_count_avg',
'total_methods_count_max',
'vulnerab_in_file_content',
'vulnerab_in_message',
'vulnerab_in_patch',
'vulnerab_in_title',
]

In [None]:
# Column names used as model inputs when the mvn CSV is loaded; the config
# cell assigns this list to `selected_features`.
selected_features_mvn = [
  'attack_in_file_content',
'attack_in_message',
'attack_in_patch',
'authent_in_file_content',
'authent_in_patch',
'author_in_top_100',
'author_to_commiter_date_diff',
'avg_method_complexity_max',
'certificat_in_file_content',
'certificat_in_patch',
'changed_files',
'changed_lines_count_max',
'changed_methods_count_avg',
'changed_methods_count_max',
'changes_to_file_in_next_50_commits_avg',
'changes_to_file_in_next_50_commits_max',
'changes_to_file_in_prev_50_commits_avg',
'changes_to_file_in_prev_50_commits_max',
'commits_next_30_days',
'commits_next_7_days',
'commits_prev_7_days',
'commits_since_last_merge',
'commits_to_next_merge',
'committed_by_bot',
'corrupt_in_file_content',
'crash_in_file_content',
'deadlock_in_file_content',
'dmm_unit_complexity',
'dmm_unit_interfacing',
'dmm_unit_size',
'exploit_in_file_content',
'file_nloc_avg',
'file_nloc_max',
'file_size_max',
'has_mvn_code',
'has_mvn_like_code',
'has_npm_code',
'has_pypi_code',
'is_add',
'is_delete',
'is_file_recently_added',
'is_file_recently_removed',
'is_modify',
'leak_in_file_content',
'malicious_in_file_content',
'max_method_complexity_avg',
'max_method_complexity_max',
'max_method_nloc_avg',
'max_method_nloc_max',
'max_method_parameter_count_avg',
'max_method_parameter_count_max',
'max_method_token_count_avg',
'max_method_token_count_max',
'methods_with_authent_count_avg',
'methods_with_authent_count_max',
'methods_with_certificat_count_avg',
'methods_with_secur_count_avg',
'methods_with_secur_count_max',
'methods_with_vulnerab_count_avg',
'modified_lines_count_avg',
'modified_lines_count_max',
'modified_lines_ratio_avg',
'modified_lines_ratio_max',
'removed_lines_count_max',
'removed_lines_ratio_avg',
'removed_lines_ratio_max',
'same_author_as_commiter',
'secur_in_file_content',
'secur_in_message',
'secur_in_patch',
'sensit_in_file_content',
'sensit_in_patch',
'test_in_path',
'time_to_next_commit',
'time_to_next_merge',
'time_to_prev_commit',
'total_methods_count_avg',
'vulnerab_in_file_content',
'vulnerab_in_message',
'vulnerab_in_patch'
]

In [None]:
# Column names that can be assigned to `selected_features` in the config cell.
# NOTE(review): presumably the list meant for the npm CSV (`npm`) - confirm.
selected_features_npm = [
'added_lines_count_max',
'added_lines_ratio_max',
'attack_in_file_content',
'attack_in_message',
'attack_in_patch',
'authent_in_file_content',
'authent_in_patch',
'author_in_top_100',
'author_to_commiter_date_diff',
'avg_method_complexity_avg',
'avg_method_nloc_avg',
'avg_method_nloc_max',
'avg_method_parameter_count_avg',
'avg_method_parameter_count_max',
'avg_method_token_count_avg',
'avg_method_token_count_max',
'certificat_in_file_content',
'changed_files',
'changed_lines_count_avg',
'changed_lines_count_max',
'changed_methods_count_avg',
'changed_methods_count_max',
'changes_to_file_in_next_50_commits_max',
'changes_to_file_in_prev_50_commits_max',
'commits_next_30_days',
'commits_next_7_days',
'commits_prev_7_days',
'commits_since_last_merge',
'commits_to_next_merge',
'corrupt_in_file_content',
'crash_in_file_content',
'dmm_unit_complexity',
'dmm_unit_interfacing',
'dmm_unit_size',
'file_nloc_avg',
'file_nloc_max',
'file_size_avg',
'file_size_max',
'has_npm_code',
'has_npm_like_code',
'has_pypi_code',
'is_add',
'is_file_recently_added',
'is_file_recently_removed',
'is_modify',
'is_rename',
'leak_in_file_content',
'max_method_complexity_max',
'max_method_nloc_max',
'max_method_parameter_count_max',
'max_method_token_count_max',
'modified_lines_count_avg',
'modified_lines_count_max',
'modified_lines_ratio_avg',
'modified_lines_ratio_max',
'removed_lines_count_avg',
'removed_lines_count_max',
'removed_lines_ratio_avg',
'removed_lines_ratio_max',
'same_author_as_commiter',
'secur_in_file_content',
'secur_in_message',
'secur_in_patch',
'sensit_in_file_content',
'test_in_filename',
'test_in_path',
'time_to_next_commit',
'time_to_next_merge',
'time_to_prev_commit',
'total_methods_count_avg',
'vulnerab_in_file_content',
'vulnerab_in_message',
'vulnerab_in_patch']

In [None]:
# Column names that can be assigned to `selected_features` in the config cell.
# NOTE(review): presumably the list meant for the pypi CSV (`pypi`) - confirm.
selected_features_pypi = [
'attack_in_file_content',
'attack_in_message',
'authent_in_file_content',
'authent_in_patch',
'author_to_commiter_date_diff',
'avg_method_parameter_count_avg',
'certificat_in_file_content',
'changed_files',
'changed_lines_count_avg',
'changed_lines_count_max',
'changed_methods_count_avg',
'changed_methods_count_max',
'changes_to_file_in_next_50_commits_max',
'changes_to_file_in_prev_50_commits_max',
'commits_next_30_days',
'commits_next_7_days',
'commits_prev_7_days',
'commits_since_last_merge',
'commits_to_next_merge',
'corrupt_in_file_content',
'dmm_unit_complexity',
'dmm_unit_interfacing',
'dmm_unit_size',
'file_size_avg',
'file_size_max',
'file_token_count_max',
'has_pypi_code',
'is_add',
'is_file_recently_added',
'is_modify',
'max_method_complexity_max',
'max_method_nloc_max',
'max_method_parameter_count_max',
'max_method_token_count_avg',
'max_method_token_count_max',
'modified_lines_count_avg',
'modified_lines_count_max',
'modified_lines_ratio_max',
'removed_lines_count_avg',
'removed_lines_count_max',
'removed_lines_ratio_max',
'secur_in_file_content',
'secur_in_message',
'secur_in_patch',
'test_in_path',
'time_to_next_commit',
'time_to_next_merge',
'time_to_prev_commit',
'total_methods_count_avg',
'total_methods_count_max',
'vulnerab_in_message',
]

# Config

In [None]:
# Paths of the available feature CSVs; only one of them is loaded below.
allx = r'/data/features.csv'
npm = r'/data/features_npm.csv'
pypi = r'/data/features_pypi.csv'
mvn = r'/data/features_mvn.csv'

# Active dataset. These three lines have to be switched together: the CSV that
# is read, the feature list used for it, and the prefix of the result files
# (the later cells append 'rf.json' / 'lr.json' to `result_file`).
df_all = pd.read_csv(mvn)
selected_features = selected_features_mvn
result_file = r'/res/res_mvn_'

In [None]:
# Sanitize time_to_next_merge where nan is a valid value: a missing entry is
# replaced by a sentinel ten times larger than the largest observed value, so
# it stays distinguishable from every real measurement.
max_merge = df_all['time_to_next_merge'].max() * 10
df_all.loc[df_all['time_to_next_merge'].isna(),'time_to_next_merge'] = max_merge

# Every remaining NaN becomes 0. A single call covers the whole frame: with a
# scalar fill value the axis argument does not change which cells are filled,
# so a second pass along axis=1 would have nothing left to do.
df_all.fillna(0, inplace=True)

In [None]:
# Split by repository rather than by row, so that all rows of one repository
# land on the same side: 75% of the repositories are used for training, the
# remaining ones for evaluation.
repos = df_all['label_repo_full_name'].unique()

# A dedicated generator seeded with the notebook's seed value. Re-running this
# cell therefore always yields the same split instead of advancing the global
# `random` state and drawing a different set of repositories each time.
train_repos = random.Random(42).sample(list(repos), int(0.75*len(repos)))

# One vectorised membership mask replaces two row-wise apply() passes that each
# scanned the train_repos list for every row.
in_train = df_all['label_repo_full_name'].isin(train_repos)
df = df_all[in_train]
eval_df = df_all[~in_train]

# Row count and number of positive labels: full data, training, evaluation.
print(df_all.shape[0],(df_all['label_security_related']==True).sum())
print(df.shape[0], (df['label_security_related']==True).sum())
print(eval_df.shape[0], (eval_df['label_security_related']==True).sum())


In [None]:

# Training side: feature matrix and target taken from the training split.
df_selected = df
X, y = df_selected[selected_features], df_selected['label_security_related']

# Held-out side: the same columns taken from the evaluation split.
eval_X, eval_y = eval_df[selected_features], eval_df['label_security_related']


In [None]:
# parameters

# NOTE(review): VERBOSE_LVL is not referenced by any cell visible in this
# notebook (the grid searches pass verbose=3 directly).
VERBOSE_LVL = 10
# Number of copies of the positive rows that go into the training sample.
oversample_ratio = 1 #  DONT OVERSAMPLE IN CROSS FOLD VALIDATION
# Negative rows drawn per positive row when down-sampling the negatives.
undersample_ratio = 4

In [None]:
# Split the training features by class. Boolean indexing selects rows by label
# only and keeps the column dtypes; the where(...).dropna() form it replaces
# would additionally drop any row holding a NaN feature and upcast the rest.
x_positive = X[y == True]
x_negative = X[y == False]

# Oversample the positives by repeating them oversample_ratio times.
x_positive = pd.concat([x_positive] * oversample_ratio)

# Undersample the negatives to undersample_ratio rows per positive row, but
# never request more rows than exist (DataFrame.sample raises in that case).
# The draw is pinned to the notebook's seed value so that re-running the cell
# returns the same sample.
n_negative = min(len(x_negative), len(x_positive) * undersample_ratio)
x_negative = x_negative.sample(n_negative, random_state=42)

# Negatives first, positives second; the label vector follows the same order.
X_resampled = pd.concat([x_negative, x_positive])
y_resampled = np.array([False]*len(x_negative) + [True]*len(x_positive))

# The scaler is fitted on the resampled training rows only. From here on
# X_resampled holds the transformed values instead of the DataFrame.
scaler = RobustScaler().fit(X_resampled)
X_resampled = scaler.transform(X_resampled)

# The evaluation rows are only scaled with the fitted scaler, not resampled.
eval_X_resampled = scaler.transform(eval_X)


In [None]:
# Random-forest hyper-parameter search, done in two passes. RUN_1 (kept below
# for reference) explored tree count, criterion and split/leaf sizes; RUN_2
# fixes those and explores depth, weight fraction and max_features.

# # RUN_1
# n_estimators = [40, 60, 80, 100, 120, 140]
# criterion = ['gini', 'entropy', 'log_loss']
# max_depth = [None]
# min_samples_split = [2, 3, 5, 7]
# min_samples_leaf = [1, 2, 4, 8]
# min_weight_fraction_leaf = [0.0]
# max_features = ['sqrt', 'log2']

# RUN_2
n_estimators = [100]
criterion = ['gini']
max_depth = [None, 50, 100, 150, 200]
min_samples_split = [2, 3]
min_samples_leaf = [1]
min_weight_fraction_leaf = [0.0, 0.01]
max_features = ['sqrt', 'log2']


parameters = {
'n_estimators' : n_estimators,
'criterion':criterion,
'max_depth':max_depth,
'min_samples_split':min_samples_split,
'min_samples_leaf':min_samples_leaf,
'min_weight_fraction_leaf':min_weight_fraction_leaf,
'max_features':max_features,
}

# The estimator gets its own fixed seed: every candidate is then scored with
# the same source of randomness, and the outcome no longer depends on how much
# global RNG state earlier cells have consumed.
clf = GridSearchCV(RandomForestClassifier(random_state=42),parameters, cv=4, scoring='f1', verbose=3)
clf.fit(X_resampled, y_resampled)



In [None]:
# Best random-forest parameter combination, then its cross-validated score.
print(clf.best_params_, clf.best_score_, sep='\n')

In [None]:
# Store the winning random-forest configuration and its score as JSON.
result = dict(
    best_params=clf.best_params_,
    best_score=clf.best_score_,
    model='RF',
)
with open(result_file + 'rf.json', 'w') as f:
  json.dump(obj=result, fp=f)


In [None]:
# Logistic-regression hyper-parameter search.
# NOTE(review): recent scikit-learn releases expect None instead of the string
# 'none' for an unpenalised model - confirm against the installed version.
penalty = ['l1', 'l2', 'elasticnet', 'none']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
class_weights = [None, 'balanced']
# Mixing values tried for 'elasticnet', which cannot be fitted without one.
l1_ratios = [0.25, 0.5, 0.75]

# Penalties each solver can actually fit. Searching the full cross product of
# `penalty` and `solvers` spends a large share of the fits on pairs that can
# only fail, and never evaluates 'elasticnet' because no l1_ratio is supplied.
supported_penalties = {
    'newton-cg': {'l2', 'none'},
    'lbfgs': {'l2', 'none'},
    'liblinear': {'l1', 'l2'},
    'sag': {'l2', 'none'},
    'saga': {'l1', 'l2', 'elasticnet', 'none'},
}

# One sub-grid per valid (solver, penalty) pair; GridSearchCV accepts a list of
# grids and searches each of them.
parameters = []
for solver_name in solvers:
  for penalty_name in penalty:
    if penalty_name not in supported_penalties[solver_name]:
      continue
    sub_grid = {
        'penalty': [penalty_name],
        'solver': [solver_name],
        'class_weight': class_weights,
        'max_iter': [20000],
    }
    if penalty_name == 'elasticnet':
      sub_grid['l1_ratio'] = l1_ratios
    parameters.append(sub_grid)

# Fixed seed so the solvers that shuffle the data give repeatable scores.
clfLR = GridSearchCV(LogisticRegression(random_state=42),parameters, cv=4, scoring='f1', verbose=3)
clfLR.fit(X_resampled, y_resampled)



In [None]:
# Best logistic-regression parameter combination, then its cross-validated score.
print(clfLR.best_params_, clfLR.best_score_, sep='\n')

In [None]:
# Store the winning logistic-regression configuration and its score as JSON.
result = dict(
    best_params=clfLR.best_params_,
    best_score=clfLR.best_score_,
    model='LR',
)
with open(result_file + 'lr.json', 'w') as f:
  json.dump(obj=result, fp=f)


# Predict

In [None]:
from sklearn.model_selection import cross_val_predict


# Cross-validated predictions over the complete dataset (training and
# evaluation repositories together). Dedicated names keep the training X / y
# from the earlier cells intact; rebinding them here would silently change the
# input of the resampling cell if that cell were run again afterwards.
# NOTE(review): these features are not scaled, whereas the grid searches ran
# on RobustScaler output - confirm this is intended, especially for 'LR'.
X_full = df_all[selected_features]
y_full = df_all['label_security_related']

# Both estimators stay available; fill in the tuned hyper-parameters and pick
# one by key. Assigning both to `model` in sequence would always discard the
# LogisticRegression instance and evaluate the random forest.
candidate_models = {
    'LR': LogisticRegression(
        #PARAMS
    ),
    'RF': RandomForestClassifier(
        #PARAMS
    ),
}
model = candidate_models['RF']

y_pred = cross_val_predict(model, X_full, y_full, cv=5)



In [None]:
# y_pred is the array returned by cross_val_predict, with one prediction per
# row of df_all in row order, so it cannot be indexed by column name. Each
# prediction is paired with the 'label_sha' value of its row instead.
# NOTE(review): assumes df_all carries a 'label_sha' column - confirm.
for sha, predicted in zip(df_all['label_sha'], y_pred):
  print(predicted, sha)