-----------------------------
Copyright 2022 Software Improvement Group

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-----------------------------

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import random
import numpy as np
# Pin both global random number generators (NumPy's and the standard
# library's) to the same fixed seed so that reruns of the notebook draw the
# same random numbers.
np.random.seed(42)
random.seed(42)

In [None]:
# Locations of the extracted feature tables: the combined table plus three
# subset tables (npm / PyPI / Maven, judging by the file names).
# Only the combined table is loaded in this cell.
allx, npm, pypi, mvn = (
    r'/data/features.csv',
    r'/data/features_npm.csv',
    r'/data/features_pypi.csv',
    r'/data/features_mvn.csv',
)

df_all = pd.read_csv(allx)

In [None]:
# Sanitize time_to_next_merge, where NaN is a valid value: rather than
# zero-filling it like the other columns, replace it with a sentinel ten times
# larger than the largest observed value.
max_merge = df_all['time_to_next_merge'].max() * 10
df_all['time_to_next_merge'] = df_all['time_to_next_merge'].fillna(max_merge)

# Every remaining missing value becomes 0. A scalar fill replaces all NaNs
# regardless of axis, so a single call is sufficient (a second pass with
# axis=1 would find nothing left to fill).
df_all.fillna(0, inplace=True)

In [None]:
# Total number of rows, then the number of rows labelled as security related.
print(len(df_all))
print(df_all['label_security_related'].eq(True).sum())

In [None]:
# Split by repository rather than by row: 90% of the repositories (chosen at
# random) provide the training rows, the remaining repositories provide the
# evaluation rows, so no repository contributes rows to both sets.
_byrepo = df_all.groupby('label_repo_full_name')  # NOTE(review): unused in the visible cells
repos = df_all['label_repo_full_name'].unique()
train_repos = random.sample(list(repos), int(0.9*len(repos)))

# One vectorised membership test instead of two row-wise apply() passes that
# each scanned the train_repos list for every row.
_in_train = df_all['label_repo_full_name'].isin(train_repos)
df = df_all[_in_train]
eval_df = df_all[~_in_train]

# Rows and security-related rows per split.
print(df.shape[0], (df['label_security_related']==True).sum())
print(eval_df.shape[0], (eval_df['label_security_related']==True).sum())


# Feature Selection


In [None]:
# Features flagged as broken (per the list name); they are removed from the
# feature matrix before any selection or training happens.
broken_features = [
    'file_changed_method_count_avg', 'file_changed_method_count_max',
]

In [None]:
# Features rejected for having (almost) no variance. The list is kept in two
# groups: per-method keyword counters and code-type flags.
_flat_method_keyword_counts = [
    'methods_with_attack_count_avg',
    'methods_with_corrupt_count_avg',
    'methods_with_crash_count_avg',
    'methods_with_deadlock_count_avg',
    'methods_with_deadlock_count_max',
    'methods_with_exploit_count_avg',
    'methods_with_exploit_count_max',
    'methods_with_segfault_count_avg',
    'methods_with_segfault_count_max',
    'methods_with_sensit_count_avg',
    'methods_with_vulnerab_count_avg',
]
_flat_code_type_flags = [
    'has_npm_like_code',
    'has_pypi_like_code',
]

no_variance_features = [*_flat_method_keyword_counts, *_flat_code_type_flags]

In [None]:
# Features rejected because they correlate strongly with a feature that is
# kept. Entries that are commented out are NOT rejected, i.e. they stay in the
# feature matrix.
_correlated_title_keywords = [
    'attack_in_title',
    'corrupt_in_title',
    'deadlock_in_title',
    'malicious_in_title',
    'segfault_in_title',
    'sensit_in_title',
    'secur_in_title',
    # 'vulnerab_in_title',
    'exploit_in_title',
    'certificat_in_title',
    # 'authent_in_title',
    'leak_in_title',
    # 'crash_in_title',
]
_correlated_code_metrics = [
    'added_lines_count_avg',
    'added_lines_count_max',
    'added_lines_ratio_avg',
    'added_lines_ratio_max',
    'avg_method_complexity_max',
    'max_method_complexity_avg',
    'max_method_nloc_avg',
    'avg_method_nloc_max',
    'avg_method_parameter_count_max',
    'max_method_parameter_count_avg',
    'file_complexity_max',
    'max_method_token_count_avg',
    'avg_method_token_count_max',
    'test_in_filename',
    'modified_lines_ratio_avg',
    'removed_lines_ratio_avg',
    'avg_method_nloc_avg',
    'max_method_nloc_max',
    'file_nloc_avg',
    'file_nloc_max',
    'changes_to_file_in_next_50_commits_max',
    'changes_to_file_in_prev_50_commits_max',
]

highly_correlated_features = _correlated_title_keywords + _correlated_code_metrics

In [None]:

# Remove all rejected features from the training rows in a single pass.
# Index.difference returns the surviving column labels.
_rejected_features = broken_features + no_variance_features + highly_correlated_features
df_selected = df[df.columns.difference(_rejected_features)]

# The label_* columns are metadata / the prediction target, not model inputs.
_label_columns = ['label_repo_full_name', 'label_sha', 'label_commit_date', 'label_security_related']
y = df_selected['label_security_related']
X = df_selected[df_selected.columns.difference(_label_columns)]


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Diagnostic: list every feature that a VarianceThreshold of 0.001 would
# reject. The output is formatted so it can be pasted into a Python list.
sel = VarianceThreshold(threshold=0.001)
sel.fit(X)
sup = sel.get_support()  # boolean mask, True for features that pass
for x in X.columns[~sup]:
    print(f'reject \'{x}\',')


In [None]:
# Names of the columns to inspect visually. The list is empty, so nothing is
# plotted until names are added,
# e.g. columns = ['methods_with_segfault_count_max']
columns = []


def _tinted_violin(values, colour):
    """Draw one violin for *values* and paint its body in *colour*."""
    parts = plt.violinplot(values)
    parts['bodies'][0].set_facecolor(colour)


df_sec = df[df['label_security_related'] == True]
for col in columns:
    # Only values below 10 are plotted, for all rows and for the
    # security-related rows separately.
    dfx = df[df[col]<10]
    dfx_sec = df_sec[df_sec[col]<10]
    print(col)
    # Blue: all training rows. Red: security-related rows, drawn on top.
    _tinted_violin(dfx[col], 'blue')
    _tinted_violin(dfx_sec[col], 'red')
    plt.show()

In [None]:
# Diagnostic: print every pair of features whose absolute pairwise
# correlation exceeds 0.75.
correlation_matrix = X.corr()
corr = correlation_matrix.values
column_names = correlation_matrix.columns

# Restricting the check to the strict upper triangle reports each pair once
# and skips the diagonal. argwhere walks the matrix row by row, i.e. in the
# same (i, j) order as a nested i < j loop.
_strongly_correlated = np.abs(np.triu(corr, k=1)) > 0.75
for i, j in np.argwhere(_strongly_correlated):
    print('reject', column_names[i], ' ', column_names[j], ' ', corr[i,j])

# Training

In [None]:
# parameters

# Which additional classifiers to train on the selected features.
train_LR = True
train_SVC_linear = True
train_SVC_sigmoid = False

# Class rebalancing: positives are replicated `oversample_ratio` times and
# `undersample_ratio` negatives are sampled per (replicated) positive.
oversample_ratio = 1
undersample_ratio = 8

# Number of trees in the random forest.
RF_estimators = 60

# Verbosity handed to the estimators; values above 5 also print the list of
# selected features.
VERBOSE_LVL = 10



In [None]:
import numpy as np

# manual rebalancing
#
# Positives are replicated `oversample_ratio` times; negatives are randomly
# down-sampled to `undersample_ratio` negatives per (replicated) positive.

# Boolean-mask indexing keeps the column dtypes intact and cannot lose rows,
# unlike a where(...).dropna() round trip, which upcasts columns and drops
# any row that legitimately contains a NaN.
x_positive = X[y == True]
x_negative = X[y == False]

x_positive = pd.concat([x_positive] * oversample_ratio)

# DataFrame.sample raises a ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the negatives available.
_n_negative = min(len(x_negative), len(x_positive) * undersample_ratio)
x_negative = x_negative.sample(_n_negative)

# Negatives first, then positives; the label vector mirrors that order.
X_resampled = pd.concat([x_negative, x_positive])
y_resampled = np.array([False]*len(x_negative) + [True]*len(x_positive))


In [None]:
# Shapes of the positive part, the negative part and their concatenation.
print(*(part.shape for part in (x_positive, x_negative, X_resampled)))

In [None]:
# scaling
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the rebalanced training rows and scale them in one step.
# The fitted `scaler` is kept so evaluation data can be transformed with the
# same parameters later on.
scaler = RobustScaler()
X_resampled = scaler.fit_transform(X_resampled)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import recall_score, precision_score



In [None]:
# Train selector & RF
#
# Recursive feature elimination with cross-validation around a random forest:
# one feature is removed per step, candidates are scored by F1 over 5 folds,
# and at least 40 features are always kept.
model = RandomForestClassifier(n_estimators=RF_estimators, random_state=42)

_rfecv_options = {
    'step': 1,
    'cv': 5,
    'min_features_to_select': 40,
    'scoring': 'f1',
    'verbose': VERBOSE_LVL,
}
selector = RFECV(model, **_rfecv_options)
selector.fit(X_resampled, y_resampled)

# Best mean cross-validated F1 over all feature counts that were tried.
print('max f1:', max(selector.cv_results_['mean_test_score']))

In [None]:
if VERBOSE_LVL > 5:
    # selector.support_ is a boolean mask over the input features; pairing it
    # with the columns of X yields the names of the features that were kept.
    print('Selected features')
    for feature_name, is_kept in zip(X.columns, selector.support_):
        if is_kept:
            print(feature_name)

In [None]:
import numpy as np

# Build the evaluation inputs from the held-out repositories, applying the
# same feature rejection as for the training rows.
_rejected_features = broken_features + no_variance_features + highly_correlated_features
df_eval_selected = eval_df[eval_df.columns.difference(_rejected_features)]

eval_X = df_eval_selected[df_eval_selected.columns.difference(['label_repo_full_name', 'label_sha', 'label_commit_date', 'label_security_related'])]
eval_y = df_eval_selected['label_security_related']

# Boolean-mask indexing keeps the column dtypes intact and cannot lose rows,
# unlike a where(...).dropna() round trip.
eval_x_positive = eval_X[eval_y == True]
eval_x_negative = eval_X[eval_y == False]
print(eval_x_positive.shape, eval_x_negative.shape)

# Balanced evaluation subset with the same negative:positive ratio as the
# training data. The request is capped because DataFrame.sample raises a
# ValueError when asked for more rows than exist.
_n_eval_negative = min(len(eval_x_negative), len(eval_x_positive) * undersample_ratio)
eval_x_negative_sampled = eval_x_negative.sample(_n_eval_negative)

# Negatives first, then positives; the label vector mirrors that order.
eval_X_balanced = pd.concat([eval_x_negative_sampled, eval_x_positive])
eval_y_balanced = np.array([False]*len(eval_x_negative_sampled) + [True]*len(eval_x_positive))


In [None]:
# Cast the evaluation labels to integers (presumably booleans -> 0/1; confirm
# the column dtype) before they are passed to the metric functions below.
eval_y = eval_y.astype(int)

In [None]:
# eval RF selector
#
# Three scores are reported as (recall, precision):
#   'RF train' - the rebalanced training data,
#   'RF test'  - the balanced subset of the held-out repositories,
#   'RF eval'  - all rows of the held-out repositories (plus F1).

eval_scaled = scaler.transform(eval_X)
eval_selected_scaled = selector.transform(eval_scaled)
X_resampled_selected = selector.transform(X_resampled)
# The balanced subset must go through the scaler as well: the selector and
# every model were fitted on scaled features, so unscaled input would make
# the 'test' scores meaningless.
eval_X_balanced_scaled = scaler.transform(eval_X_balanced)
eval_X_balanced_selected = selector.transform(eval_X_balanced_scaled)

train_preds = selector.predict(X_resampled)
recall = recall_score(y_resampled, train_preds)
precision = precision_score(y_resampled, train_preds)
print('RF train', recall, precision)

eval_balanced_y_pred = selector.predict(eval_X_balanced_scaled)
recall = recall_score(eval_y_balanced, eval_balanced_y_pred)
precision = precision_score(eval_y_balanced, eval_balanced_y_pred)
print('RF test', recall, precision)

eval_y_pred = selector.predict(eval_scaled)
precision_rf = precision_score(eval_y, eval_y_pred)
recall_rf = recall_score(eval_y, eval_y_pred)
# F1 is the harmonic mean of precision and recall; report 0.0 instead of
# dividing by zero when both are 0 (e.g. no positive predictions at all).
f1_rf = 2*recall_rf*precision_rf/(recall_rf+precision_rf) if (recall_rf+precision_rf) > 0 else 0.0

print('RF eval', recall_rf, precision_rf, f1_rf)


In [None]:
if train_SVC_linear:
    # Linear support vector classifier (L2 penalty, squared hinge loss)
    # fitted on the selector-reduced training features.
    _linear_svc_options = {
        'penalty': 'l2',
        'loss': 'squared_hinge',
        'random_state': 42,
        'verbose': VERBOSE_LVL,
    }
    SVC_model = LinearSVC(**_linear_svc_options)
    SVC_model.fit(X_resampled_selected, y_resampled)


In [None]:
if train_SVC_linear:
    # Scores are printed as (recall, precision) for the training data
    # ('train'), the balanced held-out subset ('test') and the full held-out
    # set ('eval', which additionally reports F1).
    train_preds = SVC_model.predict(X_resampled_selected)
    recall = recall_score(y_resampled, train_preds)
    precision = precision_score(y_resampled, train_preds)
    print('SVC train', recall, precision)

    eval_balanced_y_pred = SVC_model.predict(eval_X_balanced_selected)
    recall = recall_score(eval_y_balanced, eval_balanced_y_pred)
    precision = precision_score(eval_y_balanced, eval_balanced_y_pred)
    print('SVC test', recall, precision)

    eval_y_pred = SVC_model.predict(eval_selected_scaled)
    recall_svc = recall_score(eval_y, eval_y_pred)
    precision_svc = precision_score(eval_y, eval_y_pred)
    # Report 0.0 instead of dividing by zero when precision and recall are
    # both 0 (e.g. the model predicts no positives).
    f1_svc = 2*recall_svc*precision_svc/(recall_svc+precision_svc) if (recall_svc+precision_svc) > 0 else 0.0

    print('SVC eval', recall_svc, precision_svc, f1_svc)

In [None]:
if train_SVC_sigmoid:
    # Support vector classifier with a sigmoid kernel, fitted on the
    # selector-reduced training features. SVC takes a boolean verbose flag,
    # hence the comparison.
    SVC_model_2 = SVC(
        kernel ='sigmoid',
        random_state=42,
        verbose=VERBOSE_LVL>0)

    SVC_model_2.fit(X_resampled_selected, y_resampled)

    # Scores are printed as (recall, precision) for the training data
    # ('train'), the balanced held-out subset ('test') and the full held-out
    # set ('eval', which additionally reports F1).
    train_preds = SVC_model_2.predict(X_resampled_selected)
    recall = recall_score(y_resampled, train_preds)
    precision = precision_score(y_resampled, train_preds)
    print('SVC2 train', recall, precision)

    eval_balanced_y_pred = SVC_model_2.predict(eval_X_balanced_selected)
    recall = recall_score(eval_y_balanced, eval_balanced_y_pred)
    precision = precision_score(eval_y_balanced, eval_balanced_y_pred)
    print('SVC2 test', recall, precision)

    eval_y_pred = SVC_model_2.predict(eval_selected_scaled)
    recall_svc2 = recall_score(eval_y, eval_y_pred)
    precision_svc2 = precision_score(eval_y, eval_y_pred)
    # Report 0.0 instead of dividing by zero when precision and recall are
    # both 0 (e.g. the model predicts no positives).
    f1_svc2 = 2*recall_svc2*precision_svc2/(precision_svc2+recall_svc2) if (precision_svc2+recall_svc2) > 0 else 0.0

    print('SVC2 eval', recall_svc2, precision_svc2, f1_svc2)

In [None]:
if train_LR:
    # L2-regularised logistic regression fitted on the selector-reduced
    # training features, with the iteration limit set to 20000.
    _lr_options = {
        'penalty': 'l2',
        'max_iter': 20000,
        'random_state': 42,
    }
    LR_model = LogisticRegression(**_lr_options)
    LR_model.fit(X_resampled_selected, y_resampled)


In [None]:
if train_LR:
    # Scores are printed as (recall, precision) for the training data
    # ('train'), the balanced held-out subset ('test') and the full held-out
    # set ('eval', which additionally reports F1).
    train_preds = LR_model.predict(X_resampled_selected)
    recall = recall_score(y_resampled, train_preds)
    precision = precision_score(y_resampled, train_preds)
    print('LR train', recall, precision)

    eval_balanced_y_pred = LR_model.predict(eval_X_balanced_selected)
    recall = recall_score(eval_y_balanced, eval_balanced_y_pred)
    precision = precision_score(eval_y_balanced, eval_balanced_y_pred)
    print('LR test', recall, precision)

    eval_y_pred = LR_model.predict(eval_selected_scaled)
    recall_lr = recall_score(eval_y, eval_y_pred)
    precision_lr = precision_score(eval_y, eval_y_pred)
    # Report 0.0 instead of dividing by zero when precision and recall are
    # both 0 (e.g. the model predicts no positives).
    f1_lr = 2*recall_lr*precision_lr/(precision_lr+recall_lr) if (precision_lr+recall_lr) > 0 else 0.0

    print('LR eval', recall_lr, precision_lr, f1_lr)

# Results

In [None]:
from joblib import dump, load


# Persist the fitted RFECV selector, which is also used as the random-forest
# predictor above.
dump(selector, 'RF_selector.joblib')

# The optional models exist only when their training flag was set.
if train_LR:
    dump(LR_model, 'LR_model_all.joblib')

if train_SVC_linear:
    dump(SVC_model, 'SVC_linear_model_all.joblib')

if train_SVC_sigmoid:
    dump(SVC_model_2, 'SVC_sigmoid_model_all.joblib')


In [None]:

# Summary of the scores on the full held-out set: recall, precision, F1.
print('RF eval', recall_rf, precision_rf, f1_rf)

# The metrics of the optional models are defined only when that model was
# trained; printing them unconditionally raises a NameError otherwise.
if train_LR:
    print('LR eval', recall_lr, precision_lr, f1_lr)
if train_SVC_linear:
    print('SVC eval', recall_svc, precision_svc, f1_svc)
if train_SVC_sigmoid:
    print('SVC2 eval', recall_svc2, precision_svc2, f1_svc2)