In [1]:
import pandas as pd

In [2]:
# Informational columns: none of these appear in X_COLUMNS or Y_COLUMN.
INFO_COLUMNS = [
    "class", "type", "linked", "fileschanged", "ndev",
    "age", "exp", "rexp", "sexp", "glm_probability",
    "classification", "time_of_day", "day_of_week", "is_weekend",
    "author_experience", "author_ownership", "hash", "file",
    "num_files_changed",
]

# Target column (kept as a one-element list).
Y_COLUMN = ["contains_bug"]

# Base metric names. Each one appears in X_COLUMNS twice: once prefixed with
# "d_" and once under its plain name, in this exact order.
_BASE_METRICS = [
    "cbo", "cboModified", "fanin", "fanout", "wmc", "dit", "noc",
    "rfc", "lcom", "lcom*", "tcc", "lcc",
    "totalMethodsQty", "staticMethodsQty", "publicMethodsQty",
    "privateMethodsQty", "protectedMethodsQty", "defaultMethodsQty",
    "visibleMethodsQty", "abstractMethodsQty", "finalMethodsQty",
    "synchronizedMethodsQty",
    "totalFieldsQty", "staticFieldsQty", "publicFieldsQty",
    "privateFieldsQty", "protectedFieldsQty", "defaultFieldsQty",
    "finalFieldsQty", "synchronizedFieldsQty",
    "nosi", "loc", "returnQty", "loopQty", "comparisonsQty",
    "tryCatchQty", "parenthesizedExpsQty", "stringLiteralsQty",
    "numbersQty", "assignmentsQty", "mathOperationsQty", "variablesQty",
    "maxNestedBlocksQty", "anonymousClassesQty", "innerClassesQty",
    "lambdasQty", "uniqueWordsQty", "modifiers", "logStatementsQty",
]

# Remaining feature columns that have no "d_" counterpart.
# "entropy_bucket" is deliberately left out of the feature list.
_EXTRA_FEATURES = [
    "fix", "entrophy", "la", "ld",
    "net_lines_changed", "absolute_lines_changed", "lines_per_file",
    "changed_file_count",
]

# Feature columns: "d_"-prefixed metrics, then the plain metrics, then extras.
X_COLUMNS = (
    ["d_" + metric for metric in _BASE_METRICS]
    + list(_BASE_METRICS)
    + list(_EXTRA_FEATURES)
)

In [3]:
# Load the merged dataset from disk, then report its row count.
merged_csv_path = 'merged_datasets/new/tomcat_merged_df.csv'
df = pd.read_csv(merged_csv_path)

print(len(df))

126165


In [4]:
# Drop the 'classification' column: it's not needed, and mostly NaN values.
# errors='ignore' makes the cell safe to re-run; without it, a second
# execution raises KeyError because the column has already been removed.
df = df.drop(columns='classification', errors='ignore')

In [5]:
###################################
#        Problematic Columns      #
###################################

# Per the original author's note: when a class has fewer than 2 visible
# methods, CK reports -1 for these metrics. Some rows contain NaN instead, so
# NaN is mapped to the same "not applicable" marker (-1).
for column in ('d_lcom*', 'd_tcc', 'd_lcc', 'lcom*', 'tcc', 'lcc'):
    df[column] = df[column].fillna(-1)

In [6]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,d_cbo,d_cboModified,d_fanin,d_fanout,d_wmc,d_dit,d_noc,d_rfc,d_lcom,d_lcom*,...,net_lines_changed,absolute_lines_changed,lines_per_file,author_experience,author_ownership,changed_file_count,entropy_bucket,num_files_changed,hash,file
0,0,0,0,0,0,0,0,0,0,-1.0,...,5.0,5.0,5.0,4138.0,43.0,1,low,1,b6eca3e216f94ec05e7ceb833aa469523a08ad93,java/org/apache/tomcat/util/openssl/openssl_h....
1,0,0,0,0,0,0,0,0,0,-1.0,...,5.0,5.0,5.0,4138.0,43.0,1,low,1,b6eca3e216f94ec05e7ceb833aa469523a08ad93,java/org/apache/tomcat/util/openssl/openssl_h....
2,0,0,0,0,0,0,0,0,0,-1.0,...,5.0,5.0,5.0,4138.0,43.0,1,low,1,b6eca3e216f94ec05e7ceb833aa469523a08ad93,java/org/apache/tomcat/util/openssl/openssl_h....
3,0,0,0,0,0,0,0,0,0,-1.0,...,5.0,5.0,5.0,4138.0,43.0,1,low,1,b6eca3e216f94ec05e7ceb833aa469523a08ad93,java/org/apache/tomcat/util/openssl/openssl_h....
4,0,0,0,0,0,0,0,0,0,-1.0,...,5.0,5.0,5.0,4138.0,43.0,1,low,1,b6eca3e216f94ec05e7ceb833aa469523a08ad93,java/org/apache/tomcat/util/openssl/openssl_h....


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np


# Extra names needed by this cell beyond the import lines above it.
from tensorflow.keras import Input
from tensorflow.keras.utils import set_random_seed

SEED = 42

# Seed Python, NumPy and TensorFlow in one call. Seeding NumPy alone leaves
# Keras weight initialisation, dropout masks and batch shuffling unseeded, so
# the reported accuracy would change from run to run.
# (GPU kernels may still introduce small run-to-run differences.)
set_random_seed(SEED)

# NOTE(review): this cell trains on synthetic data from make_classification,
# not on `df` loaded above; X_COLUMNS only supplies the feature count and the
# column labels.
X, y = make_classification(
    n_samples=1000,
    n_features=len(X_COLUMNS),
    n_informative=20,
    n_redundant=10,
    random_state=SEED,
)
X = pd.DataFrame(X, columns=X_COLUMNS)

# Hold out 20% of the samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Standardise features; the scaler is fitted on the training split only so no
# test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Deep neural network: three hidden ReLU layers with dropout, sigmoid output.
# An explicit Input layer replaces `input_shape=` on the first Dense layer,
# which Keras warns about for Sequential models.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Binary classification set-up.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train; 20% of the training split is used for validation.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

# Threshold the sigmoid outputs at 0.5. predict() returns shape (n, 1), so
# flatten to (n,) to match y_test before scoring.
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1739132766.202444  133351 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2248 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5


Epoch 1/50


I0000 00:00:1739132769.791874  140150 service.cc:148] XLA service 0x7fe264009ee0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1739132769.792353  140150 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce GTX 1650, Compute Capability 7.5
2025-02-09 23:26:09.846358: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1739132770.095565  140150 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m19/20[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.5241 - loss: 0.7316

I0000 00:00:1739132773.010409  140150 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.5226 - loss: 0.7324 - val_accuracy: 0.7250 - val_loss: 0.6355
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6347 - loss: 0.6483 - val_accuracy: 0.7875 - val_loss: 0.5885
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6635 - loss: 0.6282 - val_accuracy: 0.8062 - val_loss: 0.5411
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7155 - loss: 0.5519 - val_accuracy: 0.8125 - val_loss: 0.4893
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7355 - loss: 0.5370 - val_accuracy: 0.8313 - val_loss: 0.4447
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7865 - loss: 0.4407 - val_accuracy: 0.8062 - val_loss: 0.4189
Epoch 7/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd


# Extra names needed by this cell beyond the import lines above it.
from sklearn.metrics import classification_report
from tensorflow.keras import Input
from tensorflow.keras.utils import set_random_seed

SEED = 42

# Seed Python, NumPy and TensorFlow in one call. Seeding NumPy alone leaves
# Keras weight initialisation, dropout masks and batch shuffling unseeded, so
# the reported metrics would change from run to run.
# (GPU kernels may still introduce small run-to-run differences.)
set_random_seed(SEED)

# NOTE(review): this cell trains on synthetic data from make_classification,
# not on `df` loaded above; X_COLUMNS only supplies the feature count and the
# column labels.
X, y = make_classification(
    n_samples=1000,
    n_features=len(X_COLUMNS),
    n_informative=20,
    n_redundant=10,
    random_state=SEED,
)
X = pd.DataFrame(X, columns=X_COLUMNS)

# Hold out 20% of the samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Standardise features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wider/deeper network with batch normalisation after the first three hidden
# layers. An explicit Input layer replaces `input_shape=` on the first Dense
# layer, which Keras warns about for Sequential models.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid')
])

# Binary classification set-up with a lower learning rate than the baseline.
model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train; 20% of the training split is used for validation.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

# Threshold the sigmoid outputs at 0.5. predict() returns shape (n, 1), so
# flatten to (n,) to match y_test before scoring.
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)

print(f"Improved Model Accuracy: {accuracy}")

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.5507 - loss: 0.8104 - val_accuracy: 0.6250 - val_loss: 0.6773
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6105 - loss: 0.7011 - val_accuracy: 0.6500 - val_loss: 0.6620
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6233 - loss: 0.7160 - val_accuracy: 0.6375 - val_loss: 0.6454
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6013 - loss: 0.7107 - val_accuracy: 0.6562 - val_loss: 0.6280
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6317 - loss: 0.6798 - val_accuracy: 0.6938 - val_loss: 0.6057
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6216 - loss: 0.6294 - val_accuracy: 0.7188 - val_loss: 0.5878
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━

In [12]:
# Detailed per-class report for whichever y_test / y_pred are currently in
# the kernel (i.e. from the most recently executed model cell).
from sklearn.metrics import classification_report

report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       100
           1       0.89      0.80      0.84       100

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200



In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

SEED = 42

# Feature set for the XGBoost run: the notebook-wide X_COLUMNS minus three
# columns. It is kept under its own name so that this cell no longer rebinds
# X_COLUMNS; rebinding made the feature count seen by the other model cells
# depend on which cell happened to run last. The resulting list has the same
# columns, in the same order, as the literal list it replaces.
XGB_EXCLUDED_COLUMNS = {"fix", "net_lines_changed", "absolute_lines_changed"}
XGB_FEATURE_COLUMNS = [
    name for name in X_COLUMNS if name not in XGB_EXCLUDED_COLUMNS
]

# NOTE(review): as in the neural-network cells, the model is fitted on
# synthetic data from make_classification, not on `df`; the column list only
# supplies the feature count and the column labels.
np.random.seed(SEED)
X, y = make_classification(
    n_samples=1000,
    n_features=len(XGB_FEATURE_COLUMNS),
    n_informative=20,
    n_redundant=10,
    random_state=SEED,
)
X = pd.DataFrame(X, columns=XGB_FEATURE_COLUMNS)

# Hold out 20% of the samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Standardise features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost classifier. The deprecated `use_label_encoder` argument is no
# longer passed: the labels here are already integer 0/1, and recent XGBoost
# releases ignore the argument and warn about it.
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1.0,
    random_state=SEED,
    eval_metric='logloss',
)

# Fit on the scaled training split.
xgb_model.fit(X_train_scaled, y_train)

# Predict hard class labels for the held-out split.
y_pred = xgb_model.predict(X_test_scaled)

# Fraction of held-out samples classified correctly.
accuracy = accuracy_score(y_test, y_pred)

print(f"XGBoost Model Accuracy: {accuracy}")


ModuleNotFoundError: No module named 'xgboost'