In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler


#  Load the dataset
df = pd.read_csv('Oracle.csv')
selected_features = ['login', 'Total number of Repo activities', 'Unique number of Repo activities',
                     'Total number of PR activities', 'Unique number of PR activities',
                     'Total number of Issue activities', 'Unique number of Issue activities',
                     'Total number of Commit activities', 'Unique number of Commit activities',
                     'Number of following', 'Number of followers',
                     'Account tag', 'Account name', 'Account bio', 'Account login',
                     'Median Activity per Day', 'Median Creation Time of the first activities',
                     'Text similarity', 'Text similarity of Comments Before Bot',
                     'Text similarity of Commit Messages', 'Type', 'Data_Source']
df = df[selected_features]
# Identify text and int features
text_features = ['login', 'Account tag', 'Account name', 'Account bio', 'Account login', 'Type', 'Data_Source']
int_features = [col for col in df.columns if col not in text_features]

#  Preprocess text features using LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for feature in text_features:
    df[feature] = label_encoder.fit_transform(df[feature].astype(str))

#  Remove correlated features
numeric_columns = df[int_features].select_dtypes(include=np.number).columns
corr_matrix = df[numeric_columns].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column])]
df = df.drop(columns=to_drop)

# Split features and target variable
X = df.drop(columns=['Type', 'Data_Source'])
y = df['Type']
# Scale the input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Step 6: One-hot encode text features using CountVectorizer
X_text = pd.get_dummies(df[text_features])

# Step 7: Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support-Vector Machine': SVC(probability=True)
}

# Step 8: Evaluate classifiers using stratified 10-fold cross-validation
results = {}
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for clf_name, clf in classifiers.items():
    precision_list = []
    recall_list = []
    f1_list = []
    auc_list = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        X_train_text, X_test_text = X_text.iloc[train_index], X_text.iloc[test_index]

        # Step 9: Preprocess text features using CountVectorizer
        count_vectorizer = CountVectorizer()
        X_train_text_encoded = count_vectorizer.fit_transform(X_train_text.astype(str).sum(axis=1))
        X_test_text_encoded = count_vectorizer.transform(X_test_text.astype(str).sum(axis=1))

        # Step 10: Combine one-hot encoded text features with other numerical features
        X_train_text_encoded_df = pd.DataFrame(X_train_text_encoded.toarray()).add_prefix('text_')
        X_test_text_encoded_df = pd.DataFrame(X_test_text_encoded.toarray()).add_prefix('text_')
        X_train = pd.concat([X_train.reset_index(drop=True), X_train_text_encoded_df], axis=1)
        X_test = pd.concat([X_test.reset_index(drop=True), X_test_text_encoded_df], axis=1)

        clf.fit(X_train, y[train_index])
        y_pred = clf.predict(X_test)

        precision_list.append(precision_score(y[test_index], y_pred))
        recall_list.append(recall_score(y[test_index], y_pred))
        f1_list.append(f1_score(y[test_index], y_pred))
        auc_list.append(roc_auc_score(y[test_index], clf.predict_proba(X_test)[:, 1]))

    results[clf_name] = {
        'Precision (%)': sum(precision_list) / len(precision_list) * 100,
        'Recall (%)': sum(recall_list) / len(recall_list) * 100,
        'F1-score (%)': sum(f1_list) / len(f1_list) * 100,
        'AUC (%)': sum(auc_list) / len(auc_list) * 100,
    }

# Step 11: Display the results for Table 3
print("Table 3: Performance of the classifiers:")
print(pd.DataFrame(results).transpose())

Table 3: Performance of the classifiers:
                        Precision (%)  Recall (%)  F1-score (%)    AUC (%)
Logistic Regression         96.846227   98.938643     97.878478  90.071050
K-Nearest Neighbors         87.925350   97.177410     92.318926  65.643179
Decision Tree               96.951701   98.938643     97.932735  89.307996
Random Forest               96.951701   98.938643     97.932735  90.176585
Support-Vector Machine      86.704539  100.000000     92.878868  62.492746
