In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('Oracle.csv')

selected_features = ['login', 'Total number of Repo activities', 'Unique number of Repo activities',
                     'Total number of PR activities', 'Unique number of PR activities',
                     'Total number of Issue activities', 'Unique number of Issue activities',
                     'Total number of Commit activities', 'Unique number of Commit activities',
                     'Number of following', 'Number of followers',
                     'Account tag', 'Account name', 'Account bio', 'Account login',
                     'Median Activity per Day', 'Median Creation Time of the first activities',
                     'Text similarity', 'Text similarity of Comments Before Bot',
                     'Text similarity of Commit Messages', 'Type', 'Data_Source']

df = df[selected_features]

# Identify text and int features
text_features = ['login', 'Account tag', 'Account name', 'Account bio', 'Account login', 'Type', 'Data_Source']
int_features = [col for col in df.columns if col not in text_features]

# Preprocess text features using LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for feature in text_features:
    df[feature] = label_encoder.fit_transform(df[feature].astype(str))

# Remove correlated features
numeric_columns = df[int_features].select_dtypes(include=np.number).columns
corr_matrix = df[numeric_columns].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column])]
df = df.drop(columns=to_drop)

# Split features and target variable
X = df.drop(columns=['Type', 'Data_Source'])
y = df['Type']

# Scale the input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: One-hot encode text features using CountVectorizer
X_text_encoded = pd.get_dummies(df[text_features])

bots_df = df[df['Type'] == 0]
humans_df = df[df['Type'] == 1]

# Step 13: Split bots into 'issue_bots' and 'commit_bots' datasets
issue_bots = bots_df[bots_df['Data_Source'] == 1]
commit_bots = bots_df[bots_df['Data_Source'] == 0]

# Step 14: Add human accounts to both datasets
issue_bots_df = pd.concat([issue_bots, humans_df], ignore_index=True)
commit_bots_df = pd.concat([commit_bots, humans_df], ignore_index=True)

# Split features and target variable for issue_bots dataset
X_issue_bots = issue_bots_df.drop(columns=['Type', 'Data_Source'])
y_issue_bots = issue_bots_df['Type']

# Split features and target variable for commit_bots dataset
X_commit_bots = commit_bots_df.drop(columns=['Type', 'Data_Source'])
y_commit_bots = commit_bots_df['Type']

# Step 7: Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support-Vector Machine': SVC(probability=True)
}

# Step 8: Evaluating classifiers on issue_bots and commit_bots datasets
results_issue_bots = {}
results_commit_bots = {}
skf_issue_bots = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
skf_commit_bots = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for clf_name, clf in classifiers.items():
    precision_list_issue_bots = []
    recall_list_issue_bots = []
    f1_list_issue_bots = []
    auc_list_issue_bots = []
    precision_list_commit_bots = []
    recall_list_commit_bots = []
    f1_list_commit_bots = []
    auc_list_commit_bots = []

    # Inside the for loop for issue_bots
    for train_index, test_index in skf_issue_bots.split(X_issue_bots, y_issue_bots):
        X_train_issue_bots, X_test_issue_bots = X_issue_bots.iloc[train_index], X_issue_bots.iloc[test_index]
        X_train_issue_bots_text, X_test_issue_bots_text = X_text_encoded[X_text_encoded.index.isin(train_index)], X_text_encoded[X_text_encoded.index.isin(test_index)]

        # Preprocess text features using CountVectorizer
        count_vectorizer = CountVectorizer()
        X_train_issue_bots_text_encoded = count_vectorizer.fit_transform(X_train_issue_bots_text.astype(str).sum(axis=1))
        X_test_issue_bots_text_encoded = count_vectorizer.transform(X_test_issue_bots_text.astype(str).sum(axis=1))

        # Combine one-hot encoded text features with other numerical features
        X_train_issue_bots_text_encoded_df = pd.DataFrame(X_train_issue_bots_text_encoded.toarray()).add_prefix('text_')
        X_test_issue_bots_text_encoded_df = pd.DataFrame(X_test_issue_bots_text_encoded.toarray()).add_prefix('text_')
        X_train_issue_bots = pd.concat([X_train_issue_bots.reset_index(drop=True), X_train_issue_bots_text_encoded_df], axis=1)
        X_test_issue_bots = pd.concat([X_test_issue_bots.reset_index(drop=True), X_test_issue_bots_text_encoded_df], axis=1)

        clf.fit(X_train_issue_bots, y_issue_bots.iloc[train_index])
        y_pred_issue_bots = clf.predict(X_test_issue_bots)

        precision_list_issue_bots.append(precision_score(y_issue_bots.iloc[test_index], y_pred_issue_bots))
        recall_list_issue_bots.append(recall_score(y_issue_bots.iloc[test_index], y_pred_issue_bots))
        f1_list_issue_bots.append(f1_score(y_issue_bots.iloc[test_index], y_pred_issue_bots))
        auc_list_issue_bots.append(roc_auc_score(y_issue_bots.iloc[test_index], clf.predict_proba(X_test_issue_bots)[:, 1]))

     # Calculate average precision, recall, F1-score, and AUC for issue_bots
    avg_precision_issue_bots = np.nanmean(precision_list_issue_bots)
    avg_recall_issue_bots = np.nanmean(recall_list_issue_bots)
    avg_f1_issue_bots = np.nanmean(f1_list_issue_bots)
    avg_auc_issue_bots = np.nanmean(auc_list_issue_bots)

    results_issue_bots[clf_name] = {
        'Precision (%)': avg_precision_issue_bots * 100,
        'Recall (%)': avg_recall_issue_bots * 100,
        'F1-score (%)': avg_f1_issue_bots * 100,
        'AUC (%)': avg_auc_issue_bots * 100,
    }

   # Inside the for loop for commit_bots
    for train_index, test_index in skf_commit_bots.split(X_commit_bots, y_commit_bots):
        X_train_commit_bots, X_test_commit_bots = X_commit_bots.iloc[train_index], X_commit_bots.iloc[test_index]
        X_train_commit_bots_text, X_test_commit_bots_text = X_text_encoded[X_text_encoded.index.isin(train_index)], X_text_encoded[X_text_encoded.index.isin(test_index)]

    # Preprocess text features using CountVectorizer
        count_vectorizer = CountVectorizer()
        X_train_commit_bots_text_encoded = count_vectorizer.fit_transform(X_train_commit_bots_text.astype(str).sum(axis=1))
        X_test_commit_bots_text_encoded = count_vectorizer.transform(X_test_commit_bots_text.astype(str).sum(axis=1))

    # Combine one-hot encoded text features with other numerical features
        X_train_commit_bots_text_encoded_df = pd.DataFrame(X_train_commit_bots_text_encoded.toarray()).add_prefix('text_')
        X_test_commit_bots_text_encoded_df = pd.DataFrame(X_test_commit_bots_text_encoded.toarray()).add_prefix('text_')
        X_train_commit_bots = pd.concat([X_train_commit_bots.reset_index(drop=True), X_train_commit_bots_text_encoded_df], axis=1)
        X_test_commit_bots = pd.concat([X_test_commit_bots.reset_index(drop=True), X_test_commit_bots_text_encoded_df], axis=1)

        clf.fit(X_train_commit_bots, y_commit_bots.iloc[train_index])
        y_pred_commit_bots = clf.predict(X_test_commit_bots)

        precision_list_commit_bots.append(precision_score(y_commit_bots.iloc[test_index], y_pred_commit_bots))
        recall_list_commit_bots.append(recall_score(y_commit_bots.iloc[test_index], y_pred_commit_bots))
        f1_list_commit_bots.append(f1_score(y_commit_bots.iloc[test_index], y_pred_commit_bots))
        auc_list_commit_bots.append(roc_auc_score(y_commit_bots.iloc[test_index], clf.predict_proba(X_test_commit_bots)[:, 1]))

    results_commit_bots[clf_name] = {
        'Precision (%)': sum(precision_list_commit_bots) / len(precision_list_commit_bots) * 100,
        'Recall (%)': sum(recall_list_commit_bots) / len(recall_list_commit_bots) * 100,
        'F1-score (%)': sum(f1_list_commit_bots) / len(f1_list_commit_bots) * 100,
        'AUC (%)': sum(auc_list_commit_bots) / len(auc_list_commit_bots) * 100,
}

# Step 9: Display the results for Table 4 (Issue Bots)
print("Table 4: Performance of classifiers on issue_bots dataset:")
print(pd.DataFrame(results_issue_bots).transpose())

# Step 10: Display the results for Table 4 (Commit Bots)
print("\nTable 4: Performance of classifiers on commit_bots dataset:")
print(pd.DataFrame(results_commit_bots).transpose())

Table 4: Performance of classifiers on issue_bots dataset:
                        Precision (%)  Recall (%)  F1-score (%)    AUC (%)
Logistic Regression         97.405248   98.961012     98.174937  88.561961
K-Nearest Neighbors         91.367499   98.938797     95.001887  64.313776
Decision Tree               97.533460   98.915866     98.218060  87.269527
Random Forest               97.620110   98.938439     98.273301  89.387692
Support-Vector Machine      90.682012  100.000000     95.113310  59.253302

Table 4: Performance of classifiers on commit_bots dataset:
                        Precision (%)  Recall (%)  F1-score (%)    AUC (%)
Logistic Regression         98.921507   98.961472     98.939920  93.226822
K-Nearest Neighbors         95.256783   99.774164     97.463020  58.853956
Decision Tree               99.123401   99.164581     99.142759  90.906401
Random Forest               98.764460   99.074339     98.918073  93.000488
Support-Vector Machine      95.184919  100.000000     9