In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
#from google.colab import drive
from sklearn.metrics import roc_auc_score

In [2]:
## (Colab only) Mount Google Drive to access the graph-enriched data file
#drive.mount('/content/drive')

## Path to the CSV file
#file_path_small = '/content/drive/MyDrive/NS-project/graph_data.csv'
#file_path_full = '/content/drive/MyDrive/NS-project/graph_data_full.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Relative locations of the two CSV files: the small one and the full one.
file_path_small = 'data/graph_data.csv'
file_path_full = 'data/graph_data_full.csv'

# Read both files (small first, then full) into DataFrames.
data_small, data = (
    pd.read_csv(csv_path) for csv_path in (file_path_small, file_path_full)
)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the small dataset.
data_small.describe()

Unnamed: 0,label,num_edges,num_nodes,density,nr_connected_components,average_path_length,diameter,clustering_coefficient,assortativity,modularity_label_prop,modularity_girvan_newman,most_frequent_degrees,degree_frequencies
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,0.0,0.0
mean,0.507,24.992,24.026,0.118248,1.0,2.422608,4.563,0.046237,-0.662832,0.24974,0.255417,,
std,0.500201,20.232333,17.489975,0.048215,0.0,0.669035,2.249687,0.073039,0.22039,0.176892,0.195096,,
min,0.0,10.0,11.0,0.021978,1.0,1.757576,2.0,0.0,-1.0,0.0,-0.05,,
25%,0.0,13.0,13.0,0.08129,1.0,1.977941,3.0,0.0,-0.847469,0.108383,0.08284,,
50%,1.0,16.5,17.0,0.125,1.0,2.209524,4.0,0.0,-0.679982,0.245291,0.25,,
75%,1.0,27.0,26.25,0.153846,1.0,2.652403,6.0,0.081675,-0.500533,0.39455,0.404959,,
max,1.0,126.0,97.0,0.272727,1.0,7.423473,17.0,0.757702,-0.069693,0.650042,0.728239,,


In [5]:
# Number of missing values in each column of the small dataset.
data_small.isnull().sum()

edge_list                      0
label                          0
num_edges                      0
num_nodes                      0
density                        0
is_connected                   0
nr_connected_components        0
average_path_length            0
diameter                       0
clustering_coefficient         0
assortativity                  0
modularity_label_prop          0
modularity_girvan_newman       0
most_frequent_degrees       1000
degree_frequencies          1000
degrees                        0
degree_probabilities           0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the full dataset.
data.describe()

Unnamed: 0,label,num_edges,num_nodes,density,nr_connected_components,average_path_length,diameter,clustering_coefficient,assortativity,modularity_label_prop,modularity_girvan_newman
count,203088.0,203088.0,203088.0,203088.0,203088.0,203088.0,203088.0,203088.0,203088.0,203088.0,203088.0
mean,0.512413,24.851375,23.926968,0.117135,1.0,2.42785,4.578065,0.046727,-0.654529,0.255165,0.26321
std,0.499847,19.143689,16.554645,0.048414,0.0,0.653612,2.171184,0.069778,0.216767,0.174359,0.192136
min,0.0,10.0,11.0,0.020619,1.0,1.690909,2.0,0.0,-1.0,0.0,-0.053633
25%,0.0,13.0,13.0,0.078818,1.0,1.983333,3.0,0.0,-0.829268,0.119898,0.083333
50%,1.0,17.0,17.0,0.117647,1.0,2.213333,4.0,0.0,-0.672453,0.252066,0.258264
75%,1.0,28.0,27.0,0.153846,1.0,2.641653,5.0,0.082332,-0.492248,0.398438,0.410494
max,1.0,185.0,97.0,0.345455,1.0,9.860362,27.0,0.903363,0.315789,0.703316,0.750434


In [7]:
# Number of missing values in each column of the full dataset.
data.isnull().sum()

edge_list                   0
label                       0
num_edges                   0
num_nodes                   0
density                     0
is_connected                0
nr_connected_components     0
average_path_length         0
diameter                    0
clustering_coefficient      0
assortativity               0
modularity_label_prop       0
modularity_girvan_newman    0
degrees                     0
degree_probabilities        0
dtype: int64

In [8]:
# Let's find the features to use, working on the small dataset.
# Candidate features are every column except the target and the columns
# listed below, which are excluded from the candidate set.
X = data_small.drop(
    columns=[
        'label',
        'edge_list',
        'most_frequent_degrees',
        'degree_frequencies',
        'degrees',
        'degree_probabilities',
    ]
)
y = data_small['label']

# Hold out 10% of the rows; the selector below is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Fit a random forest and let SelectFromModel keep the columns whose
# importance passes its default threshold.
sel = SelectFromModel(RandomForestClassifier(random_state=0))
sel.fit(X_train, y_train)

# get_support() is a boolean mask over X_train's columns.
selected_feat = X_train.columns[sel.get_support()]
print('Features to use:', selected_feat)

Fetures to use: Index(['average_path_length', 'assortativity', 'modularity_label_prop',
       'modularity_girvan_newman'],
      dtype='object')


In [9]:
# Let's find the best parameters for RandomForestClassifier on the small dataset.
# The grid has 3**5 = 243 combinations, each fitted on 5 CV folds. Run serially
# this took ~10 min; n_jobs=-1 below spreads the fits over all available cores.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
}

# Keep only the selected feature columns. The target is re-read here so this
# cell does not depend on whichever `y` an earlier-run cell left behind.
X = data_small[selected_feat]
y = data_small['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

model = RandomForestClassifier(random_state=0)

# Perform grid search using 5-fold cross-validation on the training part.
# Parallelism is across candidate fits only; the estimator seed is fixed.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and best cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model with the best hyperparameters on the held-out test set
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy:", accuracy)

Best Hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.7666666666666666
Accuracy: 0.77


In [10]:
# Let's use the tuned parameters on the full dataset; it takes ~1 min.
# Index by the selected features directly: if the full dataset lacked one of
# them this raises a KeyError instead of silently training on fewer columns.
X = data[selected_feat]
y = data['label']

# Hyperparameters copied from the grid-search output on the small dataset.
model = RandomForestClassifier(
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
    random_state=0,
)

# Initialize the ShuffleSplit cross-validator: 5 random splits, 10% held out each time
shuffle_split = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0)

# Perform cross-validation with random folds, collecting one score per split
accuracy_scores = []
auc_scores = []
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model (refitting discards the previous split's fit)
    model.fit(X_train, y_train)

    # Predict probabilities for the positive class (second column)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    # Calculate the AUC score
    auc_score = roc_auc_score(y_test, y_pred_prob)
    auc_scores.append(auc_score)

    # Evaluate the model's accuracy on the same held-out rows
    accuracy = model.score(X_test, y_test)
    accuracy_scores.append(accuracy)

# Calculate and print the average accuracy and AUC across all splits
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
average_auc = sum(auc_scores) / len(auc_scores)
print("Average Accuracy:", average_accuracy)
print("Average AUC:", average_auc)

Average Accuracy: 0.7728790191540696
Average AUC: 0.8360184493062398


In [11]:
# Accuracy on the held-out rows of each ShuffleSplit iteration.
accuracy_scores

[0.7732532374809198,
 0.773696390762716,
 0.7730070412132553,
 0.769018661677089,
 0.7754197646363681]

In [12]:
# ROC AUC on the held-out rows of each ShuffleSplit iteration.
auc_scores

[0.8376905096893367,
 0.8359198050326401,
 0.8379970406177906,
 0.8315592728739172,
 0.8369256183175147]