#Pre-processing


In [1]:
#import the necessary packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import ast
import joblib

In [2]:
# Load the dataset
df = pd.read_csv('smart_contracts.csv')
print(df.head())

                                      address  \
0  0x006699d34AA3013605d468d2755A2Fe59A16B12B   
1  0x00c83aeCC790e8a4453e5dD3B0B4b3680501a7A7   
2  0x010c5322d78c88ca18282b0a072a8913648b3038   
3  0x0114622386c1a00686e594c70682d7aa0f8afa29   
4  0x01A6F6Ac4F5b2564e8C52BA687E7019D0E81E7e8   

                                         source_code  \
0  pragma solidity 0.5.4;\n\ninterface IERC20 {\n...   
1  pragma solidity 0.6.10;\npragma experimental A...   
2  pragma solidity 0.6.5;\npragma experimental AB...   
3  pragma solidity 0.6.6;\n\nlibrary Address {\n\...   
4  pragma solidity 0.6.8;\npragma experimental AB...   

                                            bytecode                 slither  
0  0x608060405234801561001057600080fd5b5060043610...                     [4]  
1  0x608060405234801561001057600080fd5b5060043610...            [6, 4, 9, 7]  
2  0x608060405234801561001057600080fd5b5060043610...                    [38]  
3  0x608060405234801561001057600080fd5b5060043610...

In [3]:
#labels
LABELS = {0: 'access-control', 1: 'arithmetic', 2: 'other', 3: 'reentrancy', 4: 'safe', 5: 'unchecked-calls'}

In [4]:
#slither array
def extract_first_element(arr):
    arr = ast.literal_eval(arr)
    return arr[0] if arr else None

df['slither'] = df['slither'].apply(extract_first_element)
df = df[df['slither'].isin(LABELS.keys())]

#Pipeline

In [5]:
# Custom transformer to select columns
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.column]

In [6]:
# Custom transformer to convert text data
class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vectorizer):
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        return self.vectorizer.transform(X)

In [7]:
# Combine source code and bytecode text features
source_code_transformer = Pipeline([
    ('selector', ColumnSelector('source_code')),
    ('tfidf', TextTransformer(TfidfVectorizer(max_features=5000))),
])

bytecode_transformer = Pipeline([
    ('selector', ColumnSelector('bytecode')),
    ('tfidf', TextTransformer(TfidfVectorizer(max_features=5000))),
])

In [8]:
# Combine all features
combined_features = FeatureUnion([
    ('source_code', source_code_transformer),
    ('bytecode', bytecode_transformer),
])

In [9]:
# Final pipeline
pipeline = Pipeline([
    ('features', combined_features),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

#Dataset

In [10]:
# Prepare the dataset
X = df[['source_code', 'bytecode']]
y = df['slither']

In [11]:
# Split the dataset and reset indices
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

#Model

In [12]:
# Train the model
pipeline.fit(X_train, y_train)
# Save the model to a file
joblib.dump(pipeline, 'smart_contract_vulnerability_model.pkl')


['smart_contract_vulnerability_model.pkl']

#Evaluation

In [13]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
# Map predictions to vulnerability labels, handling unexpected labels
y_pred_labels = [LABELS.get(label, 'unknown') for label in y_pred]

In [14]:
# Display the results
for i in range(len(X_test)):
    address = df.iloc[X_test.index[i]]['address']
    pred_label = y_pred_labels[i]
    source_code_snippet = df.iloc[X_test.index[i]]['source_code'][:200]  # Show a snippet of the source code

    print(f"Address: {address}")
    print(f"Predicted Vulnerability: {pred_label}")
    print(f"Source Code Snippet: {source_code_snippet}...")
    print()

Address: 0x006699d34AA3013605d468d2755A2Fe59A16B12B
Predicted Vulnerability: safe
Source Code Snippet: pragma solidity 0.5.4;

interface IERC20 {





    function balanceOf(address account) external view returns (uint256);


    function transfer(address recipient, uint256 amount) external returns (bo...

Address: 0x01A6F6Ac4F5b2564e8C52BA687E7019D0E81E7e8
Predicted Vulnerability: safe
Source Code Snippet: pragma solidity 0.6.8;
pragma experimental ABIEncoderV2;

contract Initializable {



  /**

   * @dev Indicates that the contract has been initialized.

   */

  bool private initialized;



  /**

 ...

Address: 0x01b23286ff60a543ec29366ae8d6b6274ca20541
Predicted Vulnerability: safe
Source Code Snippet: pragma solidity 0.4.26;

interface IERC20 {

  function totalSupply() external view returns (uint256);



  function balanceOf(address who) external view returns (uint256);



  function allowance(add...

Address: 0x01b952402442630232f32d48380a557015b7c5ec
Predicted Vulnerability:

In [15]:
# Evaluation
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           4       0.50      1.00      0.67         3
           5       0.00      0.00      0.00         2

    accuracy                           0.50         6
   macro avg       0.17      0.33      0.22         6
weighted avg       0.25      0.50      0.33         6

Accuracy: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Gradio Interface

In [16]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.31.5-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.16.4 (from gradio)
  Downloading gradio_client-0.16.4-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.9/315.9 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [17]:
import gradio as gr
import joblib
import pandas as pd

model = joblib.load('smart_contract_vulnerability_model.pkl')

LABELS = {0: 'access-control', 1: 'arithmetic', 2: 'other', 3: 'reentrancy', 4: 'safe', 5: 'unchecked-calls'}

def predict_vulnerability(source_code, bytecode):
    data = pd.DataFrame({'source_code': [source_code], 'bytecode': [bytecode]})
    prediction = model.predict(data)
    pred_label = LABELS.get(prediction[0], 'unknown')
    return pred_label

iface = gr.Interface(
    fn=predict_vulnerability,
    inputs=[
        gr.Textbox(lines=20, placeholder="Enter smart contract source code here...", label="Source Code"),
        gr.Textbox(lines=5, placeholder="Enter smart contract bytecode here...", label="Bytecode")
    ],
    outputs="text",
    title="Smart Contract Vulnerability Predictor",
    description="Enter the smart contract source code and bytecode to predict its potential vulnerability."
)

iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6fa0229b0003ffe379.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


