In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

# Load the dataset
file_path = 'smart_contract_dataset.csv'
data = pd.read_csv(file_path)

# Step 1: Data Collection and Preprocessing
# Encode categorical features
label_encoder = LabelEncoder()
data['Code Snippet'] = label_encoder.fit_transform(data['Code Snippet'])
data['Function Call Patterns'] = label_encoder.fit_transform(data['Function Call Patterns'])
data['Control Flow Graph'] = label_encoder.fit_transform(data['Control Flow Graph'])
data['Opcode Sequence'] = label_encoder.fit_transform(data['Opcode Sequence'])

# Define features and labels
X = data.drop('Label', axis=1)
y = data['Label']

# Split the dataset into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 2: Feature Extraction (already done in preprocessing)

# Step 3: Supervised Learning
# Train a Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Train a Support Vector Machine Classifier
svm_clf = SVC(kernel='linear', random_state=42)
svm_clf.fit(X_train, y_train)

# Evaluate the models on the validation set
rf_val_preds = rf_clf.predict(X_val)
svm_val_preds = svm_clf.predict(X_val)

print("Random Forest Validation Accuracy:", accuracy_score(y_val, rf_val_preds))
print("SVM Validation Accuracy:", accuracy_score(y_val, svm_val_preds))

# Step 4: Static and Dynamic Analysis (Placeholder for actual implementation)
def static_analysis(contract_code):
    # Placeholder for static analysis implementation
    # Identify vulnerabilities like reentrancy, integer overflow, access control issues
    vulnerabilities = []
    # Implement static analysis logic here
    return vulnerabilities

def dynamic_analysis(contract_code):
    # Placeholder for dynamic analysis implementation
    # Execute the smart contract in a controlled environment
    behavior_issues = []
    # Implement dynamic analysis logic here
    return behavior_issues

# Step 5: Automated Scanning (Placeholder for actual implementation)
def automated_scanning(contract_code):
    static_vulnerabilities = static_analysis(contract_code)
    dynamic_issues = dynamic_analysis(contract_code)
    report = {
        "static_vulnerabilities": static_vulnerabilities,
        "dynamic_issues": dynamic_issues
    }
    return report

# Generate reports for testing data
for index, row in X_test.iterrows():
    contract_code = label_encoder.inverse_transform([row['Code Snippet']])[0]
    report = automated_scanning(contract_code)
    print(f"Contract ID: {data.loc[index, 'Contract ID']}, Report: {report}")

# Final evaluation on the test set
rf_test_preds = rf_clf.predict(X_test)
svm_test_preds = svm_clf.predict(X_test)

print("\nRandom Forest Test Accuracy:", accuracy_score(y_test, rf_test_preds))
print("Random Forest Test Classification Report:\n", classification_report(y_test, rf_test_preds))

print("\nSVM Test Accuracy:", accuracy_score(y_test, svm_test_preds))
print("SVM Test Classification Report:\n", classification_report(y_test, svm_test_preds))

# Step 6: Deployment (Placeholder for actual deployment steps)
def deploy_model(model):
    # Placeholder for deployment logic
    # Integrate the model into blockchain development systems or deploy as a web-based tool
    pass

# Deploy the best performing model
deploy_model(rf_clf if accuracy_score(y_test, rf_test_preds) > accuracy_score(y_test, svm_test_preds) else svm_clf)


Random Forest Validation Accuracy: 1.0
SVM Validation Accuracy: 0.9977777777777778
Contract ID: 309, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 2381, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 2762, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 1238, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 1299, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 2821, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 1023, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 775, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 131, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 2067, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 402, Report: {'static_vulnerabilities': [], 'dynamic_issues': []}
Contract ID: 1459, Report: {'static_vulne

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'smart_contract_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset:")
print(data.head())

# Feature Extraction
# Convert categorical columns to numeric using one-hot encoding
data_encoded = pd.get_dummies(data, columns=['Function Call Patterns', 'Control Flow Graph', 'Opcode Sequence'])

# Extract features and labels
X = data_encoded.drop(columns=['Contract ID', 'Code Snippet', 'Label'])
y = data_encoded['Label']

# Split the dataset into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train machine learning models
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the models using validation set
rf_val_pred = rf_model.predict(X_val)
svm_val_pred = svm_model.predict(X_val)

print("Random Forest Validation Accuracy:", accuracy_score(y_val, rf_val_pred))
print("SVM Validation Accuracy:", accuracy_score(y_val, svm_val_pred))

# Select the best model based on validation performance
best_model = rf_model if accuracy_score(y_val, rf_val_pred) > accuracy_score(y_val, svm_val_pred) else svm_model

# Evaluate the best model using the test set
test_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Classification Report:")
print(classification_report(y_test, test_pred))

# Static Analysis (Example: Identify reentrancy, integer overflow, and access control issues)
def static_analysis(code_snippet):
    vulnerabilities = []
    if 'reentrancy' in code_snippet:
        vulnerabilities.append('Reentrancy')
    if 'integer overflow' in code_snippet:
        vulnerabilities.append('Integer Overflow')
    if 'access control' in code_snippet:
        vulnerabilities.append('Access Control')
    return vulnerabilities

data['Vulnerabilities'] = data['Code Snippet'].apply(static_analysis)
print("Static Analysis Results:")
print(data[['Contract ID', 'Vulnerabilities']].head())

# Dynamic Analysis (Pseudo code as actual execution requires blockchain environment)
def dynamic_analysis(contract_id):
    # Placeholder for dynamic analysis code
    # Execute the smart contract in a controlled environment
    # Observe behavior and identify potential weaknesses
    return []

data['Dynamic Vulnerabilities'] = data['Contract ID'].apply(dynamic_analysis)
print("Dynamic Analysis Results:")
print(data[['Contract ID', 'Dynamic Vulnerabilities']].head())

# Automated Scanning (Example: Real-time vulnerability identification)
def automated_scanning(code_snippet):
    # Placeholder for automated scanning using blockchain development tools
    vulnerabilities = static_analysis(code_snippet)
    return vulnerabilities

data['Automated Scan Results'] = data['Code Snippet'].apply(automated_scanning)
print("Automated Scanning Results:")
print(data[['Contract ID', 'Automated Scan Results']].head())

# Generate detailed reports for developers
def generate_report(contract_id, vulnerabilities):
    report = f"Contract ID: {contract_id}\nVulnerabilities: {', '.join(vulnerabilities)}"
    return report

data['Reports'] = data.apply(lambda row: generate_report(row['Contract ID'], row['Automated Scan Results']), axis=1)
print("Generated Reports:")
print(data[['Contract ID', 'Reports']].head())

# Save the detailed reports to a file
data[['Contract ID', 'Reports']].to_csv('smart_contract_reports.csv', index=False)
print("Reports saved to 'smart_contract_reports.csv'")


Dataset:
   Contract ID                   Code Snippet Function Call Patterns  \
0         1217  contract_vtwawnvsmqtfssnlcunf       {call48, call67}   
1         2623  contract_bwwerteghxlgxchhkrue       {call30, call72}   
2         2078  contract_tlkanexsvuxvezzulnwy       {call18, call80}   
3         1416  contract_fqujpcccvilvzzmkmrno       {call49, call75}   
4         1918  contract_rxbkkepuonergclgcojw       {call11, call80}   

  Control Flow Graph          Opcode Sequence  Label  
0   graph_zkkglurmtf  opcodes_ihcgwyleqwdqlfr      0  
1   graph_xgtkypidke  opcodes_qjkyudygntzggrt      1  
2   graph_matsswbcuu  opcodes_gxtyecphiyxokat      1  
3   graph_peyuckydnn  opcodes_sgqxulrlpuaivdz      0  
4   graph_vtnkqofhir  opcodes_qxwjsibgxmolrrp      1  
Random Forest Validation Accuracy: 0.4955555555555556
SVM Validation Accuracy: 0.5088888888888888
Test Accuracy: 0.49777777777777776
Classification Report:
              precision    recall  f1-score   support

           0     

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'smart_contract_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset:")
print(data.head())

# Feature Extraction
# Convert categorical columns to numeric using one-hot encoding
data_encoded = pd.get_dummies(data, columns=['Function Call Patterns', 'Control Flow Graph', 'Opcode Sequence'])

# Extract features and labels
X = data_encoded.drop(columns=['Contract ID', 'Code Snippet', 'Label'])
y = data_encoded['Label']

# Split the dataset into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Hyperparameter tuning for Random Forest
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=3, n_jobs=-1)
rf_model.fit(X_train, y_train)
print("Best Random Forest Params:", rf_model.best_params_)

# Hyperparameter tuning for SVM
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_model = GridSearchCV(SVC(random_state=42), svm_params, cv=3, n_jobs=-1)
svm_model.fit(X_train, y_train)
print("Best SVM Params:", svm_model.best_params_)

# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Evaluate the models using validation set
rf_val_pred = rf_model.predict(X_val)
svm_val_pred = svm_model.predict(X_val)
gb_val_pred = gb_model.predict(X_val)

print("Random Forest Validation Accuracy:", accuracy_score(y_val, rf_val_pred))
print("SVM Validation Accuracy:", accuracy_score(y_val, svm_val_pred))
print("Gradient Boosting Validation Accuracy:", accuracy_score(y_val, gb_val_pred))

# Select the best model based on validation performance
best_model = rf_model if accuracy_score(y_val, rf_val_pred) > max(accuracy_score(y_val, svm_val_pred), accuracy_score(y_val, gb_val_pred)) else (svm_model if accuracy_score(y_val, svm_val_pred) > accuracy_score(y_val, gb_val_pred) else gb_model)

# Evaluate the best model using the test set
test_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Classification Report:")
print(classification_report(y_test, test_pred))

# Static Analysis (Example: Identify reentrancy, integer overflow, and access control issues)
def static_analysis(code_snippet):
    vulnerabilities = []
    if 'reentrancy' in code_snippet:
        vulnerabilities.append('Reentrancy')
    if 'integer overflow' in code_snippet:
        vulnerabilities.append('Integer Overflow')
    if 'access control' in code_snippet:
        vulnerabilities.append('Access Control')
    return vulnerabilities

data['Vulnerabilities'] = data['Code Snippet'].apply(static_analysis)
print("Static Analysis Results:")
print(data[['Contract ID', 'Vulnerabilities']].head())

# Dynamic Analysis (Pseudo code as actual execution requires blockchain environment)
def dynamic_analysis(contract_id):
    # Placeholder for dynamic analysis code
    # Execute the smart contract in a controlled environment
    # Observe behavior and identify potential weaknesses
    return []

data['Dynamic Vulnerabilities'] = data['Contract ID'].apply(dynamic_analysis)
print("Dynamic Analysis Results:")
print(data[['Contract ID', 'Dynamic Vulnerabilities']].head())

# Automated Scanning (Example: Real-time vulnerability identification)
def automated_scanning(code_snippet):
    # Placeholder for automated scanning using blockchain development tools
    vulnerabilities = static_analysis(code_snippet)
    return vulnerabilities

data['Automated Scan Results'] = data['Code Snippet'].apply(automated_scanning)
print("Automated Scanning Results:")
print(data[['Contract ID', 'Automated Scan Results']].head())

# Generate detailed reports for developers
def generate_report(contract_id, vulnerabilities):
    report = f"Contract ID: {contract_id}\nVulnerabilities: {', '.join(vulnerabilities)}"
    return report

data['Reports'] = data.apply(lambda row: generate_report(row['Contract ID'], row['Automated Scan Results']), axis=1)
print("Generated Reports:")
print(data[['Contract ID', 'Reports']].head())

# Save the detailed reports to a file
data[['Contract ID', 'Reports']].to_csv('smart_contract_reports.csv', index=False)
print("Reports saved to 'smart_contract_reports.csv'")


Dataset:
   Contract ID                   Code Snippet Function Call Patterns  \
0         1217  contract_vtwawnvsmqtfssnlcunf       {call48, call67}   
1         2623  contract_bwwerteghxlgxchhkrue       {call30, call72}   
2         2078  contract_tlkanexsvuxvezzulnwy       {call18, call80}   
3         1416  contract_fqujpcccvilvzzmkmrno       {call49, call75}   
4         1918  contract_rxbkkepuonergclgcojw       {call11, call80}   

  Control Flow Graph          Opcode Sequence  Label  
0   graph_zkkglurmtf  opcodes_ihcgwyleqwdqlfr      0  
1   graph_xgtkypidke  opcodes_qjkyudygntzggrt      1  
2   graph_matsswbcuu  opcodes_gxtyecphiyxokat      1  
3   graph_peyuckydnn  opcodes_sgqxulrlpuaivdz      0  
4   graph_vtnkqofhir  opcodes_qxwjsibgxmolrrp      1  
Best Random Forest Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best SVM Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Random Forest Validation Accuracy: 0.4955555555

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'smart_contract_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset:")
print(data.head())

# Feature Extraction
# Convert categorical columns to numeric using one-hot encoding
data_encoded = pd.get_dummies(data, columns=['Function Call Patterns', 'Control Flow Graph', 'Opcode Sequence'])

# Extract features and labels
X = data_encoded.drop(columns=['Contract ID', 'Code Snippet', 'Label'])
y = data_encoded['Label']

# Split the dataset into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Hyperparameter tuning for Random Forest
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=3, n_jobs=-1)
rf_model.fit(X_train, y_train)
print("Best Random Forest Params:", rf_model.best_params_)

# Hyperparameter tuning for SVM
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_model = GridSearchCV(SVC(random_state=42), svm_params, cv=3, n_jobs=-1)
svm_model.fit(X_train, y_train)
print("Best SVM Params:", svm_model.best_params_)

# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Evaluate the models using validation set
rf_val_pred = rf_model.predict(X_val)
svm_val_pred = svm_model.predict(X_val)
gb_val_pred = gb_model.predict(X_val)

print("Random Forest Validation Accuracy:", accuracy_score(y_val, rf_val_pred))
print("SVM Validation Accuracy:", accuracy_score(y_val, svm_val_pred))
print("Gradient Boosting Validation Accuracy:", accuracy_score(y_val, gb_val_pred))

# Select the best model based on validation performance
best_model = rf_model if accuracy_score(y_val, rf_val_pred) > max(accuracy_score(y_val, svm_val_pred), accuracy_score(y_val, gb_val_pred)) else (svm_model if accuracy_score(y_val, svm_val_pred) > accuracy_score(y_val, gb_val_pred) else gb_model)

# Evaluate the best model using the test set
test_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Classification Report:")
print(classification_report(y_test, test_pred))

# Static Analysis (Example: Identify reentrancy, integer overflow, and access control issues)
def static_analysis(code_snippet):
    vulnerabilities = []
    if 'reentrancy' in code_snippet:
        vulnerabilities.append('Reentrancy')
    if 'integer overflow' in code_snippet:
        vulnerabilities.append('Integer Overflow')
    if 'access control' in code_snippet:
        vulnerabilities.append('Access Control')
    return vulnerabilities

data['Vulnerabilities'] = data['Code Snippet'].apply(static_analysis)
print("Static Analysis Results:")
print(data[['Contract ID', 'Vulnerabilities']].head())

# Dynamic Analysis (Pseudo code as actual execution requires blockchain environment)
def dynamic_analysis(contract_id):
    # Placeholder for dynamic analysis code
    # Execute the smart contract in a controlled environment
    # Observe behavior and identify potential weaknesses
    return []

data['Dynamic Vulnerabilities'] = data['Contract ID'].apply(dynamic_analysis)
print("Dynamic Analysis Results:")
print(data[['Contract ID', 'Dynamic Vulnerabilities']].head())

# Automated Scanning (Example: Real-time vulnerability identification)
def automated_scanning(code_snippet):
    # Placeholder for automated scanning using blockchain development tools
    vulnerabilities = static_analysis(code_snippet)
    return vulnerabilities

data['Automated Scan Results'] = data['Code Snippet'].apply(automated_scanning)
print("Automated Scanning Results:")
print(data[['Contract ID', 'Automated Scan Results']].head())

# Generate detailed reports for developers
def generate_report(contract_id, vulnerabilities):
    report = f"Contract ID: {contract_id}\nVulnerabilities: {', '.join(vulnerabilities)}"
    return report

data['Reports'] = data.apply(lambda row: generate_report(row['Contract ID'], row['Automated Scan Results']), axis=1)
print("Generated Reports:")
print(data[['Contract ID', 'Reports']].head())

# Save the detailed reports to a file
data[['Contract ID', 'Reports']].to_csv('smart_contract_reports.csv', index=False)
print("Reports saved to 'smart_contract_reports.csv'")
