In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import os
import glob

Read and process opcode files

In [19]:
def read_opcode_files(base_path):
    data = []
    labels = []
    
    # Walk through all directories
    for apt_folder in os.listdir(base_path):
        apt_path = os.path.join(base_path, apt_folder)
        if os.path.isdir(apt_path):
            # Process each file in the APT folder
            for file_path in glob.glob(os.path.join(apt_path, '*')):
                with open(file_path, 'r') as f:
                    # Split by newline and join with commas
                    opcodes = f.read().strip()
                    opcodes = ', '.join(op.strip() for op in opcodes.split('\n') if op.strip())
                    data.append(opcodes)
                    labels.append(apt_folder)
    
    # Create the DataFrame
    df = pd.DataFrame({'Opcodes': data, 'APT': labels})
    
    # Print the first few rows of the DataFrame
    print("Preview of the DataFrame before n-gram extraction:")
    print(df.head())
    
    # Save to CSV
    output_path = 'opcodes_dataset.csv'
    df.to_csv(output_path, index=False)
    print(f"\nDataset saved to: {output_path}")
    
    return df


def extract_ngram_features(df, n=1):
    if n == 1:
        # Get all unique opcodes
        unique_ngrams = set()
        for opcodes in df['Opcodes']:
            unique_ngrams.update(op.strip() for op in opcodes.split(', '))
        
        # Sort the unique opcodes for consistent column ordering
        unique_ngrams = sorted(unique_ngrams)
        
        # Create a DataFrame with opcode frequencies
        ngram_counts_df = pd.DataFrame(0, index=range(len(df)), columns=unique_ngrams)
        
        # Count frequencies for each sample
        for i, row in df.iterrows():
            opcode_list = [op.strip() for op in row['Opcodes'].split(', ')]
            counts = Counter(opcode_list)
            
            for opcode, count in counts.items():
                ngram_counts_df.at[i, opcode] = count
                
    elif n == 2:
        # Get all unique bigrams from the entire dataset first
        all_bigrams = set()
        for opcodes in df['Opcodes']:
            opcode_list = [op.strip() for op in opcodes.split(', ')]
            bigrams = list(zip(opcode_list[:-1], opcode_list[1:]))
            bigram_strings = [f"{b[0]}_{b[1]}" for b in bigrams]
            all_bigrams.update(bigram_strings)
        
        # Sort the unique bigrams for consistent column ordering
        unique_bigrams = sorted(all_bigrams)
        
        # Create a DataFrame with bigram frequencies
        ngram_counts_df = pd.DataFrame(0, index=range(len(df)), columns=unique_bigrams)
        
        # Count frequencies for each sample
        for i, row in df.iterrows():
            opcode_list = [op.strip() for op in row['Opcodes'].split(', ')]
            bigrams = list(zip(opcode_list[:-1], opcode_list[1:]))
            bigram_strings = [f"{b[0]}_{b[1]}" for b in bigrams]
            counts = Counter(bigram_strings)
            
            for bigram, count in counts.items():
                ngram_counts_df.at[i, bigram] = count
    
    return ngram_counts_df.values, ngram_counts_df.columns.tolist()

# Example usage:
# Get both 1-gram and 2-gram features
X_1gram, cols_1gram = extract_ngram_features(df, n=1)
X_2gram, cols_2gram = extract_ngram_features(df, n=2)

# Convert to DataFrames using the columns from the feature extraction
feature_df_1gram = pd.DataFrame(X_1gram, columns=cols_1gram)
feature_df_2gram = pd.DataFrame(X_2gram, columns=cols_2gram)

print("Preview of 1-gram frequencies:")
print(feature_df_1gram.head())
print(f"\nNumber of unique 1-grams: {len(cols_1gram)}")

print("\nPreview of 2-gram frequencies:")
print(feature_df_2gram.head())
print(f"\nNumber of unique 2-grams: {len(cols_2gram)}")

Preview of 1-gram frequencies:
   A  AA  AAA  AAD  AAM  AAS  AB  ABCBFFFF  ABDAFFFF  ABFDFFFF  ...  X  XADD  \
0  0   0    0    0    0    0   0         0         0         0  ...  0     0   
1  0   0    0    0    0    0   0         0         0         0  ...  0     0   
2  0   0    0    0    0    0   0         0         0         0  ...  0     0   
3  0   0    0    0    0    0   0         0         0         0  ...  0     0   
4  0   0    0    0    0    0   0         0         0         0  ...  0     0   

   XADD.LOCK  XCHG  XGETBV  XLAT   XOR  XORPD  XORPS  Y  
0          0     0       0     0    98      0      0  0  
1          0     0       0     0    21      0      0  0  
2          0     3       0     0  1387      0      0  0  
3          0     0       0     0     1      0      0  0  
4          0     0       0     0     1      0      0  0  

[5 rows x 520 columns]

Number of unique 1-grams: 520

Preview of 2-gram frequencies:
   AAA_ADC  AAA_ADD  AAA_DEC  AAA_JNO  AAA_PUSH  AAA_

Read the data
Extract features
Combine features
Encode labels
Train/test split

In [20]:
base_path = '/Users/jeffreyjeyachandren/Desktop/opscode_ml/opcodes'  # Adjust this to your opscode folder path
df = read_opcode_files(base_path)

# Get features
X_1gram, cols_1gram = extract_ngram_features(df, n=1)
X_2gram, cols_2gram = extract_ngram_features(df, n=2)

# Combine features
X = np.concatenate([X_1gram, X_2gram], axis=1)

# Encode labels
le = LabelEncoder()
y = le.fit_transform(df['APT'])

# Print shape information
print(f"Shape of 1-gram features: {X_1gram.shape}")
print(f"Shape of 2-gram features: {X_2gram.shape}")
print(f"Shape of combined features: {X.shape}")
print(f"Number of classes: {len(np.unique(y))}")

# Split and continue with classification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Preview of the DataFrame before n-gram extraction:
                                             Opcodes            APT
0  JMP, JMP, JMP, JMP, JMP, JMP, JMP, JMP, POP, P...        Evilnum
1  PUSH, MOV, MOV, CALL, PUSH, PUSH, PUSH, PUSH, ...  APT19_opcodes
2  MOV, PUSH, MOV, CALL, TEST, PUSH, CALL, ADD, M...  APT19_opcodes
3  PUSH, PUSH, PUSH, MOV, CMP, JE, CMP, JNE, PUSH...  APT19_opcodes
4  PUSH, PUSH, PUSH, MOV, CMP, JE, CMP, JNE, PUSH...  APT19_opcodes

Dataset saved to: opcodes_dataset.csv
Shape of 1-gram features: (215, 520)
Shape of 2-gram features: (215, 5653)
Shape of combined features: (215, 6173)
Number of classes: 32


In [21]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize classifiers
classifiers = {
    'SVM': SVC(kernel='rbf'),
    'KNN-3': KNeighborsClassifier(n_neighbors=3),
    'KNN-5': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier()
}

# Train and evaluate classifiers
results = {}
for name, clf in classifiers.items():
    # Train
    clf.fit(X_train, y_train)
    
    # Predict
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
        # Get unique classes present in the test set
    unique_classes = np.unique(y_test)
    target_names = le.inverse_transform(unique_classes)
    
    # Generate classification report
    report = classification_report(y_test, y_pred, 
                                 labels=unique_classes,
                                 target_names=target_names)
    
    results[name] = {
        'accuracy': accuracy,
        'report': report
    }

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results

In [22]:
# Print results
for name, result in results.items():
    print(f"\n{name} Results:")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print("\nClassification Report:")
    print(result['report'])


SVM Results:
Accuracy: 0.3023

Classification Report:
                 precision    recall  f1-score   support

  APT12_opcodes       0.00      0.00      0.00         1
  APT17_opcodes       0.00      0.00      0.00         1
   APT1_opcodes       0.00      0.00      0.00         4
          APT28       0.00      0.00      0.00         3
          APT29       0.29      0.33      0.31         6
  APT30_opcodes       0.00      0.00      0.00         3
BlueMockingbird       0.00      0.00      0.00         1
      Elderwood       0.00      0.00      0.00         1
           FIN7       0.00      0.00      0.00         1
        Gallium       0.00      0.00      0.00         1
      Gamaredon       0.00      0.00      0.00         1
       Ke3chang       0.00      0.00      0.00         2
       MenuPass       0.00      0.00      0.00         1
         Moafee       0.31      1.00      0.48        11
       Sandworm       0.00      0.00      0.00         1
          Turla       0.00      