In [1]:
import pandas as pd
from collections import Counter
from itertools import islice

In [2]:
df = pd.read_csv('fopc.csv')
df

Unnamed: 0,Opcodes,APT
0,"RET, NOP, NOP, SUB, MOV, XOR, MOV, MOV, MOV, M...",1
1,"PUSH, POP, MOV, MOV, INT, MOV, INT, PUSH, PUSH...",1
2,"PUSH, POP, MOV, MOV, INT, MOV, INT, PUSH, PUSH...",1
3,"PUSH, POP, MOV, MOV, INT, MOV, INT, PUSH, PUSH...",1
4,"PUSH, POP, MOV, MOV, INT, MOV, INT, PUSH, PUSH...",2
...,...,...
302,"PUSH, PUSH, PUSH, PUSH, PUSH, PUSH, SUB, MOV, ...",34
303,"PUSH, POP, MOV, MOV, INT, MOV, INT, PUSH, PUSH...",34
304,"PUSH, POP, MOV, MOV, INT, MOV, INT, PUSH, PUSH...",34
305,"PUSH, POP, MOV, MOV, INT, MOV, INT, PUSH, PUSH...",34


In [5]:
# ===================== 1-GRAM PROCESSING =====================
unique_1grams = set()
for opcodes in df['Opcodes']:
    unique_1grams.update(op.strip() for op in opcodes.split(', '))

unique_1grams = sorted(unique_1grams)
one_gram_counts_df = pd.DataFrame(0, index=range(len(df)), columns=unique_1grams)

for i, row in df.iterrows():
    opcode_list = [op.strip() for op in row['Opcodes'].split(', ')]
    counts_1gram = Counter(opcode_list)

    for opcode, count in counts_1gram.items():
        one_gram_counts_df.at[i, opcode] = count

x_1gram = one_gram_counts_df.values
y_1gram = df['APT'].values

In [7]:
# ===================== 2-GRAM PROCESSING =====================
def generate_2grams(sequence):
    """Generate 2-grams from a sequence of opcodes."""
    return ['{'+', '.join(sequence[i:i + 2])+'}' for i in range(len(sequence) - 2 + 1)]

unique_2grams = set()
for opcodes in df['Opcodes']:
    opcode_list = [op.strip() for op in opcodes.split(', ')]
    two_grams = generate_2grams(opcode_list)
    unique_2grams.update(two_grams)
print(unique_2grams)
unique_2grams = sorted(unique_2grams)
two_gram_counts_df = pd.DataFrame(0, index=range(len(df)), columns=unique_2grams)

for i, row in df.iterrows():
    opcode_list = [op.strip() for op in row['Opcodes'].split(', ')]
    two_grams = generate_2grams(opcode_list)
    counts_2gram = Counter(two_grams)

    for two_gram, count in counts_2gram.items():
        two_gram_counts_df.at[i, two_gram] = count

x_2gram = two_gram_counts_df.values




{'{FCOMI, XOR}', '{JMP, LODSB}', '{FFREE, POP}', '{RCR, LEA}', '{SAR, CMP}', '{SCASB.REPNE, JNZ}', '{ADD, SCASW.REPNE}', '{SETLE, POP}', '{PUSH, JNO}', '{POPFD, OUT}', '{FCMOVE, JNZ}', '{PUSH, MOVSB.REP}', '{JNC, XOR}', '{RCL, SBB}', '{JC, PUSHFD}', '{JL, TEST}', '{BTS, MOVZX}', '{SALC, AND}', '{IN, POP}', '{LOOPNZ, MOV}', '{ADDSS, MOVSS}', '{SHR, BSR}', '{CALL, FRSTOR}', '{PCMPEQW, PMOVMSKB}', '{CLD, NOP}', '{MOVZX, CMP}', '{JC, BTC}', '{CPUID, TEST}', '{LEA, PUSHFD}', '{IN, JC}', '{PUSH, JL}', '{BT, DAS}', '{SUB, IN}', '{CMP, SETNS}', '{CLC, SALC}', '{MOVSD, NOP}', '{INC, XOR}', '{PUSHFD, ADD}', '{SUB, RCR}', '{SETNZ, PUSH}', '{BOUND, MOVSW}', '{INC, NOT}', '{MOVSD.REP, CALL}', '{TEST, IDIV}', '{SBB, SETLE}', '{SETNC, MOVSX}', '{RCL, CMP}', '{SCASD, OR}', '{JNZ, OUT}', '{SETNS, PUSHAD}', '{IN, STC}', '{STOSW.REP, MOV}', '{BT, LEA}', '{PSRLD, PSLLD}', '{TEST, CMC}', '{ADD, STD}', '{BT, NOT}', '{SUB, INSB}', '{ROR, AAM}', '{JL, AAS}', '{SAR, ROL}', '{LOOP, CMP}', '{SETLE, SUB}', '{NOP,

In [9]:
print("1-Gram Feature Matrix (first few rows):")
print(one_gram_counts_df.head())
print("\nShape of x_1gram:", x_1gram.shape)


print("\n2-Gram Feature Matrix (first few rows):")
print(two_gram_counts_df.head())
print("\nShape of x_2gram:", x_2gram.shape)


1-Gram Feature Matrix (first few rows):
   AAA  AAD  AAM  AAS  ADC  ADD  ADD.LOCK  ADDPS  ADDSD  ADDSS  ...  VXORPS  \
0    0    0    0    0    0  164         0      0      0      0  ...       0   
1    0    0    2    0    6  217         2      0      0      0  ...       0   
2    0    0    0    0    3  220         0      0      0      0  ...       0   
3    0    0    2    0    6  217         2      0      0      0  ...       0   
4    1    0    0    1    0    3         0      0      0      0  ...       0   

   VZEROUPPER  WAIT  XADD  XADD.LOCK  XCHG  XGETBV  XLAT  XOR  XORPS  
0           0     0     0          0     2       0     0  140      0  
1           0     0     0          0     0       0     0    1      0  
2           0     0     0          0     1       0     0    2      0  
3           0     0     0          0     0       0     0    1      0  
4           0     0     0          0     0       0     0    2      0  

[5 rows x 329 columns]

Shape of x_1gram: (307, 329)

2-Gr