# BinProject 

generate the project of binaries with offset and 

In [1]:
import os 
import json
from pathlib import Path
from utils.key import *
from utils.ghidra_helper import *
from utils.launcher import HeadlessLoggingPyhidraLauncher

In [2]:
# start the Launcher 
launcher = HeadlessLoggingPyhidraLauncher(verbose=True, log_path='./launch.log')
launcher.start()

INFO  Using log config file: jar:file:/home/dingisoul/dev/ghidra_11.0.1_PUBLIC/Ghidra/Framework/Generic/lib/Generic.jar!/generic.log4j.xml (LoggingInitialization)  
INFO  Using log file: ./launch.log (LoggingInitialization)  
INFO  Loading user preferences: /home/dingisoul/.ghidra/.ghidra_11.0.1_PUBLIC/preferences (Preferences)  
INFO  Searching for classes... (ClassSearcher)  
INFO  Class search complete (651 ms) (ClassSearcher)  
INFO  Initializing SSL Context (SSLContextInitializer)  
INFO  Initializing Random Number Generator... (SecureRandomFactory)  
INFO  Random Number Generator initialization complete: NativePRNGNonBlocking (SecureRandomFactory)  
INFO  Trust manager disabled, cacerts have not been set (ApplicationTrustManagerFactory)  


In [3]:
# Reuse the project create or open in chapter 1 
# Necessary imports for ghidra project 
from ghidra.base.project import GhidraProject
from java.io import IOException
from pathlib import Path 

# Create Project Dir and name 
project_location = Path('./ghidra_project')
project_location.mkdir(exist_ok=True, parents=True)
project_name = "binmatch_project"

# create or open project 
try:
    project = GhidraProject.openProject(project_location, project_name, True)
    print(f'Opened project: {project.project.name}')
except IOException:
    project = GhidraProject.createProject(project_location, project_name, False)
    print(f'Created project: {project.project.name}')

INFO  Opening project: /home/dingisoul/dev/FirmFlaw/ghidra_project/binmatch_project (DefaultProject)  
Opened project: binmatch_project


In [6]:
duplicate_program_name = {}
def handle_duplicate(name: str) -> str:
    no = duplicate_program_name.get(name, 0)
    duplicate_program_name[name] = no + 1
    return f"{name}{no}"

def firm_valid(firminfo: dict) -> bool:
    '''
    firminfo: dict read from xx_friminfo.json
    return: True for valid
    '''
    return data.get('base address') is not None and \
           data.get('architecture') is not None and \
           data.get('file offset')  is not None and \
           len(data['base address']) > 0 and data['architecture'] == "arm" 
    
def remove_header(dir : Path, bin: str) -> str:
        info = bin + '_firminfo.json'
        no_header_bin = bin + 'noheader'  
        with open(dir / info, 'r') as file:
            data = json.load(file)
            if data.get('file offset') is None:
                return None
            file_offset  = data['file offset']
        with open(dir / bin, 'rb') as input_file:
            input_file.seek(file_offset)
            remaining_data = input_file.read()            
        with open(dir / no_header_bin, 'wb') as output_file:
            output_file.write(remaining_data)
        return no_header_bin

def timeout_handler(signum, frame):
    raise TimeoutError("Timed out!") 

In [None]:
from ghidra.program.flatapi import FlatProgramAPI
# import the program and analyze all 
json_end = '_firminfo.json'
lang =  get_language("ARM:LE:32:Cortex")

csv_name = './func_database.csv'
if os.path.exists(csv_name):
    os.remove(csv_name)
csv_file = open(csv_name, 'w')
# add first line
csv_file.write('Program,Handlers,Functions,Size\n')
program = None 
num = 0
signal.signal(signal.SIGALRM, timeout_handler)  # 注册信号处理器
for root, dirs, files in os.walk('./firmwares/'):
    info_files = [f for f in files if f.endswith(json_end)]
    dir_ = Path(root)
    for info_ in info_files: 
        bin_ = info_[:info_.find(json_end)]
        with open(dir_ / info_, 'r') as file:
            data = json.load(file)
            if not firm_valid(data):
                continue 
            base_address = data['base address']
            no_header_bin = remove_header(dir_, bin_)
        print(f"\033[31mIter file {dir_ / bin_} at {num}\033[0m")
        # timeout and try 
        program = project.importProgram(dir_ / no_header_bin, lang , get_compiler_spec(lang))
        signal.alarm(600)  # 设置闹钟，在指定秒数后发送信号
        try: 
            # get the flat api 
            flat_api = FlatProgramAPI(program)
            old_base = program.getImageBase()
            image_base = int(base_address, base=16)
            # 1. setImageBase (Address base, boolean commit)
            program.setImageBase(old_base.getNewAddress(image_base), True)
            handler_num = create_handlers(program, flat_api)
            flat_api.analyzeAll(program)
            csv_file.write(f'{bin_},{handler_num},{program.getFunctionManager().getFunctionCount()},{os.path.getsize(dir_/no_header_bin)}\n')
            print(f"\033[31mAdd {program.getFunctionManager().getFunctionCount()} functions\033[0m")
            num += 1
        except TimeoutError:
            print(f"\033[31mAnalyze {bin_} timeout!!\033[0m")
            csv_file.wirte(f'{bin_},-1,-1,{os.path.getsize(dir_/no_header_bin)}\n')
        finally:
            signal.alarm(0)
        project.saveAs(program, "/", handle_duplicate(program.getName()), True)
        project.close(program)
csv_file.close()
project.close()

[31mIter file firmwares/step2_postSig_EXT/dialog/extracted_originals/80A6203BA7A320A094B4B9F9185670AC08B8F957EDAB0C51F8E2E984B8C3E548.apk_out/assets/GOLiFE_CARE2HR_149.bin.fs37864fs. at 0[0m
INFO  Using Loader: Raw Binary (AutoImporter)  
INFO  Using Language/Compiler: ARM:LE:32:Cortex:default (AutoImporter)  
INFO  Packed database cache: /tmp/dingisoul-Ghidra/packed-db-cache (PackedDatabaseCache)  
INFO  Applied data type archive: generic_clib (ApplyDataArchiveAnalyzer)  
INFO  -----------------------------------------------------
    ARM Constant Reference Analyzer            0.571 secs
    ARM Symbol                                 0.000 secs
    ASCII Strings                              0.191 secs
    Apply Data Archives                        0.421 secs
    Call Convention ID                         0.002 secs
    Call-Fixup Installer                       0.009 secs
    Create Address Tables                      0.003 secs
    Create Address Tables - One Time           0.022 s

In [None]:
print(END)

In [None]:
project.close()

# FixMatch 

the project for fixed binaries 

In [10]:
from utils.db import *
project = openProject('fixmatch_project', Path('./ghidra_project'))
json_end = '_firminfo.json'
lang =  get_language("ARM:LE:32:Cortex")

csv_name = './fix_func_database.csv'
if os.path.exists(csv_name):
    os.remove(csv_name)
csv_file = open(csv_name, 'w')
program = None 
num = 0
for root, dirs, files in os.walk('./step2_postSig/arm/'):
    bin_files = [f for f in files if not f.endswith(json_end) and not f.endswith('noheader')]
    dir_ = Path(root)
    for bin_ in bin_files: 
        if bin_.endswith('db'):
            continue
        # for cf2- binaries 
        if not bin_.startswith('cf2-2'):
            continue 
        base_address = 0x8004000 
        print(f"\033[31mIter file {dir_ / bin_} at {num}\033[0m")
        program = project.importProgram(dir_ / bin_, lang , get_compiler_spec(lang))
        # get the flat api 
        from ghidra.program.flatapi import FlatProgramAPI
        flat_api = FlatProgramAPI(program)
        old_base = program.getImageBase()
        image_base = base_address
        # 1. setImageBase (Address base, boolean commit)
        program.setImageBase(old_base.getNewAddress(image_base), True)
        handler_num = create_handlers(program, flat_api)
        print(f'Create {handler_num} handlers')
        flat_api.analyzeAll(program)
        csv_file.write(f'{bin_},{program.getFunctionManager().getFunctionCount()}\n')
        print(f"\033[31mAdd {program.getFunctionManager().getFunctionCount()} functions\033[0m")
        num += 1
        project.saveAs(program, "/", handle_duplicate(program.getName()), True)
        project.close(program)
csv_file.close()
project.close()

INFO  Opening project: /home/dingisoul/dev/FirmFlaw/ghidra_project/fixmatch_project (DefaultProject)  
Opened project: fixmatch_project
[31mIter file step2_postSig/arm/org.appzeeinc.droneremotecontrol_crazyflie/firmware-cf2-2023.06/cf2-2023.06.bin.fs265496fs. at 0[0m
INFO  Using Loader: Raw Binary (AutoImporter)  
INFO  Using Language/Compiler: ARM:LE:32:Cortex:default (AutoImporter)  
Create 112 handlers
INFO  Applied data type archive: generic_clib (ApplyDataArchiveAnalyzer)  
INFO  -----------------------------------------------------
    ARM Constant Reference Analyzer            4.579 secs
    ARM Symbol                                 0.000 secs
    ASCII Strings                              0.039 secs
    Apply Data Archives                        0.161 secs
    Call Convention ID                         0.007 secs
    Call-Fixup Installer                       0.017 secs
    Create Address Tables                      0.063 secs
    Create Address Tables - One Time           0

In [11]:
# generate database for functions in binary 
from ghidra.program.flatapi import FlatProgramAPI
# from tqdm.notebook import tqdm
import os 
from functools import reduce
from operator import mul

DATABASE = './step2_postSig/arm/bin_func.db'
conn = sqlite3.connect(DATABASE)
cursor = conn.cursor()

project = openProject('fixmatch_project', Path('./ghidra_project'))
num = 0
# use tqdm to generate the progress bar 
for file_ in project.getRootFolder().getFiles():
    # print(f"Match program {name_}")
    name_ = file_.getName()
    program = project.openProgram('/', name_, True)
    rows_ = []
    for func_ in program.getListing().getFunctions(True):
        if filter_func(func_):
            row_ = (func_.getName(),program.getName())
            inst_ = get_inst_key(func_)
            graph = get_struct_graph_key(func_)
            # make sure the inst_[0] means the numAddress 
            hash_ = reduce(mul,(n for n in graph),1) * inst_[0]
            #if hash_ >= 0xffffffff:
            #    print(f'WARNING: {func_.getName()} hash is a little long {hash_}')
            row_ += (hash_,) + inst_ + graph
            # no check because every bin is different 
            # if not sql_check_duplicate_func(cursor, row_[func_key_idx('name')], row_[func_key_idx('hash')], FUNC_TABLE_NAME):
            rows_.append(row_)
        # insert the rows    
    sql_insert(cursor, FUNC_KEYS.keys(), rows_, FUNC_TABLE_NAME)
    conn.commit()      
    # remember closing the program to avoid memory usage 
    print(f"{program.getName()} insert {len(rows_)} functions at {num}")
    num += 1
    project.close(program)
conn.close()
project.close()

INFO  Opening project: /home/dingisoul/dev/FirmFlaw/ghidra_project/fixmatch_project (DefaultProject)  
Opened project: fixmatch_project
cf2-2015.08.1.bin.fs75152fs. insert 360 functions at 0
cf2-2016.02.bin.fs127792fs. insert 457 functions at 1
cf2-2016.09.bin.fs144640fs. insert 454 functions at 2
cf2-2016.09.bin.fs144640fs. insert 454 functions at 3
cf2-2017.04.bin.fs160192fs. insert 503 functions at 4
cf2-2017.05.bin.fs183624fs. insert 551 functions at 5
cf2-2017.06.bin.fs202392fs. insert 571 functions at 6
cf2-2018.01.1.bin.fs210396fs. insert 618 functions at 7
cf2-2018.10.bin.fs271892fs. insert 770 functions at 8
cf2-2018.12.bin.fs190476fs. insert 814 functions at 9
cf2-2019.01.bin.fs190548fs. insert 814 functions at 10
cf2-2019.02.bin.fs190580fs. insert 814 functions at 11
cf2-2019.09.bin.fs196660fs. insert 836 functions at 12
cf2-2020.02.bin.fs202888fs. insert 878 functions at 13
cf2-2020.04.bin.fs218340fs. insert 919 functions at 14
cf2-2020.06.bin.fs219052fs. insert 922 functio

# END

In [None]:
# fix the wrong file offset
import json 
import os
from pathlib import Path
json_end = '_firminfo.json'
for root, dirs, files in os.walk('./firmwares/step2_postSig/arm/org.appzeeinc.droneremotecontrol_crazyflie/'):
    info_files = [f for f in files if f.endswith(json_end)]
    dir_ = Path(root)
    for info_ in info_files:
        print(f'info: {info_}')
        bin_ = info_[:info_.find(json_end)]  
        no_header_bin = bin_ + 'noheader'  
        with open(dir_ / (info_), 'r') as file:
            data = json.load(file)
            data['file offset'] = 0
            if data['base address'] == "0x0" and info_.startswith('cf2-'):
                print(f'cf2: {info_}')
                data['base address'] = "0x8004000"
        with open(dir_ / (info_), 'w') as file:
            json.dump(data, file, indent=2)

In [15]:
# construct the noheader files 
json_end = '_firminfo.json'
sum = 0
bins = set()
for root, dirs, files in os.walk('./firmwares/'):
    info_files = [f for f in files if f.endswith(json_end)]
    dir_ = Path(root)
    for info_ in info_files:
        sum += 1
        bin_ = info_[:info_.find(json_end)]
        bins.add(bin_)
        no_header_bin = bin_ + 'noheader'  
        with open(dir_ / info_, 'r') as file:
            data = json.load(file)
            if data.get('file offset') is None:
                continue 
            file_offset  = data['file offset']
        with open(dir_ / bin_, 'rb') as input_file:
            input_file.seek(file_offset)
            remaining_data = input_file.read()            
        with open(dir_ / no_header_bin, 'wb') as output_file:
            output_file.write(remaining_data)
print(f'Total NoHeader Files {sum}')

Total NoHeader Files 9388


In [None]:
bins