In [1]:
import os, sys, zipfile, struct, shutil
import subprocess

def isdex(mm):
	"""Return True when ``mm`` looks like a DEX image.

	A buffer qualifies when it starts with the three characters ``dex`` and is
	longer than 0x70 bytes (the span the ``header`` parser reads).

	``mm`` may be ``bytes``/``bytearray`` (what ``open(..., 'rb').read()``
	returns on Python 3) or ``str``. The magic literal is chosen to match the
	buffer type, because comparing ``bytes`` to ``str`` is always False on
	Python 3.
	"""
	if len(mm) <= 0x70:
		return False
	prefix = mm[0:3]
	magic = b'dex' if isinstance(prefix, (bytes, bytearray)) else 'dex'
	return prefix == magic


In [2]:
def header(mm):
	"""Parse the fixed 0x70-byte header at the start of a DEX buffer.

	Parameters
	----------
	mm : bytes-like
		Complete contents of the file.

	Returns
	-------
	dict
		Keys ``magic`` (bytes 0x00-0x08), ``checksum`` (little-endian u32 at
		0x08), ``sal`` (the 20 raw bytes at 0x0C-0x20; presumably the SHA-1
		signature of the DEX format - confirm against the spec) followed by
		the twenty little-endian u32 fields stored from 0x20 to 0x70
		(``file_size`` ... ``data_off``).
		An empty dict is returned when the buffer is shorter than the header
		or when its length differs from the recorded ``file_size``.
	"""
	hdr = {}

	# Too short to contain a header: report it the same way as any other
	# invalid buffer instead of letting struct.unpack raise struct.error.
	if len(mm) < 0x70:
		return hdr

	# Names of the consecutive u32 fields, in on-disk order starting at 0x20.
	u32_names = (
		'file_size', 'header_size', 'endian_tag',
		'link_size', 'link_off', 'map_off',
		'string_ids_size', 'string_ids_off',
		'type_ids_size', 'type_ids_off',
		'proto_ids_size', 'proto_ids_off',
		'field_ids_size', 'field_ids_off',
		'method_ids_size', 'method_ids_off',
		'class_defs_size', 'class_defs_off',
		'data_size', 'data_off',
	)
	u32_values = struct.unpack('<20L', mm[0x20:0x70])

	# Truncated or padded files are rejected: file_size must match exactly.
	if len(mm) != u32_values[0]:
		return hdr

	hdr['magic'] = mm[0:8]
	hdr['checksum'] = struct.unpack('<L', mm[8:0xC])[0]
	hdr['sal'] = mm[0xC:0x20]
	hdr.update(zip(u32_names, u32_values))

	return hdr

In [3]:
# Module-level failure logs, filled in while the batch runs and flushed by err_log():
#   dex_err_lst - APKs skipped while obtaining classes.dex
#   ci_err_lst  - DEX files whose code-item conversion failed
dex_err_lst, ci_err_lst = [], []

In [4]:
def extract_ds(f_dex,ds_path):
	"""Write the data section of a DEX file to ``<ds_path>/<name>.ds``.

	Parameters
	----------
	f_dex : str
		Path of the ``.dex`` file to read.
	ds_path : str
		Existing directory that receives the ``.ds`` file; the output keeps
		the input's base name with the extension replaced.

	Raises
	------
	ValueError
		If ``header`` rejects the file (it returns an empty dict). The check
		happens before the output is opened, so no empty ``.ds`` file is left
		behind for an invalid input.
	"""
	# The file is read once; the handle is released even if parsing fails.
	with open(f_dex, 'rb') as src:
		mm = src.read()

	hdr = header(mm)
	if not hdr:
		raise ValueError('invalid DEX header: ' + f_dex)

	stem = os.path.splitext(os.path.basename(f_dex))[0]
	start = hdr['data_off']
	# Slicing the buffer yields the same bytes as seek(data_off) + read(data_size).
	with open(os.path.join(ds_path, stem + '.ds'), 'wb') as out:
		out.write(mm[start:start + hdr['data_size']])

In [5]:
def extract_ci(d_path, ci_path):
	"""Dump the code items of a DEX file and convert the dump to ``.ci``.

	Runs ``./dexdump2-codeitem -d <d_path> -o <ci_path>/<name>.txt`` and hands
	the text dump to ``make_bin``. Any failure is recorded in ``ci_err_lst``
	as ``"<d_path>--CI ERROR"`` and the text dump, if one was produced, is
	deleted.

	Parameters
	----------
	d_path : str
		Path of the ``.dex`` file.
	ci_path : str
		Existing directory for the intermediate ``.txt`` and final ``.ci``.
	"""
	stem = os.path.splitext(os.path.basename(d_path))[0]
	c_path = os.path.join(ci_path, stem + '.txt')

	try:
		# Argument list without a shell: a quote or other metacharacter in a
		# file name can no longer break or inject into the command line.
		# The tool's exit status is not checked; make_bin fails on bad output.
		subprocess.call(['./dexdump2-codeitem', '-d', d_path, '-o', c_path])
		make_bin(c_path)
	except Exception:
		ci_err_lst.append(d_path+"--CI ERROR")
		# The dump may never have been created (tool missing or crashed), so
		# only remove it when present rather than raising from the handler.
		if os.path.exists(c_path):
			os.remove(c_path)
	    

In [6]:
def make_bin(ff):
	"""Convert a code-item text dump into a binary ``.ci`` file.

	Every line of ``ff`` that starts with ``CODEITEMS_BYTECODE`` contributes
	the hex digits found after the first ``' : '`` separator. Each
	contribution is right-padded with ``'0'`` to a multiple of 8 hex digits
	(4 bytes) and the decoded bytes are written, in file order, to the path
	obtained by replacing the last four characters of ``ff`` with ``.ci``.
	The text dump is deleted once the binary has been written.

	Raises
	------
	ValueError
		If the dump has no bytecode digits or they are not valid hexadecimal.
		The dump is left in place in that case.
	"""
	chunks = []
	with open(ff, 'r') as dump:
		for line in dump:
			if line.startswith('CODEITEMS_BYTECODE'):
				digits = line.split(' : ')[1].strip()
				# Pad to the next 8-digit boundary (no-op when already aligned).
				digits += '0' * (-len(digits) % 8)
				chunks.append(digits)

	hex_str = ''.join(chunks)
	if not hex_str:
		# Keep signalling "nothing to convert" as an error so the caller logs it.
		raise ValueError('no CODEITEMS_BYTECODE data in ' + ff)
	payload = bytes.fromhex(hex_str)

	with open(ff[:-4]+'.ci', 'wb') as out:
		out.write(payload)
	os.remove(ff)
    

In [7]:
def extract(src_path, dex_path, ds_path, ci_path):
	"""Unpack ``classes.dex`` from every archive under ``src_path`` and process it.

	For each file found by walking ``src_path``:

	1. it is opened as a zip archive; archives containing ``classes2.dex`` are
	   skipped and logged in ``dex_err_lst`` as ``"<apk>--multidex"``;
	2. ``classes.dex`` is written to ``<dex_path>/<name>.dex`` (an existing
	   file of that name is overwritten, so the batch can be re-run); any
	   failure is logged as ``"<apk>--ZIP ERROR"``;
	3. ``extract_ds`` and ``extract_ci`` are run on the extracted DEX. A
	   failure in ``extract_ds`` is logged as ``"<apk>--DS ERROR"`` and the
	   sample is skipped instead of aborting the whole batch.

	The three output directories are created when missing.

	Parameters
	----------
	src_path : str
		Root directory scanned recursively for input archives.
	dex_path, ds_path, ci_path : str
		Output directories for ``.dex``, ``.ds`` and ``.ci`` files.
	"""
	for out_dir in (dex_path, ds_path, ci_path):
		if not os.path.isdir(out_dir):
			os.makedirs(out_dir)

	for (p, _dirs, files) in os.walk(src_path):
		for fname in files:
			apk = os.path.join(p, fname)
			# Output name: the input name minus its last four characters
			# (kept as-is so previously produced file names stay stable).
			d_path = os.path.join(dex_path, fname[:-4]+'.dex')

			try:
				# The context manager closes the archive on every path,
				# including the multidex skip and errors.
				with zipfile.ZipFile(apk) as archive:
					if 'classes2.dex' in archive.namelist():
						dex_err_lst.append(apk+"--multidex")
						continue
					payload = archive.read('classes.dex')
				# Write straight to the destination instead of extracting
				# into the working directory and renaming/moving afterwards.
				with open(d_path, 'wb') as dst:
					dst.write(payload)
			except Exception:
				dex_err_lst.append(apk+"--ZIP ERROR")
				continue

			try:
				extract_ds(d_path, ds_path)
			except Exception:
				# One malformed DEX must not stop the remaining samples.
				dex_err_lst.append(apk+"--DS ERROR")
				continue
			extract_ci(d_path, ci_path)
	

In [8]:
def err_log():
	"""Dump the two module-level error lists to text files in the working directory.

	``dex_err_lst`` goes to ``./dex_err.txt`` and ``ci_err_lst`` to
	``./ci_err.txt``, one entry per line; existing files are overwritten.
	"""
	targets = (
		('./dex_err.txt', dex_err_lst),
		('./ci_err.txt', ci_err_lst),
	)
	for log_name, entries in targets:
		with open(log_name, 'w') as out:
			out.write(''.join(str(entry)+"\n" for entry in entries))

def dataset_trim(ci_path,ds_path,dex_path):
	"""Delete ``.ds`` and ``.dex`` files that have no matching entry in ``ci_path``.

	Every name in ``ci_path`` is reduced to a stem by dropping its last three
	characters (the ``.ci`` suffix). A file in ``ds_path`` survives only if it
	is named ``<stem>.ds`` and a file in ``dex_path`` only if it is named
	``<stem>.dex``. Sub-directories are left untouched.

	Parameters
	----------
	ci_path, ds_path, dex_path : str
		Directories holding the ``.ci``, ``.ds`` and ``.dex`` files.
	"""
	stems = set(fname[:-3] for fname in os.listdir(ci_path))

	for folder, ext in ((ds_path, '.ds'), (dex_path, '.dex')):
		# Set lookup keeps the pass linear in the number of files.
		keep = set(stem + ext for stem in stems)
		for fname in os.listdir(folder):
			target = os.path.join(folder, fname)
			# os.remove cannot delete directories; skip them instead of crashing.
			if fname not in keep and not os.path.isdir(target):
				os.remove(target)


In [10]:
if __name__ == "__main__":
	# Input tree of APKs and the root under which all outputs are written.
	# Edit these two values to run against another dataset.
	src_path = '/Desktop/dataset/Drebin/'
	out_path = '/Desktop/dataset/output/'

	# One sub-folder per artifact type, directly under out_path.
	dex_path, ds_path, ci_path = (out_path + sub for sub in ('dex/', 'ds/', 'ci/'))

	# Unpack and carve every sample, then persist the failure lists.
	extract(src_path, dex_path, ds_path, ci_path)
	err_log()

	# Finally drop .ds/.dex files that ended up without a .ci counterpart.
	dataset_trim(ci_path, ds_path, dex_path)