# DSC180A Checkpoint #2

## Setup

In [1]:
import re
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import networkx as nx

## ETL & feature extraction

In [20]:
# Transform smali file to call-analysis table
def process_smali(sml):
    """Extract a table of method invocations from the text of one smali file.

    Parameters
    ----------
    sml : str
        Full text of a ``.smali`` file.

    Returns
    -------
    pandas.DataFrame or int
        One row per matched ``invoke-*`` instruction with the columns
        ``invoke_type``, ``package_long``, ``call``, ``method``, ``package``
        and ``type``.  The sentinel ``-1`` is returned when no method or no
        invocation is matched (callers test for it, so it is kept).

    Notes
    -----
    The regular expressions are unchanged from the original implementation:
    the method-header pattern requires a ``;`` after the closing parenthesis
    on the header line, and the invoke pattern requires 5-9 word characters
    directly followed by whitespace after ``invoke-``.  Instructions that do
    not fit these shapes are not captured.
    """
    # Top-level package names treated as framework/library API; any other
    # package is labelled 'self'.
    api_packages = ['android', 'androidx', 'google', 'java', 'javax', 'kotlin']

    # Each match is a (method_name, method_body) tuple.
    methods = re.findall(r'method.* (\w+)[(].+[)].+;([\d\D]*?)\.end method', sml)
    if not methods:
        return -1

    frames = []
    for method_name, body in methods:
        calls = re.findall(r'invoke-(\w{5,9})\s.+}, (.*);->(.+)[(]', body)
        if not calls:
            # Method without any matched invocation contributes no rows.
            continue
        res = pd.DataFrame(calls, columns=['invoke_type', 'package_long', 'call'])
        res['method'] = method_name
        # Drop the leading type marker and keep the text up to the first '/'.
        res['package'] = res.package_long.apply(lambda name: name[1:name.find('/')])
        res['type'] = res.package.apply(
            lambda pkg: pkg if pkg in api_packages else 'self'
        )
        frames.append(res)

    # Explicit emptiness check instead of a bare ``except`` around pd.concat,
    # so genuine errors are no longer silently converted into -1.
    if not frames:
        return -1
    return pd.concat(frames)

In [21]:
# Generate Markov Chain (for now: the relative frequency of each call type)
def generate_chain(df):
    """Return the share of rows in ``df`` that belong to each call ``type``.

    The occurrences of every value in the ``type`` column are counted and
    divided by the total number of rows in ``df``.
    """
    per_type = df.type.value_counts()
    n_rows = len(df)
    return per_type / n_rows

In [23]:
# Apply smali file analysis to all smali files in an apk
def process_apk(path, max_files=30000):
    """Build one call table from the ``.smali`` files found under ``path``.

    Parameters
    ----------
    path : str
        Root directory to walk (bottom-up) for ``.smali`` files.
    max_files : int or None, optional
        Maximum number of ``.smali`` files to read.  Defaults to 30000, the
        cap previously hard-coded here; ``None`` disables the cap.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (fresh integer index) of every table returned
        by ``process_smali``.  An empty DataFrame is returned when no file
        yields any rows.
    """
    frames = []
    n_read = 0
    cap_reached = False

    for root, _dirs, files in os.walk(path, topdown=False):
        for name in files:
            if not name.endswith('.smali'):
                continue
            if max_files is not None and n_read >= max_files:
                cap_reached = True
                break
            # ``with`` guarantees the handle is closed even if parsing raises;
            # undecodable bytes are replaced rather than aborting the whole run.
            with open(os.path.join(root, name), encoding='utf-8', errors='replace') as f:
                sml_df = process_smali(f.read())
            # process_smali signals "nothing extracted" with an int sentinel.
            if isinstance(sml_df, pd.DataFrame):
                frames.append(sml_df)
            n_read += 1
        if cap_reached:
            # Stop walking altogether; a bare ``break`` only left the inner loop.
            break

    # Concatenate once instead of growing a DataFrame inside the loop.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [4]:
# Parse all training data in directory which contains all malware or benign-ware
def parse_all(path, max_wares=8):
    """Parse the apps stored as entries of ``path`` into one call table.

    Parameters
    ----------
    path : str
        Directory whose entries are each passed to ``process_apk``.
    max_wares : int or None, optional
        Maximum number of entries to process.  Defaults to 8, the cap
        previously hard-coded here; ``None`` processes every entry.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (fresh integer index) of the per-app tables;
        an empty DataFrame when nothing is processed.

    Notes
    -----
    Entries are taken in the order ``os.listdir`` reports them, which is
    arbitrary, so the subset selected by ``max_wares`` may differ between
    machines.
    """
    wares = os.listdir(path)
    if max_wares is not None:
        # A non-positive cap selects nothing, matching the old counter check.
        wares = wares[:max(max_wares, 0)]

    # Collect per-app tables and concatenate once at the end.
    frames = [process_apk(os.path.join(path, d)) for d in wares]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [24]:
# Build the call table for a single app: walks the directory 'Firefox'
# (a path relative to the notebook's working directory) for .smali files.
df = process_apk('Firefox')

In [26]:
# Display the share of each call type among the rows of `df`.
generate_chain(df)

self        0.481803
java        0.240431
kotlin      0.123108
androidx    0.077027
android     0.076623
javax       0.001009
Name: type, dtype: float64

## Data Parsing

In [6]:
# Malware
# NOTE(review): `run` is not defined in any cell visible in this notebook; it
# must already exist in the kernel for this cell to work. Restore its
# definition above so Restart & Run All succeeds.
# The second argument (1) is presumably the class label for malware - TODO confirm.
path_mal = '/teams/DSC180A_FA20_A00/a04malware/malware' # Path
features_mal = run(path_mal, 1)

Kyview
Fjcon
Koler
Ksapp
FakeAV
Boxer
RuMMS
Kemoge


In [5]:
# Safeware
# NOTE(review): `run` is not defined in any cell visible in this notebook; it
# must already exist in the kernel for this cell to work. Restore its
# definition above so Restart & Run All succeeds.
# The second argument (0) is presumably the class label for benign apps - TODO confirm.
path_saf = '/teams/DSC180A_FA20_A00/a04malware/popular-apps' # Path
features_saf = run(path_saf, 0)

net.updategames.granny
com.gameloft.android.ANMP.GloftA8HM
com.devsisters.gb
com.huobi.cn
com.gretech.gomplayerko
com.buffstudio.sevendays_free
com.sega.comixzone
com.facebook.katana


In [7]:
# Combine the malware and benign results into one object.
# NOTE(review): `+` concatenates only if both operands are lists; for
# DataFrames or arrays it would add element-wise instead - confirm the
# return type of `run`.
features = features_mal + features_saf

## Model

## EDA