In [1]:
import pandas as pd

# Limpeza dos datasets

A limpeza de cada dataset segue os [seguintes passos](https://github.com/Malware-Hunter/Data_Preparation):

1. Remoção de valores faltantes
2. Remoção de amostras com valores duplicados nas colunas
3. Remoção de características irrelevantes (colunas com apenas um valor em todas as linhas)
4. Conversão de tipos de dados

In [2]:
## Funções úteis

def get_unique_values(df):
    """Lazily yield one ``(column_name, unique_values)`` pair per column of ``df``.

    ``unique_values`` is whatever ``Series.unique()`` returns for that column,
    so missing values (NaN) show up as a distinct entry when present.
    """
    yield from ((name, df[name].unique()) for name in df.columns)

def drop_irrelevant_columns(df):
    """Return a copy of ``df`` without its irrelevant columns.

    A column is considered irrelevant when it has fewer than 2 distinct
    values, i.e. it carries no information to tell samples apart. The names
    of the dropped columns are printed; ``df`` itself is not modified.
    """
    irrelevant_columns = [
        name for name, values in get_unique_values(df) if len(values) < 2
    ]
    print(f"{len(irrelevant_columns)} colunas irrelevantes:")
    print(irrelevant_columns)
    return df.drop(columns=irrelevant_columns)

def cast_to_numeric_inplace(df):
    """Convert every column of ``df`` that is not int/float to a numeric dtype.

    The conversion uses ``pd.to_numeric`` and overwrites the columns of the
    frame that was passed in; nothing is returned. ``pd.to_numeric`` raises
    if a value cannot be parsed as a number.
    """
    columns_to_cast = df.select_dtypes(exclude=['int', 'float']).columns
    for name in columns_to_cast:
        df[name] = pd.to_numeric(df[name])

## Androcrawl Dataset

In [3]:
# Load the raw AndroCrawl CSV, strip the single quotes from the column names
# and rename the label column 'Detection Ratio' to 'class'.
df = (
    pd.read_csv('./androcrawl.csv', low_memory=False)
    .rename(columns=lambda name: name.replace("'", ""))
    .rename(columns={'Detection Ratio': 'class'})
)
df

Unnamed: 0,Apk Name,Dangerous Permission: ACCESS_SUPERUSER,Dangerous Permission: BLUETOOTH_PRIVILEGED,Dangerous Permission: BRICK,Dangerous Permission: CHANGE_COMPONENT_ENABLED_STATE,Dangerous Permission: CLEAR_APP_USER_DATA,Dangerous Permission: DELETE_CACHE_FILES,Dangerous Permission: DELETE_PACKAGES,Dangerous Permission: DISABLE_KEYGUARD,Dangerous Permission: FACTORY_TEST,...,Hidden Apk,Sends SMS to Suspicious Number(s),Package Domain Exists,Reads phone data at startup,Sends SMS at startup,Starts service at startup,Sends SMS when receiving SMS,Sends data to a remote page when receiving SMS,Accesses a database when receiving SMS,class
0,51113d15a58841b8c81bc7fc8fd7b45f4ca19a29029b86...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
1,3f944d58fbb88b8c64c82c3cd60b12d851176df8aca0d2...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
2,1370ea8b3e2d895b83b5a7771348546eb397624cdc52aa...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
3,e0ac1ffb30426d6f650fa7b3244277f273220fbe230ef6...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Malicious
4,0ede82b2a99ed040e65ce1816f054921187d184320301f...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162978,559f0610501a1bd0f3e7265494cec8537bb0d8c2da47bb...,False,False,False,False,False,False,False,False,False,...,false,false,false,false,false,false,false,false,false,Benign
162979,a4f11ba03162b949aa29221d5bb2e4909bb116e8cc41a9...,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
162980,1f3d156bf6ac206e4cf79db74065865eaf414f181b3bcd...,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
162981,cc4b6e06f05a34779803e5f0d25b226a0a9cd10ba5453c...,False,False,False,False,False,False,False,False,False,...,false,false,true,true,false,false,false,false,false,Benign


### 1) Remoção de valores faltantes

In [93]:
# Step 1: discard every row that has at least one missing (NaN) value.
df = df.dropna()
df

Unnamed: 0,Apk Name,Dangerous Permission: ACCESS_SUPERUSER,Dangerous Permission: BLUETOOTH_PRIVILEGED,Dangerous Permission: BRICK,Dangerous Permission: CHANGE_COMPONENT_ENABLED_STATE,Dangerous Permission: CLEAR_APP_USER_DATA,Dangerous Permission: DELETE_CACHE_FILES,Dangerous Permission: DELETE_PACKAGES,Dangerous Permission: DISABLE_KEYGUARD,Dangerous Permission: FACTORY_TEST,...,Hidden Apk,Sends SMS to Suspicious Number(s),Package Domain Exists,Reads phone data at startup,Sends SMS at startup,Starts service at startup,Sends SMS when receiving SMS,Sends data to a remote page when receiving SMS,Accesses a database when receiving SMS,class
0,51113d15a58841b8c81bc7fc8fd7b45f4ca19a29029b86...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
1,3f944d58fbb88b8c64c82c3cd60b12d851176df8aca0d2...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
2,1370ea8b3e2d895b83b5a7771348546eb397624cdc52aa...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
3,e0ac1ffb30426d6f650fa7b3244277f273220fbe230ef6...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Malicious
4,0ede82b2a99ed040e65ce1816f054921187d184320301f...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162978,559f0610501a1bd0f3e7265494cec8537bb0d8c2da47bb...,False,False,False,False,False,False,False,False,False,...,false,false,false,false,false,false,false,false,false,Benign
162979,a4f11ba03162b949aa29221d5bb2e4909bb116e8cc41a9...,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
162980,1f3d156bf6ac206e4cf79db74065865eaf414f181b3bcd...,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
162981,cc4b6e06f05a34779803e5f0d25b226a0a9cd10ba5453c...,False,False,False,False,False,False,False,False,False,...,false,false,true,true,false,false,false,false,false,Benign


In [94]:
# Print the distinct values of every column to spot placeholder values
# that dropna() does not treat as missing.
print("Valores únicos para cada feature:")
for (column_name, unique_values) in get_unique_values(df):
    print(column_name, unique_values)

Valores únicos para cada feature:
Apk Name ['51113d15a58841b8c81bc7fc8fd7b45f4ca19a29029b86e4714f1451aa078340'
 '3f944d58fbb88b8c64c82c3cd60b12d851176df8aca0d2438013f654be474f72'
 '1370ea8b3e2d895b83b5a7771348546eb397624cdc52aae88c1e63fbcd0e37ee' ...
 'a4f11ba03162b949aa29221d5bb2e4909bb116e8cc41a9087a27c7c9dfd76eb9'
 'cc4b6e06f05a34779803e5f0d25b226a0a9cd10ba5453c8ad90e3995584fe245'
 'eda85939db98c9f5283ce9304f40ea1f841f25a8586b13e4a9024befc47e3970']
Dangerous Permission: ACCESS_SUPERUSER [False  True]
Dangerous Permission: BLUETOOTH_PRIVILEGED [False]
Dangerous Permission: BRICK [False]
Dangerous Permission: CHANGE_COMPONENT_ENABLED_STATE [False  True]
Dangerous Permission: CLEAR_APP_USER_DATA [False  True]
Dangerous Permission: DELETE_CACHE_FILES [False  True]
Dangerous Permission: DELETE_PACKAGES [False  True]
Dangerous Permission: DISABLE_KEYGUARD [False  True]
Dangerous Permission: FACTORY_TEST [False  True]
Dangerous Permission: INSTALL_PACKAGES [False  True]
Dangerous Permissio

Calls a System Routine: ls ['false' '?']
Calls a System Routine: loadjar ['false' '?']
Calls a System Routine: grep ['false' '?']
Calls a System Routine: /sh ['false' '?']
Calls a System Routine: /bin ['false' '?' 'true']
Calls a System Routine: pm install ['false' '?']
Calls a System Routine: /dev/net ['false' '?']
Calls a System Routine: insmod ['false' '?']
Calls a System Routine: rm ['false' '?']
Calls a System Routine: mount ['false' '?' 'true']
Calls a System Routine: root ['false' '?']
Calls a System Routine: /system ['false' '?' 'true']
Calls a System Routine: stdout ['false' '?']
Calls a System Routine: reboot ['false' '?']
Calls a System Routine: killall ['false' '?']
Calls a System Routine: chmod ['false' '?']
Calls a System Routine: stderr ['false' '?']
Calls a System Routine: ratc ['false' '?']
Harmless Permission: ACCESS_SURFACE_FLINGER [False  True]
Harmless Permission: ACCOUNT_MANAGER [False  True]
Harmless Permission: ADD_VOICEMAIL [False  True]
Harmless Permission: CO

In [95]:
# Count the columns in which the '?' placeholder appears at least once.
# DataFrame.isin compares per element inside pandas, so boolean columns are
# handled cleanly; the previous `'?' in ndarray` test made NumPy emit an
# "elementwise comparison failed" warning for every boolean column.
count = int(df.isin(['?']).any().sum())
print("# de colunas com o valor de interrogação:", count)

  if('?' in unique_values):


# de colunas com o valor de interrogação: 159


Nota-se que o valor '?' ocorre na maioria das colunas (159 de um total de 222), então vamos remover apenas as amostras em que esse valor aparece. O número de colunas já será reduzido após a remoção de colunas irrelevantes no passo 3.

In [96]:
# Keep only the rows in which no column holds the '?' placeholder.
has_placeholder = df.isin(['?']).any(axis=1)
df = df.loc[~has_placeholder]
df

Unnamed: 0,Apk Name,Dangerous Permission: ACCESS_SUPERUSER,Dangerous Permission: BLUETOOTH_PRIVILEGED,Dangerous Permission: BRICK,Dangerous Permission: CHANGE_COMPONENT_ENABLED_STATE,Dangerous Permission: CLEAR_APP_USER_DATA,Dangerous Permission: DELETE_CACHE_FILES,Dangerous Permission: DELETE_PACKAGES,Dangerous Permission: DISABLE_KEYGUARD,Dangerous Permission: FACTORY_TEST,...,Hidden Apk,Sends SMS to Suspicious Number(s),Package Domain Exists,Reads phone data at startup,Sends SMS at startup,Starts service at startup,Sends SMS when receiving SMS,Sends data to a remote page when receiving SMS,Accesses a database when receiving SMS,class
0,51113d15a58841b8c81bc7fc8fd7b45f4ca19a29029b86...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
2,1370ea8b3e2d895b83b5a7771348546eb397624cdc52aa...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
3,e0ac1ffb30426d6f650fa7b3244277f273220fbe230ef6...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Malicious
4,0ede82b2a99ed040e65ce1816f054921187d184320301f...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
5,822967e368d00b993e90c0724bed3daf26ba9b59f3a55e...,False,False,False,False,False,False,False,False,False,...,false,false,true,false,false,false,false,false,false,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162978,559f0610501a1bd0f3e7265494cec8537bb0d8c2da47bb...,False,False,False,False,False,False,False,False,False,...,false,false,false,false,false,false,false,false,false,Benign
162979,a4f11ba03162b949aa29221d5bb2e4909bb116e8cc41a9...,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
162980,1f3d156bf6ac206e4cf79db74065865eaf414f181b3bcd...,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
162981,cc4b6e06f05a34779803e5f0d25b226a0a9cd10ba5453c...,False,False,False,False,False,False,False,False,False,...,false,false,true,true,false,false,false,false,false,Benign


### 3) Remoção de características irrelevantes

In [97]:
# Step 3: remove the constant columns first, then the 'Apk Name' column.
df = drop_irrelevant_columns(df).drop(columns=['Apk Name'])
df

67 colunas irrelevantes:
['Dangerous Permission: BLUETOOTH_PRIVILEGED', 'Dangerous Permission: BRICK', 'Dangerous Permission: MOUNT_UNMOUNT_FILESYSTEM', 'Api Call: Landroid/telephony/cdma/CdmaCellLocation;->getSystemId', 'Api Call: Landroid/content/pm/PackageManager;->queryContentProviders', 'Api Call: Landroid/content/BroadcastReceiver;->abortBroadcast', 'Checks adb_enabled', 'Tries to modify adb_enabled', 'Can Steal Data: CAPTURE_AUDIO_OUTPUT', 'Can Steal Data: CAPTURE_SECURE_VIDEO_OUTPUT', 'Can Steal Data: CAPTURE_VIDEO_OUTPUT', 'Can Steal Data: LOCATION_HARDWARE', 'Can Steal Data: READ_SOCIAL_STREAM', 'Content: content://com.android.calendar', 'Content: content://calendar', 'Content: content://com.facebook.katana.provider.AttributionIdProvider', 'Content: content://media', 'C2M Permission: c2dm.permission.RECEIVE', 'C2M Permission: C2D_MESSAGE', 'C2M Permission: c2dm.permission.SEND', 'C2M Intent: c2dm.intent.RECEIVE', 'C2M Intent: c2dm.intent.REGISTRATION', 'C2M Intent: c2dm.inten

Unnamed: 0,Dangerous Permission: ACCESS_SUPERUSER,Dangerous Permission: CHANGE_COMPONENT_ENABLED_STATE,Dangerous Permission: CLEAR_APP_USER_DATA,Dangerous Permission: DELETE_CACHE_FILES,Dangerous Permission: DELETE_PACKAGES,Dangerous Permission: DISABLE_KEYGUARD,Dangerous Permission: FACTORY_TEST,Dangerous Permission: INSTALL_PACKAGES,Dangerous Permission: INJECT_EVENTS,Dangerous Permission: INTERNAL_SYSTEM_WINDOW,...,Suspicious Intent Filter: USER_PRESENT,Hidden Apk,Sends SMS to Suspicious Number(s),Package Domain Exists,Reads phone data at startup,Sends SMS at startup,Starts service at startup,Sends SMS when receiving SMS,Accesses a database when receiving SMS,class
0,False,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
2,False,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
3,False,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Malicious
4,False,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
5,False,False,False,False,False,False,False,False,False,False,...,false,false,false,true,false,false,false,false,false,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162978,False,False,False,False,False,False,False,False,False,False,...,false,false,false,false,false,false,false,false,false,Benign
162979,False,False,False,False,False,False,False,False,False,False,...,false,false,false,false,true,false,false,false,false,Benign
162980,False,False,False,False,False,False,False,False,False,False,...,false,false,false,false,true,false,false,false,false,Benign
162981,False,False,False,False,False,False,False,False,False,False,...,false,false,false,true,true,false,false,false,false,Benign


### 4) Conversão de tipos de dados

In [98]:
# Encode the label (Benign -> 0, Malicious -> 1), turn both spellings of the
# boolean flags ('false'/'true' strings and False/True) into 0/1, and only
# then drop the rows that became exact duplicates.
label_codes = {'Benign' : 0, 'Malicious': 1}
df = (
    df.assign(**{'class': df['class'].replace(label_codes)})
    .replace(['false', 'true', False, True], [0, 1, 0, 1])
    .drop_duplicates()
)
df.head()

Unnamed: 0,Dangerous Permission: ACCESS_SUPERUSER,Dangerous Permission: CHANGE_COMPONENT_ENABLED_STATE,Dangerous Permission: CLEAR_APP_USER_DATA,Dangerous Permission: DELETE_CACHE_FILES,Dangerous Permission: DELETE_PACKAGES,Dangerous Permission: DISABLE_KEYGUARD,Dangerous Permission: FACTORY_TEST,Dangerous Permission: INSTALL_PACKAGES,Dangerous Permission: INJECT_EVENTS,Dangerous Permission: INTERNAL_SYSTEM_WINDOW,...,Suspicious Intent Filter: USER_PRESENT,Hidden Apk,Sends SMS to Suspicious Number(s),Package Domain Exists,Reads phone data at startup,Sends SMS at startup,Starts service at startup,Sends SMS when receiving SMS,Accesses a database when receiving SMS,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [99]:
# Show the columns whose dtype is neither int nor float.
df.select_dtypes(exclude=['int','float'])

Unnamed: 0,Classes with Ad prefix,Total number of packages,Total number of classes,Number of classes in main package,Average class size,Size of apk,Number of images,Number of files,Number of permissions,Number of activities,Number of services,Number of receivers
0,0,3,13,10,1776.846154,1474118,88,330,0,1,0,0
2,0,3,14,0,2001.857143,1616255,56,289,9,1,0,0
3,1,95,1406,0,10916.800142,2497258,3,1437,9,7,1,4
4,0,2,13,0,2017.384615,1942479,181,269,8,1,0,0
5,0,1,10,0,3019,498359,204,283,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
162978,0,2,52,0,57225.019231,1154683,1,114,3,1,0,1
162979,0,6,53,0,7713.679245,1194699,193,263,3,1,0,0
162980,0,6,53,0,7714.320755,1017805,191,261,0,1,0,0
162981,1,9,148,0,7578.695946,3003014,97,266,18,5,0,0


In [100]:
# Convert the non-int/float columns in place, then re-run the same dtype
# filter to see which columns (if any) are still not int/float.
cast_to_numeric_inplace(df)
df.select_dtypes(exclude=['int','float'])

0
2
3
4
5
...
162978
162979
162980
162981
162982


In [101]:
# Preview the first rows of the frame after the numeric conversion.
df.head()

Unnamed: 0,Dangerous Permission: ACCESS_SUPERUSER,Dangerous Permission: CHANGE_COMPONENT_ENABLED_STATE,Dangerous Permission: CLEAR_APP_USER_DATA,Dangerous Permission: DELETE_CACHE_FILES,Dangerous Permission: DELETE_PACKAGES,Dangerous Permission: DISABLE_KEYGUARD,Dangerous Permission: FACTORY_TEST,Dangerous Permission: INSTALL_PACKAGES,Dangerous Permission: INJECT_EVENTS,Dangerous Permission: INTERNAL_SYSTEM_WINDOW,...,Suspicious Intent Filter: USER_PRESENT,Hidden Apk,Sends SMS to Suspicious Number(s),Package Domain Exists,Reads phone data at startup,Sends SMS at startup,Starts service at startup,Sends SMS when receiving SMS,Accesses a database when receiving SMS,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [102]:
# Number of samples per label (0 = Benign, 1 = Malicious).
df['class'].value_counts()

0    86562
1    10170
Name: class, dtype: int64

In [103]:
# Write the cleaned AndroCrawl frame to disk. to_csv is called with its
# defaults, so the frame's index is saved as the first CSV column.
df.to_csv('Androcrawl_LIMPO.csv')

Aqui vou gerar uma versão do dataset limpo do Androcrawl contendo apenas as features catalogadas pela Tainá.

In [104]:
# Export a reduced version of the cleaned AndroCrawl frame that keeps only
# the features also listed as columns of the CPI catalogue file.
df3 = pd.read_csv('datasets/AndroCrawl_CPI.csv', index_col=0, low_memory=False)
# Strip the single quotes before matching, as was done for the main frame's
# column names. dict.fromkeys drops repeated names while keeping the
# catalogue order.
catalogued = dict.fromkeys(col.replace("'", "") for col in df3.columns)
# 'class' is appended explicitly below, so it is skipped here; otherwise a
# 'class' column in the catalogue file would be exported twice.
cols_to_keep = [col for col in catalogued if col in df.columns and col != 'class']
df[cols_to_keep + ['class']].to_csv('Androcrawl_LIMPO_apiCalls_intents_permissions.csv')

## Android Permissions Cleaning Dataset

In [4]:
# Load the Android permissions dataset, using the first CSV column as index.
df2 = pd.read_csv('./Android_permissions_Cleaning.csv', index_col=0)
df2

Unnamed: 0,App,Package,Category,Description,Rating,Number of ratings,Price,Related apps,Dangerous permissions count,Safe permissions count,...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class
0,Canada Post Corporation,com.canadapost.android,Business,Canada Post Mobile App gives you access to som...,3.1,77.0,0.0,"{com.adaffix.pub.ca.android, com.kevinquan.gas...",7.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Benign
1,Word Farm,com.realcasualgames.words,Brain & Puzzle,Speed and strategy combine in this exciting wo...,4.3,199.0,0.0,"{air.com.zubawing.FastWordLite, com.joybits.do...",,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,Fortunes of War FREE,fortunesofwar.free,Cards & Casino,"Fortunes of War is a fast-paced, easy to learn...",4.1,243.0,0.0,"{com.kevinquan.condado, hu.monsta.pazaak, net....",1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,Better Keyboard: Avatar Purple,com.cc.betterkeyboard.skins.avatarpurple,Libraries & Demo,Skin for Better Keyboard featuring a glossy fe...,3.6,2589.0,0.0,{eu.gdumoulin.betterandroid.skins.transparent....,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29994,Beechwood Hotel,smartstay.beechwood,Travel & Local,"Beechwood is an Android hotel app for guests, ...",5.0,2.0,0.0,"{com.bluewaterpages, com.travelconnection.luxu...",,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Malware
29995,,,,,,,,,,,...,,,,,,,,,,
29996,,,,,,,,,,,...,,,,,,,,,,
29997,,,,,,,,,,,...,,,,,,,,,,


In [106]:
# Print the distinct values of every column (NaN included) to inspect the data.
for col, unique_values in get_unique_values(df2):
    print(col, unique_values)

App ['Canada Post Corporation' 'Word Farm' 'Fortunes of War FREE' ...
 'Paid Taking Surveys' 'FORUM CU Mobile DeposZip' 'Beechwood Hotel']
Package ['com.canadapost.android' 'com.realcasualgames.words' 'fortunesofwar.free'
 ... 'com.v1_4.B06F370884C1473445134E7C.com'
 'com.Vertifi.DeposZip.P274074037' 'smartstay.beechwood']
Category ['Business' 'Brain & Puzzle' 'Cards & Casino' 'Libraries & Demo' nan
 'Personalization' 'Arcade & Action' 'Social' 'Health & Fitness'
 'Entertainment' 'Casual' 'Media & Video' 'Finance' 'Music & Audio'
 'Books & Reference' 'Tools' 'Education' 'Shopping' 'Travel & Local'
 'Lifestyle' 'Sports' 'Racing' 'Productivity' 'Communication'
 'Transportation' 'Comics' 'Medical' 'Sports Games' 'Photography'
 'News & Magazines' 'Weather']
Description ['Canada Post Mobile App gives you access to some of the the most popular tools on <a href="http://www.google.com/url?q=http://canadapost.ca&usg=AFQjCNGLsUtAzxPlFZBfYYGjv6mnRiMZJw" target="_blank">canadapost.ca</a>.  You can

Network communication : view network state (S) [ 1.  0. nan]
Phone calls : intercept outgoing calls (D) [ 0. nan  1.]
Phone calls : modify phone state (S) [ 0. nan  1.]
Phone calls : read phone state and identity (D) [ 0.  1. nan]
Services that cost you money : directly call phone numbers (D) [ 1.  0. nan]
Services that cost you money : send SMS messages (D) [ 0. nan  1.]
Storage : modify/delete USB storage contents modify/delete SD card contents (D) [ 1.  0. nan]
System tools : allow Wi-Fi Multicast reception (D) [ 0. nan  1.]
System tools : automatically start at boot (S) [ 0. nan  1.]
System tools : bluetooth administration (D) [ 0. nan  1.]
System tools : change Wi-Fi state (D) [ 0. nan  1.]
System tools : change background data usage setting (S) [ 0. nan]
System tools : change network connectivity (D) [ 0. nan  1.]
System tools : change your UI settings (D) [ 0. nan  1.]
System tools : delete all application cache data (D) [ 0. nan  1.]
System tools : disable keylock (D) [ 0. nan 

### 1) Verificar e remover valores faltantes

In [107]:
# Drop the rows that have no value in the 'Class' (label) column.
df2 = df2.dropna(subset=['Class'])

Existem amostras com valores faltantes (`NaN`) nas colunas `Dangerous permissions count` e `Safe permissions count`. O motivo disso é que há amostras de permissões com valores faltantes também. Logo, precisamos remover colunas (e não linhas/amostras, mais sobre isso a seguir) com features de permissões que tenham valores faltantes e recalcular `Dangerous permissions count` e `Safe permissions count`. 

Se tentássemos remover as linhas com permissões faltantes, nenhuma linha restaria:

In [108]:
def _permission_cols_tagged(tag):
    """Return the current ``df2`` columns whose name, spaces removed, ends with ``tag``."""
    return [col for col in df2.columns if col.replace(' ', '').endswith(tag)]


def get_danger_permission_cols():
    """Return the ``df2`` columns marked as dangerous permissions, i.e. ending in '(D)'."""
    return _permission_cols_tagged('(D)')


def get_safe_permission_cols():
    """Return the ``df2`` columns marked as safe permissions, i.e. ending in '(S)'."""
    return _permission_cols_tagged('(S)')


permission_cols = get_danger_permission_cols() + get_safe_permission_cols()
# Display only: the result is not assigned, so df2 is left unchanged.
df2.dropna(subset=permission_cols)

Unnamed: 0,App,Package,Category,Description,Rating,Number of ratings,Price,Related apps,Dangerous permissions count,Safe permissions count,...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class


In [109]:
# Permission columns without any NaN are the valid ones; every other
# permission column is removed from df2.
permission_frame = df2[permission_cols]
valid_cols = permission_frame.dropna(axis=1).columns
incomplete_cols = [col for col in permission_cols if col not in valid_cols]
df2 = df2.drop(columns=incomplete_cols)
df2

Unnamed: 0,App,Package,Category,Description,Rating,Number of ratings,Price,Related apps,Dangerous permissions count,Safe permissions count,...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class
0,Canada Post Corporation,com.canadapost.android,Business,Canada Post Mobile App gives you access to som...,3.1,77.0,0.0,"{com.adaffix.pub.ca.android, com.kevinquan.gas...",7.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Benign
1,Word Farm,com.realcasualgames.words,Brain & Puzzle,Speed and strategy combine in this exciting wo...,4.3,199.0,0.0,"{air.com.zubawing.FastWordLite, com.joybits.do...",,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,Fortunes of War FREE,fortunesofwar.free,Cards & Casino,"Fortunes of War is a fast-paced, easy to learn...",4.1,243.0,0.0,"{com.kevinquan.condado, hu.monsta.pazaak, net....",1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,Better Keyboard: Avatar Purple,com.cc.betterkeyboard.skins.avatarpurple,Libraries & Demo,Skin for Better Keyboard featuring a glossy fe...,3.6,2589.0,0.0,{eu.gdumoulin.betterandroid.skins.transparent....,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
5,Ms Claus Live Wallpaper,tmc.christmaslady.livewallpaper,Personalization,Ms Claus Live Wallpaper<p>Find more Free apps ...,4.5,16.0,0.0,"{tmc.christmassanta.livewallpaper, tmc.winterh...",,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29990,EonPhone,com.eonphone.sip,Communication,Eon Phone clearest VOIP calls to the USA and C...,3.0,47.0,0.0,"{com.mytunudialer.activity, cz.acrobits.softph...",11.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29991,Paid Taking Surveys,com.v1_4.B06F370884C1473445134E7C.com,Business,Get paid for your opinion by taking easy onlin...,0.0,0.0,0.0,"{es.feliperipoll.android.inga, com.appmakr.app...",,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
29992,TeleNav GPS Navigator,com.telenav.app.android.sprint,Travel & Local,"Get voice-guided GPS, maps, business listings ...",4.1,2664.0,0.0,"{com.rocketouch.CRCKosher, com.cb.sfl, com.kos...",15.0,7.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29993,FORUM CU Mobile DeposZip,com.Vertifi.DeposZip.P274074037,Finance,Make check deposits to your FORUM Credit Union...,4.8,4.0,0.0,"{com.forumcu.cuonline, com.msi.bktest, com.qua...",2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Malware


In [110]:
# recalcular as colunas que contam permissões perigosas e seguras
df2['Dangerous permissions count'] = df2[get_danger_permission_cols()].sum(axis=1)
df2['Safe permissions count'] = df2[get_safe_permission_cols()].sum(axis=1)
df2

Unnamed: 0,App,Package,Category,Description,Rating,Number of ratings,Price,Related apps,Dangerous permissions count,Safe permissions count,...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class
0,Canada Post Corporation,com.canadapost.android,Business,Canada Post Mobile App gives you access to som...,3.1,77.0,0.0,"{com.adaffix.pub.ca.android, com.kevinquan.gas...",6.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Benign
1,Word Farm,com.realcasualgames.words,Brain & Puzzle,Speed and strategy combine in this exciting wo...,4.3,199.0,0.0,"{air.com.zubawing.FastWordLite, com.joybits.do...",3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,Fortunes of War FREE,fortunesofwar.free,Cards & Casino,"Fortunes of War is a fast-paced, easy to learn...",4.1,243.0,0.0,"{com.kevinquan.condado, hu.monsta.pazaak, net....",1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,Better Keyboard: Avatar Purple,com.cc.betterkeyboard.skins.avatarpurple,Libraries & Demo,Skin for Better Keyboard featuring a glossy fe...,3.6,2589.0,0.0,{eu.gdumoulin.betterandroid.skins.transparent....,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
5,Ms Claus Live Wallpaper,tmc.christmaslady.livewallpaper,Personalization,Ms Claus Live Wallpaper<p>Find more Free apps ...,4.5,16.0,0.0,"{tmc.christmassanta.livewallpaper, tmc.winterh...",3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29990,EonPhone,com.eonphone.sip,Communication,Eon Phone clearest VOIP calls to the USA and C...,3.0,47.0,0.0,"{com.mytunudialer.activity, cz.acrobits.softph...",11.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29991,Paid Taking Surveys,com.v1_4.B06F370884C1473445134E7C.com,Business,Get paid for your opinion by taking easy onlin...,0.0,0.0,0.0,"{es.feliperipoll.android.inga, com.appmakr.app...",3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
29992,TeleNav GPS Navigator,com.telenav.app.android.sprint,Travel & Local,"Get voice-guided GPS, maps, business listings ...",4.1,2664.0,0.0,"{com.rocketouch.CRCKosher, com.cb.sfl, com.kos...",15.0,6.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29993,FORUM CU Mobile DeposZip,com.Vertifi.DeposZip.P274074037,Finance,Make check deposits to your FORUM Credit Union...,4.8,4.0,0.0,"{com.forumcu.cuonline, com.msi.bktest, com.qua...",2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Malware


In [111]:
# List, in column order, every column of df2 that still has a missing value.
print("Colunas que ainda possuem NaN's:")
for col in df2.columns[df2.isna().any()]:
    print(col)

Colunas que ainda possuem NaN's:
App
Description
Related apps


### 2) Remoção de amostras com valores duplicados nas colunas

In [112]:
# Step 2: keep the first occurrence of each fully identical row.
df2 = df2.drop_duplicates()
df2

Unnamed: 0,App,Package,Category,Description,Rating,Number of ratings,Price,Related apps,Dangerous permissions count,Safe permissions count,...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class
0,Canada Post Corporation,com.canadapost.android,Business,Canada Post Mobile App gives you access to som...,3.1,77.0,0.0,"{com.adaffix.pub.ca.android, com.kevinquan.gas...",6.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Benign
1,Word Farm,com.realcasualgames.words,Brain & Puzzle,Speed and strategy combine in this exciting wo...,4.3,199.0,0.0,"{air.com.zubawing.FastWordLite, com.joybits.do...",3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,Fortunes of War FREE,fortunesofwar.free,Cards & Casino,"Fortunes of War is a fast-paced, easy to learn...",4.1,243.0,0.0,"{com.kevinquan.condado, hu.monsta.pazaak, net....",1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,Better Keyboard: Avatar Purple,com.cc.betterkeyboard.skins.avatarpurple,Libraries & Demo,Skin for Better Keyboard featuring a glossy fe...,3.6,2589.0,0.0,{eu.gdumoulin.betterandroid.skins.transparent....,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
5,Ms Claus Live Wallpaper,tmc.christmaslady.livewallpaper,Personalization,Ms Claus Live Wallpaper<p>Find more Free apps ...,4.5,16.0,0.0,"{tmc.christmassanta.livewallpaper, tmc.winterh...",3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29990,EonPhone,com.eonphone.sip,Communication,Eon Phone clearest VOIP calls to the USA and C...,3.0,47.0,0.0,"{com.mytunudialer.activity, cz.acrobits.softph...",11.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29991,Paid Taking Surveys,com.v1_4.B06F370884C1473445134E7C.com,Business,Get paid for your opinion by taking easy onlin...,0.0,0.0,0.0,"{es.feliperipoll.android.inga, com.appmakr.app...",3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
29992,TeleNav GPS Navigator,com.telenav.app.android.sprint,Travel & Local,"Get voice-guided GPS, maps, business listings ...",4.1,2664.0,0.0,"{com.rocketouch.CRCKosher, com.cb.sfl, com.kos...",15.0,6.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29993,FORUM CU Mobile DeposZip,com.Vertifi.DeposZip.P274074037,Finance,Make check deposits to your FORUM Credit Union...,4.8,4.0,0.0,"{com.forumcu.cuonline, com.msi.bktest, com.qua...",2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Malware


### 3) Remoção de características irrelevantes

In [113]:
# First drop six columns selected by name, then let the helper strip (and
# report) every remaining column that holds fewer than two distinct values.
df2 = drop_irrelevant_columns(
    df2.drop(columns=['App', 'Package', 'Category', 'Description', 'Related apps', 'Price'])
)
df2

21 colunas irrelevantes:
['Default : Modify Google service configuration (S)', 'Default : access to passwords for Google accounts (S)', 'Default : act as an account authenticator (S)', 'Default : coarse (network-based) location (S)', 'Default : discover known accounts (S)', 'Default : full Internet access (S)', 'Default : mock location sources for testing (S)', 'Default : modify/delete USB storage contents modify/delete SD card contents (S)', 'Default : permanently disable device (S)', 'Default : read instant messages (S)', 'Default : reset system to factory defaults (S)', 'Default : run in factory test mode (S)', 'Default : set wallpaper size hints (S)', 'Default : start IM service (S)', 'Default : write contact data (S)', 'Default : write instant messages (S)', 'Network communication : download files without notification (S)', 'System tools : change background data usage setting (S)', 'Your accounts : Blogger (D)', 'Your messages : read Gmail attachment previews (D)', 'Your messages 

Unnamed: 0,Rating,Number of ratings,Dangerous permissions count,Safe permissions count,Default : Access Email provider data (S),Default : Advanced download manager functions. (S),Default : Audio File Access (S),Default : Install DRM content. (S),Default : Modify Google settings (S),Default : Move application resources (S),...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class
0,3.1,77.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Benign
1,4.3,199.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,4.1,243.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,3.6,2589.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
5,4.5,16.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29990,3.0,47.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29991,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
29992,4.1,2664.0,15.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Malware
29993,4.8,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Malware


### 4) Conversão de tipos de dados

Apenas a coluna `Class` precisa ser corrigida

In [114]:
# Show which columns are still non-numeric and which labels `Class` holds
# before it is encoded.
print(df2.select_dtypes(exclude=['int', 'float']))
print(df2['Class'].unique())
# Encode the label as 0 (Benign) / 1 (Malware).
# `map` followed by an explicit `astype` pins the resulting dtype instead of
# relying on `replace` silently downcasting an object column, a behaviour that
# recent pandas releases deprecate. A label missing from the mapping becomes
# NaN, so `astype` raises instead of letting an unencoded value slip through.
df2['Class'] = df2['Class'].map({'Benign': 0, 'Malware': 1}).astype('int64')

         Class
0       Benign
1       Benign
2       Benign
3       Benign
5       Benign
...        ...
29990  Malware
29991   Benign
29992  Malware
29993  Malware
29994  Malware

[27298 rows x 1 columns]
['Benign' 'Malware']


In [115]:
# Display the frame again to confirm that `Class` now holds the numeric codes.
df2

Unnamed: 0,Rating,Number of ratings,Dangerous permissions count,Safe permissions count,Default : Access Email provider data (S),Default : Advanced download manager functions. (S),Default : Audio File Access (S),Default : Install DRM content. (S),Default : Modify Google settings (S),Default : Move application resources (S),...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class
0,3.1,77.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,4.3,199.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,4.1,243.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3.6,2589.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,4.5,16.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29990,3.0,47.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
29991,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
29992,4.1,2664.0,15.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
29993,4.8,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [116]:
# Count the samples per class code (0 = Benign, 1 = Malware).
df2['Class'].value_counts()

1    18224
0     9074
Name: Class, dtype: int64

In [117]:
# Persist the cleaned dataset to disk. Because `index` is left at its default,
# the DataFrame index is written as the first (unnamed) CSV column.
# NOTE(review): if downstream readers do not expect that extra column, pass
# index=False here — confirm against the consumers of this file.
df2.to_csv("Android_Permissions_Cleaning_Dataset_LIMPO.csv")