# Malware-Related Activity Prediction Pipeline

#### Imports and Data Load

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the four source tables. Every file is tab-separated, which is the
# default separator of pd.read_table, so no explicit delimiter is passed.
connections = pd.read_table("data/connections.csv")
devices = pd.read_table("data/devices.csv")
processes = pd.read_table("data/processes.csv")
profiles = pd.read_table("data/profiles.csv")

## 1 Phase I

### 1.1 Basic Data Description and Characteristics

#### 1.1.1 Data Structure Analysis

In [6]:
# Preview the first rows of the connections table to see its columns and value formats.
connections.head()

Unnamed: 0,ts,imei,mwra,c.android.youtube,c.katana,c.android.gm,c.android.chrome,c.dogalize,c.updateassist,c.android.vending,c.UCMobile.x86,c.UCMobile.intl,c.raider
0,2018-05-05 10:00:00,8630330696303481495,1.0,12.26513,9.00831,9.49543,11.28728,10.38606,37.47582,63.36036,41.38373,83.30335,74.60602
1,2018-05-05 10:01:00,359043379931766510,0.0,10.99262,9.58422,13.94219,13.5727,14.60374,26.04814,10.12276,39.58289,2.53433,69.83137
2,2018-05-05 10:02:00,8630330696303481107,1.0,15.36004,10.17672,11.29302,13.42295,11.4689,62.91819,43.93041,65.39351,66.58459,11.94376
3,2018-05-05 10:03:00,863033069630348180,0.0,9.85083,10.50735,11.33704,9.01125,12.00125,44.88155,67.47714,33.1843,40.30341,37.94169
4,2018-05-05 10:04:00,8630330696303482360,0.0,14.09594,11.07454,9.21168,8.94069,12.61083,61.19159,64.32489,32.20124,63.26817,99.50581


In [7]:
# Print the connections schema: per-column dtype and non-null count, plus memory usage.
connections.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15160 entries, 0 to 15159
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ts                 15160 non-null  object 
 1   imei               15160 non-null  int64  
 2   mwra               15160 non-null  float64
 3   c.android.youtube  15160 non-null  float64
 4   c.katana           15160 non-null  float64
 5   c.android.gm       15160 non-null  float64
 6   c.android.chrome   15160 non-null  float64
 7   c.dogalize         15160 non-null  float64
 8   c.updateassist     15160 non-null  float64
 9   c.android.vending  15160 non-null  float64
 10  c.UCMobile.x86     15160 non-null  float64
 11  c.UCMobile.intl    15160 non-null  float64
 12  c.raider           15160 non-null  float64
dtypes: float64(11), int64(1), object(1)
memory usage: 1.5+ MB


In [8]:
# Preview the first rows of the devices table to see its columns and value formats.
devices.head()

Unnamed: 0,latitude,longitude,store_name,code,location,imei
0,46.08333,122.08333,Ulanhot,CN,Asia/Shanghai,8630330696303481735
1,16.68911,98.50893,Myawadi,MM,Asia/Yangon,8630330696303481164
2,38.96667,-0.18333,Gandia,ES,Europe/Madrid,8630330696303481057
3,48.52961,12.16179,Landshut,DE,Europe/Berlin,359043379931766353
4,56.85836,35.90057,Tver,RU,Europe/Moscow,3590433799317661941


In [9]:
# Print the devices schema; the non-null counts show which columns have missing values.
devices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2980 entries, 0 to 2979
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   latitude    2980 non-null   float64
 1   longitude   2980 non-null   float64
 2   store_name  2980 non-null   object 
 3   code        2977 non-null   object 
 4   location    2980 non-null   object 
 5   imei        2980 non-null   int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 139.8+ KB


In [10]:
# Preview the first rows of the processes table to see its columns and value formats.
processes.head()

Unnamed: 0,ts,imei,mwra,p.android.gm,p.android.externalstorage,p.android.chrome,p.system,p.android.settings,p.android.packageinstaller,p.android.documentsui,...,p.android.defcontainer,p.android.vending,p.process.gapps,p.simulator,p.dogalize,p.android.gms,p.browser.provider,p.gms.persistent,p.katana,p.inputmethod.latin
0,2018-05-05 10:00:00,8630330696303481495,1.0,8.86395,13.02477,10.8675,7.49424,15.51358,9.41107,15.3919,...,44.94347,99.03688,54.05561,47.74283,62.14289,28.71385,32.05187,86.35658,79.93212,65.50034
1,2018-05-05 10:01:00,359043379931766510,0.0,9.91497,14.95342,8.75484,11.12868,12.84456,9.83628,14.32174,...,30.23815,24.04779,83.70527,10.86666,82.33455,76.61692,32.05482,98.16903,7.14569,53.82484
2,2018-05-05 10:02:00,8630330696303481107,1.0,8.85825,12.41158,6.81129,6.27154,14.63611,11.36688,11.93349,...,52.1996,2.3333,6.62872,51.04675,28.04786,29.1432,50.56621,84.50028,57.20779,58.70091
3,2018-05-05 10:03:00,863033069630348180,0.0,9.10183,14.59227,8.36162,13.74939,12.17126,11.66347,12.27116,...,68.17462,25.85253,10.93414,37.06926,47.92182,34.653,14.38753,44.09099,41.96218,75.97173
4,2018-05-05 10:04:00,8630330696303482360,0.0,9.46824,12.61537,14.8991,7.73386,12.17794,14.301,13.77368,...,54.84178,6.55989,70.29777,76.34902,39.33797,7.34483,27.86712,95.83795,55.63924,66.55716


In [11]:
# Print the processes schema: per-column dtype and non-null count, plus memory usage.
processes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15160 entries, 0 to 15159
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ts                          15160 non-null  object 
 1   imei                        15160 non-null  int64  
 2   mwra                        15160 non-null  float64
 3   p.android.gm                15160 non-null  float64
 4   p.android.externalstorage   15160 non-null  float64
 5   p.android.chrome            15160 non-null  float64
 6   p.system                    15160 non-null  float64
 7   p.android.settings          15160 non-null  float64
 8   p.android.packageinstaller  15160 non-null  float64
 9   p.android.documentsui       15160 non-null  float64
 10  p.google                    15160 non-null  float64
 11  p.notifier                  15160 non-null  float64
 12  p.olauncher                 15160 non-null  float64
 13  p.android.defcontainer      151

In [12]:
# Preview the first rows of the profiles table to see its columns and value formats.
# NOTE(review): the rendered output includes the ssn, name, mail and residence
# columns — confirm the data is synthetic before sharing or committing this output.
profiles.head()

Unnamed: 0,residence,job,birthdate,current_location,imei,ssn,company,registration,name,username,user_id,mail
0,024 Austin Throughway Apt. 956\r\nLake Elizabe...,,,"(Decimal('-37.3871205'), Decimal('63.057817'))",8630330696303481297,045-68-8610,Fuentes-Wilson,"05/24/2018, 00:00:00",Elizabeth Brooks,oconnorsandra,698,ajoseph@yahoo.com
1,,,1986-04-01,"(Decimal('-9.8577275'), Decimal('10.693479'))",863033069630348487,655-75-2092,Gomez Ltd,2018/12/10,Spencer Brooks,lwright,15,michaelyates@yahoo.com
2,"219 Sarah Causeway\r\nSandraland, NV 51026",,,"(Decimal('-69.389991'), Decimal('-154.055961'))",8630330696303482485,047-42-5809,Robinson and Sons,13 Mar 2024,Monica Mendoza,seangarner,2339,joneschristina@hotmail.com
3,,,,"(Decimal('86.206030'), Decimal('-46.129473'))",359043379931766320,611-26-7229,"Bowman, White and Diaz",2022-09-09,James Hurley MD,anthonytucker,777,angelavaughan@gmail.com
4,55553 Jennifer Hollow Suite 158\r\nLake Crysta...,,,"(Decimal('15.107049'), Decimal('150.404188'))",3590433799317661461,313-91-3441,Coleman-Riley,2023/10/22,Anthony Sherman,warnersean,1966,mcgeealexander@gmail.com


In [13]:
# Print the profiles schema; the non-null counts show which columns have missing values.
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   residence         882 non-null    object
 1   job               756 non-null    object
 2   birthdate         1386 non-null   object
 3   current_location  2394 non-null   object
 4   imei              2520 non-null   int64 
 5   ssn               2520 non-null   object
 6   company           2520 non-null   object
 7   registration      2520 non-null   object
 8   name              2520 non-null   object
 9   username          2520 non-null   object
 10  user_id           2520 non-null   int64 
 11  mail              2520 non-null   object
dtypes: int64(2), object(10)
memory usage: 236.4+ KB


#### 1.1.2 Individual Attribute Analysis

#### 1.1.3 Paired Data Analysis - Identifying Relationships and Dependencies Between Attribute Pairs

#### 1.1.4 Paired Data Analysis - Potential Predictors

### 1.2 Problem Identification, Data Integration and Cleaning

#### 1.2.1 Identifying and Resolving Data Problems

#### 1.2.2 Handling Missing Values

#### 1.2.3 Outlier Detection

### 1.3 Formulation and Statistical Verification of Hypotheses

#### 1.3.1 Hypotheses

##### 1.3.1.1 Hypothesis I

##### 1.3.1.2 Hypothesis II

#### 1.3.2 Checking the Statistical Power (Data Support) of the Tests