# Phase 2

In [131]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import datetime
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler

## 2.1 Data preprocessing

In [113]:
def convert_to_datetime(value):
    date_formats = ["%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y, %H:%M:%S", "%d %b %Y"]
    for fmt in date_formats:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None

def preprocess_data():
    connections, devices, processes, profiles = pd.read_csv('data/connections.csv', sep='\t', keep_default_na=False, na_values=''), pd.read_csv('data/devices.csv', sep='\t', keep_default_na=False, na_values=''), pd.read_csv('data/processes.csv', sep='\t', keep_default_na=False, na_values=''), pd.read_csv('data/profiles.csv', sep='\t', keep_default_na=False, na_values='')

    print("Connections beginning", connections.shape)
    print("Processes end", processes.shape)

    connections['ts'] = pd.to_datetime(connections['ts'])
    processes['ts'] = pd.to_datetime(processes['ts'])

    connections = connections.drop_duplicates()
    processes = processes.drop_duplicates()

    connections['mwra'] = connections['mwra'].astype(int)
    processes['mwra'] = processes['mwra'].astype(int)

    lower = connections['c.dogalize'].quantile(0.25) - 1.5 * stats.iqr(connections['c.dogalize'])
    upper = connections['c.dogalize'].quantile(0.75) + 1.5 * stats.iqr(connections['c.dogalize'])
    connections['c.dogalize'] = np.where(connections['c.dogalize'] < lower, lower, connections['c.dogalize'])
    connections['c.dogalize'] = np.where(connections['c.dogalize'] > upper, upper, connections['c.dogalize'])

    lower = connections['c.android.youtube'].quantile(0.25) - 1.5 * stats.iqr(connections['c.android.youtube'])
    upper = connections['c.android.youtube'].quantile(0.75) + 1.5 * stats.iqr(connections['c.android.youtube'])
    connections = connections[(connections['c.android.youtube'] >= lower) & (connections['c.android.youtube'] <= upper)]

    Q1 = processes['p.android.documentsui'].quantile(0.25)
    Q3 = processes['p.android.documentsui'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    processes['p.android.documentsui'] = processes['p.android.documentsui'].clip(lower=lower_bound, upper=upper_bound)

    print("Connections end", connections.shape)
    print("Processes end", processes.shape)

    merged = processes.merge(connections, on=['ts', 'imei', 'mwra'], how='inner')
    print("Merged", merged.shape)

    return merged

In [114]:
data = preprocess_data()

Connections beginning (15074, 13)
Processes end (15074, 23)
Connections end (14878, 13)
Processes end (14895, 23)
Merged (14878, 33)


In [115]:
def split_data(data, ratio): #ratio = training subset percentage
    slice_index = (data.shape[0]*ratio)//100
    training_data = data[:slice_index]
    test_data = data[slice_index:]
    return training_data, test_data

In [116]:
training_data, test_data = split_data(data,75)
print("training: ", training_data.shape)
print("test: ", test_data.shape)

training:  (11158, 33)
test:  (3720, 33)


### Data Normalization

In [117]:
training_data.head()

Unnamed: 0,ts,imei,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,2018-05-05 10:00:00,3590433799317661107,0,7.41473,10.17656,14.80917,12.14702,11.5562,8.33912,14.0245,...,11.65403,10.65335,10.48791,16.9162,5.04564,36.21508,14.24956,4.09319,15.20934,5.11764
1,2018-05-05 10:01:00,3590433799317662063,1,9.40603,6.57378,6.06519,10.56643,16.74062,13.78434,7.57297,...,10.29551,10.46363,10.98438,15.65637,14.82931,24.83765,57.49911,23.94919,8.82448,44.1357
2,2018-05-05 10:02:00,863033069630348313,0,13.61225,11.73312,8.99679,10.79425,12.60312,10.78121,11.90788,...,12.12831,9.53752,6.7808,6.98948,10.92433,3.29441,20.37891,52.55353,49.51037,99.75215
3,2018-05-05 10:03:00,359043379931766114,1,8.1497,9.53996,8.28249,10.80629,13.9767,12.73047,11.64714,...,11.06419,11.55759,8.87744,11.84499,15.45902,24.78878,36.40299,25.77644,21.84167,65.17774
4,2018-05-05 10:04:00,3590433799317661842,1,8.20358,8.94156,8.62248,8.33003,13.44049,13.31239,11.40689,...,12.08457,10.4286,9.82241,12.48869,12.85363,40.98237,74.13316,96.7745,18.48116,90.77304


In [119]:
feature_columns = ['p.android.chrome', 'p.android.documentsui', 'p.android.gm']

In [120]:
def min_max_scaling(data, feature_columns):
    scaler = MinMaxScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    return data

In [121]:
normalized_data = min_max_scaling(training_data, feature_columns)
normalized_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature_columns] = scaler.fit_transform(data[feature_columns])


Unnamed: 0,ts,imei,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,2018-05-05 10:00:00,3590433799317661107,0,0.318339,0.490964,0.681877,12.14702,11.55620,8.33912,14.02450,...,11.65403,10.65335,10.48791,16.91620,5.04564,36.21508,14.24956,4.09319,15.20934,5.11764
1,2018-05-05 10:01:00,3590433799317662063,1,0.427289,0.205080,0.204867,10.56643,16.74062,13.78434,7.57297,...,10.29551,10.46363,10.98438,15.65637,14.82931,24.83765,57.49911,23.94919,8.82448,44.13570
2,2018-05-05 10:02:00,863033069630348313,0,0.657424,0.614478,0.364795,10.79425,12.60312,10.78121,11.90788,...,12.12831,9.53752,6.78080,6.98948,10.92433,3.29441,20.37891,52.55353,49.51037,99.75215
3,2018-05-05 10:03:00,359043379931766114,1,0.358551,0.440449,0.325828,10.80629,13.97670,12.73047,11.64714,...,11.06419,11.55759,8.87744,11.84499,15.45902,24.78878,36.40299,25.77644,21.84167,65.17774
4,2018-05-05 10:04:00,3590433799317661842,1,0.361499,0.392965,0.344375,8.33003,13.44049,13.31239,11.40689,...,12.08457,10.42860,9.82241,12.48869,12.85363,40.98237,74.13316,96.77450,18.48116,90.77304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,2018-05-13 04:07:00,8630330696303482394,0,0.674334,0.735604,0.324217,8.30896,11.22312,8.48987,9.49179,...,12.15486,7.89836,5.73909,6.31173,13.66826,13.68408,77.09689,31.92989,16.11913,67.12787
11154,2018-05-13 04:08:00,8630330696303481149,0,0.521492,0.407818,0.496179,12.59512,9.24899,13.94088,12.59623,...,10.72010,5.90902,8.43279,8.10863,5.78574,96.78929,7.26321,30.14710,49.65198,50.11389
11155,2018-05-13 04:09:00,8630330696303482071,1,0.545218,0.718159,0.527567,4.87616,13.14365,7.40306,10.81714,...,10.90218,14.52762,13.92867,16.00132,11.27901,30.13061,51.31294,88.02742,58.81128,62.34618
11156,2018-05-13 04:10:00,863033069630348644,0,0.517123,0.767216,0.329142,11.84447,9.02283,12.40197,12.06240,...,13.77264,6.77637,7.82257,6.67953,8.42699,1.12654,93.45526,48.25914,92.08085,37.28778


### Data Standardization

In [122]:
def standard_scaling(data, feature_columns):
    scaler = StandardScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    return data

In [123]:
standardized_data = standard_scaling(training_data, feature_columns)
standardized_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature_columns] = scaler.fit_transform(data[feature_columns])


Unnamed: 0,ts,imei,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,2018-05-05 10:00:00,3590433799317661107,0,-1.242509,0.044947,2.269220,12.14702,11.55620,8.33912,14.02450,...,11.65403,10.65335,10.48791,16.91620,5.04564,36.21508,14.24956,4.09319,15.20934,5.11764
1,2018-05-05 10:01:00,3590433799317662063,1,-0.472420,-1.466121,-1.412354,10.56643,16.74062,13.78434,7.57297,...,10.29551,10.46363,10.98438,15.65637,14.82931,24.83765,57.49911,23.94919,8.82448,44.13570
2,2018-05-05 10:02:00,863033069630348313,0,1.154240,0.697795,-0.178030,10.79425,12.60312,10.78121,11.90788,...,12.12831,9.53752,6.78080,6.98948,10.92433,3.29441,20.37891,52.55353,49.51037,99.75215
3,2018-05-05 10:03:00,359043379931766114,1,-0.958277,-0.222054,-0.478779,10.80629,13.97670,12.73047,11.64714,...,11.06419,11.55759,8.87744,11.84499,15.45902,24.78878,36.40299,25.77644,21.84167,65.17774
4,2018-05-05 10:04:00,3590433799317661842,1,-0.937440,-0.473033,-0.335630,8.33003,13.44049,13.31239,11.40689,...,12.08457,10.42860,9.82241,12.48869,12.85363,40.98237,74.13316,96.77450,18.48116,90.77304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,2018-05-13 04:07:00,8630330696303482394,0,1.273765,1.338018,-0.491213,8.30896,11.22312,8.48987,9.49179,...,12.15486,7.89836,5.73909,6.31173,13.66826,13.68408,77.09689,31.92989,16.11913,67.12787
11154,2018-05-13 04:08:00,8630330696303481149,0,0.193435,-0.394527,0.836001,12.59512,9.24899,13.94088,12.59623,...,10.72010,5.90902,8.43279,8.10863,5.78574,96.78929,7.26321,30.14710,49.65198,50.11389
11155,2018-05-13 04:09:00,8630330696303482071,1,0.361136,1.245814,1.078255,4.87616,13.14365,7.40306,10.81714,...,10.90218,14.52762,13.92867,16.00132,11.27901,30.13061,51.31294,88.02742,58.81128,62.34618
11156,2018-05-13 04:10:00,863033069630348644,0,0.162551,1.505106,-0.453201,11.84447,9.02283,12.40197,12.06240,...,13.77264,6.77637,7.82257,6.67953,8.42699,1.12654,93.45526,48.25914,92.08085,37.28778


### Power trasformer

In [125]:
def power_transform(data, feature_columns):
    transformer = PowerTransformer()
    data[feature_columns] = transformer.fit_transform(data[feature_columns])
    return data

In [129]:
power_transformed_data = power_transform(training_data, feature_columns)
power_transformed_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature_columns] = transformer.fit_transform(data[feature_columns])


Unnamed: 0,ts,imei,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,2018-05-05 10:00:00,3590433799317661107,0,-1.302669,0.165463,1.981686,12.14702,11.55620,8.33912,14.02450,...,11.65403,10.65335,10.48791,16.91620,5.04564,36.21508,14.24956,4.09319,15.20934,5.11764
1,2018-05-05 10:01:00,3590433799317662063,1,-0.422777,-1.665786,-1.594204,10.56643,16.74062,13.78434,7.57297,...,10.29551,10.46363,10.98438,15.65637,14.82931,24.83765,57.49911,23.94919,8.82448,44.13570
2,2018-05-05 10:02:00,863033069630348313,0,1.137468,0.765307,-0.077957,10.79425,12.60312,10.78121,11.90788,...,12.12831,9.53752,6.78080,6.98948,10.92433,3.29441,20.37891,52.55353,49.51037,99.75215
3,2018-05-05 10:03:00,359043379931766114,1,-0.969171,-0.113921,-0.415377,10.80629,13.97670,12.73047,11.64714,...,11.06419,11.55759,8.87744,11.84499,15.45902,24.78878,36.40299,25.77644,21.84167,65.17774
4,2018-05-05 10:04:00,3590433799317661842,1,-0.945099,-0.396370,-0.251777,8.33003,13.44049,13.31239,11.40689,...,12.08457,10.42860,9.82241,12.48869,12.85363,40.98237,74.13316,96.77450,18.48116,90.77304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,2018-05-13 04:07:00,8630330696303482394,0,1.239669,1.280829,-0.429830,8.30896,11.22312,8.48987,9.49179,...,12.15486,7.89836,5.73909,6.31173,13.66826,13.68408,77.09689,31.92989,16.11913,67.12787
11154,2018-05-13 04:08:00,8630330696303481149,0,0.264487,-0.306106,0.882723,12.59512,9.24899,13.94088,12.59623,...,10.72010,5.90902,8.43279,8.10863,5.78574,96.78929,7.26321,30.14710,49.65198,50.11389
11155,2018-05-13 04:09:00,8630330696303482071,1,0.425008,1.209816,1.083748,4.87616,13.14365,7.40306,10.81714,...,10.90218,14.52762,13.92867,16.00132,11.27901,30.13061,51.31294,88.02742,58.81128,62.34618
11156,2018-05-13 04:10:00,863033069630348644,0,0.234436,1.407173,-0.385764,11.84447,9.02283,12.40197,12.06240,...,13.77264,6.77637,7.82257,6.67953,8.42699,1.12654,93.45526,48.25914,92.08085,37.28778


### Quantile transformer

In [128]:
def quantile_transform(data, feature_columns):
    transformer = QuantileTransformer(output_distribution='normal')
    data[feature_columns] = transformer.fit_transform(data[feature_columns])
    return data

In [130]:
quantile_transformed_data = quantile_transform(training_data, feature_columns)
quantile_transformed_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature_columns] = transformer.fit_transform(data[feature_columns])


Unnamed: 0,ts,imei,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,2018-05-05 10:00:00,3590433799317661107,0,-1.321696,0.188200,1.880902,12.14702,11.55620,8.33912,14.02450,...,11.65403,10.65335,10.48791,16.91620,5.04564,36.21508,14.24956,4.09319,15.20934,5.11764
1,2018-05-05 10:01:00,3590433799317662063,1,-0.369831,-1.701712,-1.613786,10.56643,16.74062,13.78434,7.57297,...,10.29551,10.46363,10.98438,15.65637,14.82931,24.83765,57.49911,23.94919,8.82448,44.13570
2,2018-05-05 10:02:00,863033069630348313,0,1.066915,0.703103,-0.087206,10.79425,12.60312,10.78121,11.90788,...,12.12831,9.53752,6.78080,6.98948,10.92433,3.29441,20.37891,52.55353,49.51037,99.75215
3,2018-05-05 10:03:00,359043379931766114,1,-0.953864,-0.078005,-0.435366,10.80629,13.97670,12.73047,11.64714,...,11.06419,11.55759,8.87744,11.84499,15.45902,24.78878,36.40299,25.77644,21.84167,65.17774
4,2018-05-05 10:04:00,3590433799317661842,1,-0.926131,-0.365683,-0.269182,8.33003,13.44049,13.31239,11.40689,...,12.08457,10.42860,9.82241,12.48869,12.85363,40.98237,74.13316,96.77450,18.48116,90.77304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,2018-05-13 04:07:00,8630330696303482394,0,1.190681,1.202923,-0.450295,8.30896,11.22312,8.48987,9.49179,...,12.15486,7.89836,5.73909,6.31173,13.66826,13.68408,77.09689,31.92989,16.11913,67.12787
11154,2018-05-13 04:08:00,8630330696303481149,0,0.247505,-0.270859,0.928107,12.59512,9.24899,13.94088,12.59623,...,10.72010,5.90902,8.43279,8.10863,5.78574,96.78929,7.26321,30.14710,49.65198,50.11389
11155,2018-05-13 04:09:00,8630330696303482071,1,0.389788,1.120692,1.125870,4.87616,13.14365,7.40306,10.81714,...,10.90218,14.52762,13.92867,16.00132,11.27901,30.13061,51.31294,88.02742,58.81128,62.34618
11156,2018-05-13 04:10:00,863033069630348644,0,0.221515,1.331008,-0.407512,11.84447,9.02283,12.40197,12.06240,...,13.77264,6.77637,7.82257,6.67953,8.42699,1.12654,93.45526,48.25914,92.08085,37.28778


### Robust Scaling

In [132]:
def robust_scaling(data, feature_columns):
    scaler = RobustScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    return data

In [133]:
robust_scaled_data = robust_scaling(training_data, feature_columns)
robust_scaled_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature_columns] = scaler.fit_transform(data[feature_columns])


Unnamed: 0,ts,imei,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,2018-05-05 10:00:00,3590433799317661107,0,-0.975050,0.137159,1.392015,12.14702,11.55620,8.33912,14.02450,...,11.65403,10.65335,10.48791,16.91620,5.04564,36.21508,14.24956,4.09319,15.20934,5.11764
1,2018-05-05 10:01:00,3590433799317662063,1,-0.269885,-1.255646,-1.188158,10.56643,16.74062,13.78434,7.57297,...,10.29551,10.46363,10.98438,15.65637,14.82931,24.83765,57.49911,23.94919,8.82448,44.13570
2,2018-05-05 10:02:00,863033069630348313,0,0.794491,0.516626,-0.061064,10.79425,12.60312,10.78121,11.90788,...,12.12831,9.53752,6.78080,6.98948,10.92433,3.29441,20.37891,52.55353,49.51037,99.75215
3,2018-05-05 10:03:00,359043379931766114,1,-0.702551,-0.059025,-0.318115,10.80629,13.97670,12.73047,11.64714,...,11.06419,11.55759,8.87744,11.84499,15.45902,24.78878,36.40299,25.77644,21.84167,65.17774
4,2018-05-05 10:04:00,3590433799317661842,1,-0.682006,-0.271035,-0.195420,8.33003,13.44049,13.31239,11.40689,...,12.08457,10.42860,9.82241,12.48869,12.85363,40.98237,74.13316,96.77450,18.48116,90.77304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,2018-05-13 04:07:00,8630330696303482394,0,0.886180,0.884977,-0.329137,8.30896,11.22312,8.48987,9.49179,...,12.15486,7.89836,5.73909,6.31173,13.66826,13.68408,77.09689,31.92989,16.11913,67.12787
11154,2018-05-13 04:08:00,8630330696303481149,0,0.187453,-0.201153,0.688554,12.59512,9.24899,13.94088,12.59623,...,10.72010,5.90902,8.43279,8.10863,5.78574,96.78929,7.26321,30.14710,49.65198,50.11389
11155,2018-05-13 04:09:00,8630330696303482071,1,0.292859,0.824376,0.834565,4.87616,13.14365,7.40306,10.81714,...,10.90218,14.52762,13.92867,16.00132,11.27901,30.13061,51.31294,88.02742,58.81128,62.34618
11156,2018-05-13 04:10:00,863033069630348644,0,0.168198,0.979372,-0.297550,11.84447,9.02283,12.40197,12.06240,...,13.77264,6.77637,7.82257,6.67953,8.42699,1.12654,93.45526,48.25914,92.08085,37.28778
