In [226]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
from openpyxl.descriptors import MinMax
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif, mutual_info_classif, RFE

# Phase 2: Data preprocessing

## First we redo the data changes from the 1st phase

In [2]:
connections, devices, processes, profiles = pd.read_csv('data/connections.csv', sep='\t', keep_default_na=False, na_values=''), pd.read_csv('data/devices.csv', sep='\t', keep_default_na=False, na_values=''), pd.read_csv('data/processes.csv', sep='\t', keep_default_na=False, na_values=''), pd.read_csv('data/profiles.csv', sep='\t', keep_default_na=False, na_values='')

Iterative way to redo the changes:

In [3]:
def get_outliers(column: pd.Series):
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    iqr = upper_quartile - lower_quartile
    return column[(column < lower_quartile - 1.5*iqr) | (column > upper_quartile + 1.5*iqr)]


In [5]:
def iterative_reformat(processes_ptr: pd.DataFrame, connections_ptr: pd.DataFrame) -> pd.DataFrame:
    connections_ptr['ts'] = pd.to_datetime(connections_ptr['ts'])
    processes_ptr['ts'] = pd.to_datetime(processes_ptr['ts'])
    merged = processes_ptr.merge(connections_ptr, on=['ts', 'imei', 'mwra'], how='inner')
    merged.drop(columns=['ts', 'imei'], inplace=True)
    to_drop = []
    # handle null values and outliers
    for column in merged.columns:
        # if more than 5% are NaN values or more than 5% are outliers, we don't use that column
        column_outliers = get_outliers(merged[column])
        if ((merged[column].isna().sum()/merged.shape[0] > 0.05) or 
            (column_outliers.shape[0] / merged.shape[0] > 0.05)):
            to_drop.append(column)
            continue
        # if there are some null values, we replace the data that's neutral in respect to mwra
        if merged[column].isnull().any():
            # we get means of the distributions for rows with present and non-present malware related activity
            means_per_mwra = merged.groupby('mwra')[column].mean()
            # we average those means, meaning the manufactured value won't be likely to affect predicted mwra 
            imputed_value = means_per_mwra.mean()
            merged[column].fillna(imputed_value, inplace=True)
        #  if there are any outliers, we replace them with the edge values. If we clipped all outliers, we would clutter way too much data together, so we clip only the most extreme ones
        if column_outliers.shape[0]:
            iqr = stats.iqr(merged[column])
            lower_limit = merged[column].quantile(0.25)  - 2.5 * iqr
            upper_limit = merged[column].quantile(0.75)  + 2.5 * iqr
            merged[column] = merged[column].clip(lower=lower_limit, upper=upper_limit)
    return merged.drop(columns=to_drop)

# Phase 2-1: Data transformation

## 2-1a & 2-1b
Splitting the data into training and testing sets + transforming data for ML

First we create a combined table for data to work with. As we learnt in the previous phase, we will use only connections and processes tables. Devices and profiles couldn't be connected logically with the other two tables. That's because there were multiple profiles/devices per imei. And it wasn't a fixed amount of profiles/devices per imei either, so we can't just make a column for all locations/usernames/etc. Even if we did that, there wasn't a correlation found between any of the columns in these tables and mwra.

In [7]:
combined_table = iterative_reformat(processes, connections)

now onto splitting the data into testing and training

In [8]:
# we separate the features and the target
X = combined_table.drop(columns=['mwra'])
y = combined_table['mwra']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

As we didn't use any non-numerical data, we don't need to do any conversions to numerical. Most of the data had huge cardinality either way which would increase likelihood of overfitting and difficulty of encoding. 

Example for what we would do if we were to use the categorical data from profiles and devices table

In [12]:
# if the cardinality was too high to use one hot encoding, we can hash the values and now they are numbers
mail_encoded = profiles['mail'].apply(lambda x: hash(x))
profiles['mail'].nunique(), mail_encoded.nunique()

(2594, 2594)

In [20]:
# if one hot encoding was feasible, it could be doable like this
devices["continent"] = devices["location"].apply(lambda x: x.split('/')[0])
devices = pd.get_dummies(devices, columns=["continent"])
# We could combine rows based on imei and then it'd be easier to join with table based on imei
devices = devices.groupby('imei').sum()
devices.drop(columns=["latitude", "longitude", "store_name", "code", "location"]).head()

Unnamed: 0_level_0,continent_Africa,continent_America,continent_Asia,continent_Atlantic,continent_Australia,continent_Europe,continent_Indian,continent_Pacific,continent_Africa,continent_America,...,continent_Indian,continent_Pacific,continent_Africa,continent_America,continent_Asia,continent_Atlantic,continent_Australia,continent_Europe,continent_Indian,continent_Pacific
imei,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
359043379931766007,0,0,2,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
359043379931766015,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
359043379931766023,1,1,3,0,0,0,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0
359043379931766031,1,3,1,0,0,7,0,0,1,3,...,0,0,0,0,1,0,0,0,0,0
359043379931766049,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


## 2-1c: Data transformation

In [45]:
def transorm_dataframe(preprocessor: TransformerMixin, dataframe: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame(preprocessor.fit_transform(dataframe), columns=dataframe.columns, index=dataframe.index)

### Data scaling

First, let's test what each type of scaler does to the data

In [36]:
# We check the data after standard scaling
transorm_dataframe(StandardScaler(), X_train).describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,6.762962e-16,1.243234e-16,1.726714e-16,4.52399e-16,-5.042004e-16,2.923902e-16,-4.086556e-16,-4.949913e-16,1.323814e-16,-9.554483000000001e-17,...,-2.656261e-16,1.674912e-16,6.791741e-16,3.338313e-16,2.883612e-16,-7.252198000000001e-17,-9.669597000000001e-17,-1.12812e-16,2.325308e-16,-1.574187e-16
std,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,...,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041
min,-3.169838,-2.585017,-3.313403,-3.786887,-3.960686,-3.736212,-3.366769,-3.666513,-3.655651,-3.842422,...,-3.788481,-3.947397,-3.869382,-3.740315,-3.83449,-1.718797,-1.733157,-1.720994,-1.721076,-1.735094
25%,-0.742981,-0.7244385,-0.6628154,-0.7054325,-0.6539208,-0.6923263,-0.6898504,-0.6801022,-0.6847679,-0.6778703,...,-0.742168,-0.6458324,-0.7163378,-0.580312,-0.707736,-0.8581145,-0.8678329,-0.869761,-0.8730573,-0.875094
50%,-0.09404088,-0.1440667,-0.1068292,-0.09852335,0.005657922,0.06395903,0.03113026,-4.794129e-05,-0.004224147,0.01080093,...,-0.04991202,0.08932653,-0.04218215,0.09924735,-0.01768695,-0.01026861,0.007429942,0.004425234,-0.007777796,-0.0009780786
75%,0.717756,0.6565743,0.5503672,0.6661457,0.6679419,0.7187275,0.7008837,0.6791898,0.6705818,0.6831262,...,0.7349707,0.709884,0.7122013,0.6806337,0.7268036,0.8702019,0.8673749,0.8603163,0.8806997,0.8625707
max,3.570484,3.667127,3.605873,4.09165,4.01368,3.399445,3.797596,3.973676,3.844842,3.714689,...,3.972474,2.972356,3.236194,3.385552,3.812436,1.749177,1.728812,1.716978,1.726126,1.730243


In [25]:
# We check the data after robust scaling
transorm_dataframe(RobustScaler(), X_train).describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,0.064379,0.10432,0.088057,0.071832,-0.00428,-0.045327,-0.022384,3.5e-05,0.003117,-0.007936,...,0.03379,-0.065889,0.029528,-0.078709,0.012329,0.005941,-0.004282,-0.002558,0.004435,0.000563
std,0.684614,0.724136,0.824312,0.729117,0.756539,0.708719,0.719074,0.735707,0.737847,0.734786,...,0.677012,0.737647,0.700044,0.793088,0.697116,0.578621,0.576323,0.578032,0.570228,0.575508
min,-2.105647,-1.767507,-2.643109,-2.689139,-3.000572,-2.693144,-2.443242,-2.697335,-2.694085,-2.831178,...,-2.530953,-2.977558,-2.679101,-3.044986,-2.660647,-0.988551,-1.0031,-0.997308,-0.97693,-0.997958
25%,-0.444255,-0.420251,-0.458287,-0.44249,-0.498977,-0.535972,-0.518417,-0.5003,-0.502117,-0.506005,...,-0.468647,-0.542266,-0.47192,-0.538928,-0.481025,-0.490562,-0.504414,-0.505287,-0.493386,-0.503041
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.555745,0.579749,0.541713,0.55751,0.501023,0.464028,0.481583,0.4997,0.497883,0.493995,...,0.531353,0.457734,0.52808,0.461072,0.518975,0.509438,0.495586,0.494713,0.506614,0.496959
max,2.508683,2.759709,3.0603,3.055001,3.032102,2.363826,2.708257,2.923378,2.839907,2.721453,...,2.723093,2.126573,2.294915,2.606222,2.669932,1.018011,0.992032,0.989871,0.98868,0.996292


In [26]:
# We check the data after minmax scaling
transorm_dataframe(MinMaxScaler(), X_train).describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,0.47028,0.413461,0.478866,0.480659,0.496677,0.523597,0.469933,0.479898,0.487388,0.508451,...,0.488146,0.570453,0.544556,0.524893,0.501442,0.49562,0.500628,0.500584,0.499268,0.5007
std,0.148367,0.159952,0.14453,0.126932,0.125407,0.140147,0.139585,0.130892,0.13333,0.132331,...,0.128855,0.14452,0.14074,0.140339,0.130777,0.288364,0.288865,0.290881,0.290102,0.288584
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.360051,0.29759,0.383073,0.39112,0.414674,0.426574,0.373644,0.390882,0.396092,0.418751,...,0.392518,0.477122,0.443742,0.443455,0.40889,0.24818,0.249951,0.247597,0.246002,0.248172
50%,0.456328,0.390418,0.463426,0.468153,0.497387,0.532561,0.474278,0.479892,0.486825,0.50988,...,0.481715,0.583362,0.538619,0.53882,0.499129,0.492659,0.502774,0.501871,0.497011,0.500418
75%,0.576767,0.518477,0.558407,0.565211,0.580438,0.624321,0.567762,0.568795,0.576793,0.598846,...,0.582847,0.673042,0.644787,0.620409,0.596487,0.746545,0.751171,0.750824,0.75475,0.749614
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We can see that the StandardScaler and RobustScaler have a mean of 0 and roughly 0. The minimum and maximum are relatively close to each other, meaning both could be used at the same time without issues with methods like knn. MinMaxScaler put the data in the range of 0 to 1, which is useful for neural networks and algorithms that use euclidean distance, but the ranges and mean were different from the other two scalers. 

#### When to use which scaler
First, we need to assess when we need to use which scaler. We can use the following rules of thumb:
- StandardScaler: When the data is normally distributed
- MinMaxScaler: When the data is not normally distributed and the data doesn't have outliers
- RobustScaler: When the data is not normally distributed and the data has outliers  

Problems we can encounter with scaling:
- Columns not matching conditions for the scaler
- If we use different scalers on the same data, the columns won't be in comparable ranges

Solution:
- Appy Standard Scaler and Robust Scaler to columns based on whether they are normally distributed
- Apply MinMax Scaler on all the columns to make them comparable

In [27]:
# split the columns into ones with normal and non-normal distribution
normal_columns = []
non_normal_columns = []
for column in X_train.columns:
    _, p_value = stats.normaltest(X[column])
    if p_value > 0.05:
        normal_columns.append(column)
    else:
        non_normal_columns.append(column)

# We combine the mutually exclusive scalers
init_scaler = ColumnTransformer([
    ('standard_scaler', StandardScaler(), normal_columns),
    ('robust_scaler', RobustScaler(), non_normal_columns)
])

X_train_scaled = transorm_dataframe(init_scaler, X_train)
X_train_scaled = transorm_dataframe(MinMaxScaler(), X_train_scaled)
X_train_scaled.head(10)

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
9005,0.483595,0.441366,0.70209,0.636406,0.48603,0.384492,0.619022,0.310601,0.410976,0.418514,...,0.375617,0.7371,0.739181,0.626091,0.494479,0.771396,0.353118,0.072081,0.786613,0.701407
2969,0.567983,0.330349,0.406478,0.656822,0.504451,0.399848,0.308145,0.173968,0.328807,0.366263,...,0.503285,0.683208,0.434231,0.732879,0.650314,0.568595,0.680306,0.44843,0.81603,0.210317
1814,0.407768,0.653001,0.460639,0.48585,0.620401,0.442087,0.791549,0.69021,0.599075,0.617908,...,0.603602,0.338604,0.439296,0.388662,0.450632,0.369682,0.01306,0.671148,0.772845,0.314295
1329,0.494783,0.436368,0.756157,0.512127,0.595985,0.598577,0.456346,0.164639,0.230139,0.364623,...,0.476818,0.497982,0.690077,0.401531,0.394446,0.197623,0.819457,0.500501,0.238766,0.313109
11936,0.682511,0.477391,0.464776,0.616985,0.512693,0.354695,0.316714,0.431555,0.537392,0.476451,...,0.451656,0.46169,0.655311,0.58403,0.651975,0.818106,0.251199,0.115245,0.230938,0.719118
8275,0.459064,0.614562,0.572466,0.393997,0.55097,0.347811,0.527403,0.352207,0.545393,0.318834,...,0.433061,0.642685,0.731407,0.601039,0.452107,0.981662,0.304753,0.781191,0.011719,0.150772
14095,0.560736,0.483777,0.620579,0.633239,0.492865,0.366477,0.194222,0.366023,0.247459,0.376766,...,0.424913,0.51636,0.706712,0.435621,0.379597,0.455415,0.936982,0.22776,0.149069,0.653988
10345,0.54885,0.323317,0.488591,0.285537,0.626571,0.407907,0.49074,0.307615,0.27378,0.463639,...,0.61633,0.743202,0.485979,0.586254,0.652278,0.503222,0.117929,0.541092,0.386154,0.005807
4130,0.427708,0.476409,0.581966,0.505028,0.494746,0.771965,0.65175,0.237039,0.574927,0.338505,...,0.51656,0.65421,0.659599,0.556275,0.463475,0.378758,0.395199,0.486079,0.171829,0.397431
1819,0.470254,0.547077,0.505392,0.362881,0.639874,0.65476,0.265357,0.286843,0.582914,0.414703,...,0.53368,0.617495,0.460009,0.560442,0.616836,0.2307,0.556081,0.788547,0.067034,0.795465


In [234]:
# Let's create a class that could combine this
class CombinedScaler(BaseEstimator, TransformerMixin):
    def __init__(self, normality_threshold=0.05):
        self.normality_threshold = normality_threshold
        self.scaler = None
    
    def fit(self, X: np.array or pd.DataFrame, y=None):
        _, p_values = stats.normaltest(X, axis=0)

        # Use boolean indexing to identify normal and non-normal columns based on the threshold
        normal_columns = np.where(p_values > self.normality_threshold)[0]
        non_normal_columns = np.where(p_values <= self.normality_threshold)[0]
        self.scaler = ColumnTransformer([
            ('standard_scaler', StandardScaler(), normal_columns),
            ('robust_scaler', RobustScaler(), non_normal_columns)
        ])
        return self
    
    def transform(self, X, y=None):
        return MinMaxScaler().fit_transform(self.scaler.fit_transform(X))

In [165]:
X_train_scaled = transorm_dataframe(CombinedScaler(), X_train)
X_train_scaled.head(10)

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
9005,0.441366,0.70209,0.636406,0.48603,0.384492,0.619022,0.310601,0.410976,0.418514,0.483595,...,0.375617,0.7371,0.739181,0.626091,0.494479,0.771396,0.353118,0.072081,0.786613,0.701407
2969,0.330349,0.406478,0.656822,0.504451,0.399848,0.308145,0.173968,0.328807,0.366263,0.567983,...,0.503285,0.683208,0.434231,0.732879,0.650314,0.568595,0.680306,0.44843,0.81603,0.210317
1814,0.653001,0.460639,0.48585,0.620401,0.442087,0.791549,0.69021,0.599075,0.617908,0.407768,...,0.603602,0.338604,0.439296,0.388662,0.450632,0.369682,0.01306,0.671148,0.772845,0.314295
1329,0.436368,0.756157,0.512127,0.595985,0.598577,0.456346,0.164639,0.230139,0.364623,0.494783,...,0.476818,0.497982,0.690077,0.401531,0.394446,0.197623,0.819457,0.500501,0.238766,0.313109
11936,0.477391,0.464776,0.616985,0.512693,0.354695,0.316714,0.431555,0.537392,0.476451,0.682511,...,0.451656,0.46169,0.655311,0.58403,0.651975,0.818106,0.251199,0.115245,0.230938,0.719118
8275,0.614562,0.572466,0.393997,0.55097,0.347811,0.527403,0.352207,0.545393,0.318834,0.459064,...,0.433061,0.642685,0.731407,0.601039,0.452107,0.981662,0.304753,0.781191,0.011719,0.150772
14095,0.483777,0.620579,0.633239,0.492865,0.366477,0.194222,0.366023,0.247459,0.376766,0.560736,...,0.424913,0.51636,0.706712,0.435621,0.379597,0.455415,0.936982,0.22776,0.149069,0.653988
10345,0.323317,0.488591,0.285537,0.626571,0.407907,0.49074,0.307615,0.27378,0.463639,0.54885,...,0.61633,0.743202,0.485979,0.586254,0.652278,0.503222,0.117929,0.541092,0.386154,0.005807
4130,0.476409,0.581966,0.505028,0.494746,0.771965,0.65175,0.237039,0.574927,0.338505,0.427708,...,0.51656,0.65421,0.659599,0.556275,0.463475,0.378758,0.395199,0.486079,0.171829,0.397431
1819,0.547077,0.505392,0.362881,0.639874,0.65476,0.265357,0.286843,0.582914,0.414703,0.470254,...,0.53368,0.617495,0.460009,0.560442,0.616836,0.2307,0.556081,0.788547,0.067034,0.795465


As we can see, we get the same results with the class as with the manual method. Now it's easier to for example pass it into a pipeline.

### Using transformers
Since there are many columns that aren't normally distributed, we could benefit from using Power Transformer or Quantile Transformer.

In [28]:
power_transformed = transorm_dataframe(PowerTransformer(), X_train)
quantile_transformed = transorm_dataframe(QuantileTransformer(output_distribution='normal'), X_train)

In [29]:
power_transformed.describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,1.16841e-16,9.255186e-16,-9.105538e-16,6.492444e-16,-3.384359e-16,3.5685420000000005e-17,-3.079306e-16,-5.801758e-16,6.561513000000001e-17,-9.439369e-17,...,-3.539763e-16,-5.20892e-17,-4.581547e-16,2.705185e-16,-1.254745e-16,2.405888e-16,-5.467927e-18,-2.049034e-16,4.8347990000000004e-17,-8.748683000000001e-17
std,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,...,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041
min,-4.163891,-3.513875,-5.113789,-5.082017,-3.837395,-3.259458,-3.282411,-3.778996,-3.787796,-3.778426,...,-4.685564,-3.043914,-4.351662,-2.909413,-3.976129,-2.083549,-2.083021,-2.089294,-2.093688,-2.087517
25%,-0.7150467,-0.6880269,-0.6234693,-0.6811566,-0.6586892,-0.7144958,-0.6940907,-0.676607,-0.6807061,-0.6797892,...,-0.7232777,-0.6910718,-0.7048393,-0.6407408,-0.7037059,-0.788505,-0.8022435,-0.8022434,-0.8059467,-0.8106785
50%,-0.03181553,-0.05241102,-0.02470813,-0.05334653,-0.001441591,0.03429448,0.02504619,0.004858696,0.00154017,0.00816421,...,-0.01266656,0.03879722,-0.02292431,0.03621729,-0.011439,0.08593203,0.09809294,0.09977492,0.0888183,0.09020096
75%,0.7478928,0.7152999,0.6130645,0.6900006,0.6646622,0.7099323,0.6986521,0.6811286,0.6729371,0.6821273,...,0.7502262,0.6979696,0.7198393,0.6633683,0.7289575,0.8688939,0.8659023,0.8606716,0.8779721,0.8619097
max,3.049508,2.903852,2.948925,3.590079,4.109668,3.698497,3.866796,3.916931,3.782406,3.742447,...,3.565548,3.393666,3.08944,4.127903,3.744676,1.577445,1.567718,1.552584,1.558984,1.568818


In [30]:
quantile_transformed.describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,0.004589,-0.000953,0.011295,0.004485,-0.006543,0.008947,0.004298,-0.007101,0.00094,-0.001883,...,-0.005702,-0.004223,0.003014,-0.002694,0.000378,0.004734,-0.004636,-0.005196,0.008609,0.001418
std,1.003285,1.005104,1.022319,1.000865,1.005327,1.000006,1.001826,1.00254,1.00036,1.000178,...,1.004315,1.000852,1.000814,1.007129,1.001844,1.000745,1.003128,1.000419,1.005802,1.00058
min,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
25%,-0.667584,-0.685233,-0.660305,-0.669773,-0.684706,-0.664382,-0.66593,-0.683546,-0.663997,-0.680974,...,-0.684628,-0.679073,-0.669304,-0.671797,-0.67469,-0.668828,-0.675079,-0.680808,-0.67007,-0.674841
50%,0.004559,-0.004622,0.003386,-0.000478,-0.006571,0.012294,0.000304,-0.00722,0.003301,-0.000151,...,-0.00052,-0.006636,0.00028,-0.00412,0.006929,0.00021,-0.007121,-0.00643,0.014848,0.00326
75%,0.680112,0.677155,0.674801,0.680237,0.665806,0.677367,0.675761,0.670242,0.675446,0.671301,...,0.670791,0.669944,0.680658,0.66655,0.674118,0.676957,0.670545,0.667215,0.68901,0.680352
max,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,...,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338


In [41]:
for column in X_train.columns:
    print("Histogram of", column)
    print("Using power transformer")
    sns.histplot(power_transformed[column])
    plt.show()
    print("Using quantile transformer")
    sns.histplot(quantile_transformed[column])
    plt.show()

We can see power transformation normalized some of the columns, but the ones that were uniformly distributed are now skewed to the side. The ranges for columns are approximately the same. Quantile transformation made the data normally distributed, also making the data in the columns be in the same range which is useful for some algorithms.

As these aren't all that co-usable, we use them separately without combining them.

# Phase 2-2: Feature Selection
We need to figure out which features help best with predicting the target variable. We can use the following methods.

In [114]:
def get_features_dataframe(selector, dataframe: pd.DataFrame, target: pd.Series) -> pd.DataFrame:
    selector.fit_transform(dataframe, target)
    return dataframe.loc[:, selector.get_support()]

In [105]:
def column_importance_ordered(selector, dataframe: pd.DataFrame = None, target: pd.Series = None, fitted = False) -> pd.Series:
    if not fitted:
        selector.fit(dataframe, target)
    return pd.Series(X_train.columns[(-selector.scores_).argsort()])

## Filter Methods

### Variance Threshold

In [31]:
variance_selector = VarianceThreshold(threshold=500)
high_variance_columns = variance_selector.fit_transform(X_train)
high_variance_columns

array([[94.35106,  8.89805, 63.99106, ...,  7.20809, 78.66131, 70.14069],
       [78.87992, 49.9576 , 98.09531, ..., 44.84301, 81.60302, 21.03168],
       [80.06156,  4.65163, 95.10592, ..., 67.11484, 77.28452, 31.42955],
       ...,
       [19.83578, 31.40051, 10.15434, ..., 85.65207, 10.54911,  2.67848],
       [66.27537, 27.31082, 85.45787, ..., 18.65774, 32.00129, 34.18675],
       [22.98821, 77.07455, 16.27829, ..., 14.29205, 32.80857, 82.23808]])

It's important to keep in mind whether we already scaled the data. The threshold to get intended amount of columns vastly differs based on that.

In [35]:
variance_selector_scaled = VarianceThreshold(threshold=0.05)
high_variance_columns_scaled = variance_selector_scaled.fit_transform(X_train_scaled)
high_variance_columns_scaled

array([[0.9435046 , 0.08895335, 0.63982556, ..., 0.0720809 , 0.7866131 ,
        0.7014069 ],
       [0.78877677, 0.49956109, 0.9809486 , ..., 0.4484301 , 0.8160302 ,
        0.2103168 ],
       [0.80059442, 0.04648789, 0.95104764, ..., 0.6711484 , 0.7728452 ,
        0.3142955 ],
       ...,
       [0.19827266, 0.31398466, 0.10133122, ..., 0.8565207 , 0.1054911 ,
        0.0267848 ],
       [0.66271788, 0.27308654, 0.85454436, ..., 0.1865774 , 0.3200129 ,
        0.3418675 ],
       [0.2298003 , 0.77073867, 0.16258519, ..., 0.1429205 , 0.3280857 ,
        0.8223808 ]])

We can see we had to lower the threshold 10000 to get the same amount of columns.

### Mutual Information
We use mutual_info_classif as we are dealing with classification problem, not regression one

In [136]:
mutual_info_selector = SelectKBest(mutual_info_classif, k=9)
get_features_dataframe(mutual_info_selector, X_train, y_train)

Unnamed: 0,p.android.documentsui,p.android.gm,p.system,p.android.settings,p.android.externalstorage,c.dogalize,c.android.gm,c.katana,c.android.chrome
9005,8.55273,8.310800,8.76234,10.28676,8.55662,14.38637,13.52111,13.46906,9.85361
2969,6.51885,6.955010,7.67476,13.65197,11.66651,13.34282,7.95219,15.24416,12.95897
1814,14.20350,11.414430,12.91264,10.30770,12.79993,6.67005,8.04468,9.52237,8.97985
1329,6.37998,5.326990,7.64062,10.86644,5.00109,9.75619,12.62439,9.73628,7.86023
11936,10.35323,10.396660,9.96827,11.58500,13.17353,9.05345,11.98950,12.76989,12.99207
...,...,...,...,...,...,...,...,...,...
9818,8.86998,5.610960,12.31177,8.99678,12.83352,7.06201,9.58491,4.86440,12.11389
10859,10.98764,13.811110,8.32750,13.73589,11.37172,10.25129,11.75157,7.56727,7.57071
4041,6.42616,11.202170,10.87475,6.44743,8.06089,15.91097,12.86750,11.38725,6.45651
7371,8.46329,8.981700,8.00942,12.78849,13.54101,13.03149,7.44688,10.25082,12.89054


In [139]:
mi_fs = column_importance_ordered(mutual_info_selector, fitted=True)
mi_fs

0          p.android.documentsui
1                       p.system
2                     c.dogalize
3                       c.katana
4                   p.android.gm
5                   c.android.gm
6      p.android.externalstorage
7             p.android.settings
8               c.android.chrome
9               p.gms.persistent
10                 p.android.gms
11              p.android.chrome
12               p.process.gapps
13             c.android.vending
14                   p.olauncher
15                c.updateassist
16            p.browser.provider
17                    p.dogalize
18             p.android.vending
19                    p.notifier
20                      c.raider
21    p.android.packageinstaller
22             c.android.youtube
23           p.inputmethod.latin
24        p.android.defcontainer
25                      p.google
26               c.UCMobile.intl
27                c.UCMobile.x86
28                   p.simulator
dtype: object

### F-value

In [81]:
f_value_selector = SelectKBest(f_classif, k=8)
get_features_dataframe(f_value_selector, X_train, y_train)

Unnamed: 0,p.android.documentsui,p.android.gm,p.system,p.android.settings,c.dogalize,c.android.gm,c.katana,c.android.chrome
9005,8.55273,8.310800,8.76234,10.28676,14.38637,13.52111,13.46906,9.85361
2969,6.51885,6.955010,7.67476,13.65197,13.34282,7.95219,15.24416,12.95897
1814,14.20350,11.414430,12.91264,10.30770,6.67005,8.04468,9.52237,8.97985
1329,6.37998,5.326990,7.64062,10.86644,9.75619,12.62439,9.73628,7.86023
11936,10.35323,10.396660,9.96827,11.58500,9.05345,11.98950,12.76989,12.99207
...,...,...,...,...,...,...,...,...
9818,8.86998,5.610960,12.31177,8.99678,7.06201,9.58491,4.86440,12.11389
10859,10.98764,13.811110,8.32750,13.73589,10.25129,11.75157,7.56727,7.57071
4041,6.42616,11.202170,10.87475,6.44743,15.91097,12.86750,11.38725,6.45651
7371,8.46329,8.981700,8.00942,12.78849,13.03149,7.44688,10.25082,12.89054


In [140]:
f_fs = column_importance_ordered(f_value_selector, fitted=True)
f_fs

0          p.android.documentsui
1                       p.system
2                     c.dogalize
3                       c.katana
4                   p.android.gm
5             p.android.settings
6                   c.android.gm
7               c.android.chrome
8      p.android.externalstorage
9              p.android.vending
10                   p.simulator
11                 p.android.gms
12                      c.raider
13            p.browser.provider
14             c.android.vending
15           p.inputmethod.latin
16                   p.olauncher
17                      p.google
18        p.android.defcontainer
19             c.android.youtube
20               c.UCMobile.intl
21                c.updateassist
22               p.process.gapps
23              p.android.chrome
24                    p.dogalize
25              p.gms.persistent
26    p.android.packageinstaller
27                    p.notifier
28                c.UCMobile.x86
dtype: object

## Wrapper Methods

### Recursive Feature Elimination

In [73]:
# Let's try this method for the logistic regression
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=8, step=1)
# max_iter sets the maximum number of attempts the algorithm makes to find the best solution
# it threw warnings with max_iter default or too low, so we set it to 1000
get_features_dataframe(rfe_selector, X_train, y_train)

Unnamed: 0,p.android.documentsui,p.android.gm,p.system,p.android.settings,p.android.externalstorage,c.dogalize,c.katana,c.android.chrome
9005,8.55273,8.310800,8.76234,10.28676,8.55662,14.38637,13.46906,9.85361
2969,6.51885,6.955010,7.67476,13.65197,11.66651,13.34282,15.24416,12.95897
1814,14.20350,11.414430,12.91264,10.30770,12.79993,6.67005,9.52237,8.97985
1329,6.37998,5.326990,7.64062,10.86644,5.00109,9.75619,9.73628,7.86023
11936,10.35323,10.396660,9.96827,11.58500,13.17353,9.05345,12.76989,12.99207
...,...,...,...,...,...,...,...,...
9818,8.86998,5.610960,12.31177,8.99678,12.83352,7.06201,4.86440,12.11389
10859,10.98764,13.811110,8.32750,13.73589,11.37172,10.25129,7.56727,7.57071
4041,6.42616,11.202170,10.87475,6.44743,8.06089,15.91097,11.38725,6.45651
7371,8.46329,8.981700,8.00942,12.78849,13.54101,13.03149,10.25082,12.89054


Ranking the columns based on their importance

In [142]:
# We set n features to select to one, so no columns share the first place
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=1, step=1)
rfe_selector.fit(X_train, y_train)
rfe_fs = pd.Series(X_train.columns[rfe_selector.ranking_.argsort()])
rfe_fs

0          p.android.documentsui
1                       p.system
2                     c.dogalize
3               c.android.chrome
4                       c.katana
5                   p.android.gm
6             p.android.settings
7      p.android.externalstorage
8              c.android.youtube
9               p.android.chrome
10    p.android.packageinstaller
11                  c.android.gm
12                      p.google
13                   p.simulator
14                 p.android.gms
15                    p.dogalize
16             p.android.vending
17             c.android.vending
18              p.gms.persistent
19           p.inputmethod.latin
20                   p.olauncher
21                c.updateassist
22               c.UCMobile.intl
23                    p.notifier
24        p.android.defcontainer
25            p.browser.provider
26               p.process.gapps
27                c.UCMobile.x86
28                      c.raider
dtype: object

## Embedded Methods

### Lasso/L1

First we get optimal alpha using cross validation

In [130]:
lasso_cv = LassoCV(cv=5) # cv is the number of folds, the higher the number, the more accurate the alpha is but the slower the process 

# Fit LassoCV to the data
lasso_cv.fit(X_train, y_train)

# Get the optimal alpha chosen by LassoCV
print(f"Optimal Alpha: {lasso_cv.alpha_}")

Optimal Alpha: 0.001161127881405129


In [133]:
lasso_selector = Lasso(alpha=0.001161127881405129) # alpha is the regularization strength, the higher the value, the more features are removed
lasso_selector.fit(X_train, y_train)

selected_features_lasso = pd.Series(X_train.columns[lasso_selector.coef_ != 0])
selected_features_lasso


0         p.android.documentsui
1                  p.android.gm
2                      p.system
3            p.android.settings
4     p.android.externalstorage
5             p.android.vending
6                   p.olauncher
7              p.gms.persistent
8                    c.dogalize
9                      c.katana
10             c.android.chrome
dtype: object

In [147]:
lasso_fs = pd.Series(X_train.columns[(-abs(lasso_selector.coef_)).argsort()])

### Ridge/L2

In [131]:
ridge_cv = RidgeCV(cv=5)
ridge_cv.fit(X_train, y_train)
print(f"Optimal Alpha: {ridge_cv.alpha_}")

Optimal Alpha: 10.0


In [144]:
ridge = Ridge(alpha=10)
ridge.fit(X_train, y_train)

ridge_fs = pd.Series(X_train.columns[(-abs(ridge.coef_)).argsort()])
ridge_fs

0          p.android.documentsui
1                       p.system
2               c.android.chrome
3                     c.dogalize
4                       c.katana
5                   p.android.gm
6             p.android.settings
7               p.android.chrome
8              c.android.youtube
9      p.android.externalstorage
10    p.android.packageinstaller
11                  c.android.gm
12                      p.google
13                   p.simulator
14                 p.android.gms
15                    p.dogalize
16             p.android.vending
17              p.gms.persistent
18                   p.olauncher
19             c.android.vending
20           p.inputmethod.latin
21               p.process.gapps
22                      c.raider
23                c.updateassist
24                    p.notifier
25               c.UCMobile.intl
26                c.UCMobile.x86
27            p.browser.provider
28        p.android.defcontainer
dtype: object

### Optimization attempt

We can create a class that automatically finds the optimal alpha and fits the model with it. It's computationally expensive, so it's not always the best choice, but can be usefull when we want the best possible results.

In [178]:
class LassoOptimal(Lasso):
    def __init__(self, cv=10, **kwargs):
        # Initialize with cv and pass any additional parameters to the Lasso superclass
        self.cv = cv
        super().__init__(**kwargs)
    
    def fit(self, X, y, **kwargs):
        # Run LassoCV to find the best alpha
        lasso_cv = LassoCV(cv=self.cv).fit(X, y)
        # Set the optimal alpha for Lasso
        self.alpha = lasso_cv.alpha_
        # Now fit Lasso with the optimal alpha
        return super().fit(X, y)

class RidgeOptimal(Ridge):
    def __init__(self, cv=10, **kwargs):
        self.cv = cv
        super().__init__(**kwargs)
    
    def fit(self, X, y, **kwargs):
        ridge_cv = RidgeCV(cv=self.cv).fit(X, y)
        self.alpha = ridge_cv.alpha_
        return super().fit(X, y)

## Comparison of the results
First, we combine the columns ordered by influence on mwra from each method.

In [152]:
comparison_df = pd.DataFrame({
    "Mutual Information": mi_fs,
    "F-value": f_fs,
    "RFE": rfe_fs,
    "Ridge": ridge_fs,
    "Lasso": lasso_fs,
})
comparison_df

Unnamed: 0,Mutual Information,F-value,RFE,Ridge,Lasso
0,p.android.documentsui,p.android.documentsui,p.android.documentsui,p.android.documentsui,p.android.documentsui
1,p.system,p.system,p.system,p.system,p.system
2,c.dogalize,c.dogalize,c.dogalize,c.android.chrome,c.dogalize
3,c.katana,c.katana,c.android.chrome,c.dogalize,c.android.chrome
4,p.android.gm,p.android.gm,c.katana,c.katana,c.katana
5,c.android.gm,p.android.settings,p.android.gm,p.android.gm,p.android.settings
6,p.android.externalstorage,c.android.gm,p.android.settings,p.android.settings,p.android.externalstorage
7,p.android.settings,c.android.chrome,p.android.externalstorage,p.android.chrome,p.android.gm
8,c.android.chrome,p.android.externalstorage,c.android.youtube,c.android.youtube,p.android.vending
9,p.gms.persistent,p.android.vending,p.android.chrome,p.android.externalstorage,p.gms.persistent


We can see that all methods placed p.android.documentsui on 1st place and p.system on 2nd. All of them has c.katana in top 5 and other than lasso, all had c.dogalize in top 4.
Expectably, Mutual Information and F-value methods had the most similar results, as they are both filter methods.

We may proceed with one of the method. Other option is to combine them and get the best features on average.

Following cumulative method is just a proof of concept. It uses dataframes with strings and it does 5 different selections, so it's too computationally expensive to be used in practice.

In [239]:
class CombinedFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, top_n=10):
        self.top_n = top_n
        self.selected_features_ = None
    
    def fit(self, X, y):
        X_df = pd.DataFrame(X, columns=X.columns if isinstance(X, pd.DataFrame) else [f"feature_{i}" for i in range(X.shape[1])])
        ranking_df = pd.DataFrame(index=X_df.columns)

        # Mutual Information
        mi_selector = mutual_info_classif(X_df, y)
        ranking_df['Mutual Information'] = pd.Series(mi_selector, index=X_df.columns).rank(ascending=False)

        # F-value
        f_selector, _ = f_classif(X_df, y)
        ranking_df['F-value'] = pd.Series(f_selector, index=X_df.columns).rank(ascending=False)

        # RFE with Logistic Regression
        rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=self.top_n, step=1).fit(X_df, y)
        ranking_df['RFE'] = pd.Series(rfe_selector.ranking_, index=X_df.columns)

        # Lasso
        lasso = Lasso(alpha=0.01).fit(StandardScaler().fit_transform(X_df), y)
        ranking_df['Lasso'] = pd.Series(abs(lasso.coef_), index=X_df.columns).rank(ascending=False, method='min')

        # Ridge
        ridge = Ridge(alpha=0.01).fit(StandardScaler().fit_transform(X_df), y)
        ranking_df['Ridge'] = pd.Series(abs(ridge.coef_), index=X_df.columns).rank(ascending=False, method='min')

        # Sum the ranks and select top features
        ranking_df['Cumulative Rank'] = ranking_df.sum(axis=1)
        self.selected_features_ = ranking_df.nsmallest(self.top_n, 'Cumulative Rank').index.tolist()
        return self
    
    def transform(self, X, y=None):
        return X[self.selected_features_]

In [162]:
combined_selector = CombinedFeatureSelector()
combined_selector.fit(X_train, y_train)
combined_selector.selected_features_

['p.android.documentsui',
 'p.system',
 'c.dogalize',
 'c.katana',
 'p.android.gm',
 'c.android.chrome',
 'p.android.settings',
 'p.android.externalstorage',
 'c.android.gm',
 'p.android.gms']

# Phase 2-3: Replicability
Without testing, we can't be sure which methods we will be using, so for now, we will use only some combinations as an example.

We will use the following methods: custom combined selector for feature selection followed by combined scaling.

In [284]:
transformation_pipeline_combined = Pipeline([
    ('feature_selector', CombinedFeatureSelector()),
    ('scaler', CombinedScaler()),
])

In [281]:
transformation_pipeline_combined.fit_transform(X_test, y_test)

array([[0.56534214, 0.54347437, 0.48409891, ..., 0.59242516, 0.59546645,
        0.34410255],
       [0.64627502, 0.52517318, 0.3909906 , ..., 0.58509064, 0.23329129,
        0.39373152],
       [0.3434799 , 0.46771404, 0.46958065, ..., 0.64823586, 0.33797153,
        0.63061602],
       ...,
       [0.37559722, 0.32357994, 0.77349774, ..., 0.82917859, 0.42250245,
        0.39769173],
       [0.68968427, 0.46901226, 0.51555136, ..., 0.49064094, 0.43184151,
        0.67483308],
       [0.33844043, 0.27086655, 0.62877029, ..., 0.27312784, 0.60318161,
        0.44910462]])

We can show other combination, just to use only unmodified classes.

In [279]:
transformation_pipeline = Pipeline([
    ('feature_selector', SelectKBest(f_classif, k=8)),
    ('scaler', MinMaxScaler()),
])

In [282]:
transformation_pipeline.fit_transform(X_train, y_train)

array([[0.31060087, 0.41097593, 0.41851402, ..., 0.73918108, 0.62609114,
        0.49447917],
       [0.17396828, 0.32880692, 0.366263  , ..., 0.43423111, 0.73287927,
        0.65031402],
       [0.69020997, 0.59907452, 0.61790844, ..., 0.4392958 , 0.38866246,
        0.45063167],
       ...,
       [0.16774152, 0.58621029, 0.52000133, ..., 0.70338987, 0.50085166,
        0.32400406],
       [0.30459244, 0.45163649, 0.3823412 , ..., 0.40656071, 0.43248524,
        0.64688003],
       [0.36905606, 1.        , 0.61281393, ..., 0.50546689, 0.6654766 ,
        0.4324962 ]])

We can test the accuracy of the initially made pipeline as a baseline, to track our progress.

In [285]:
from sklearn.metrics import accuracy_score

# Get predictions
prediction_pipeline_combined = Pipeline([
    ('transformation', transformation_pipeline_combined),
    ('classifier', LogisticRegression())
])

prediction_pipeline_combined.fit(X_train, y_train)
y_pred = prediction_pipeline_combined.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 91.38%
