In [35]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats as stats
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.feature_selection import SelectKBest, VarianceThreshold, SelectKBest, f_classif, mutual_info_classif, RFE

# Phase 2: Data preprocessing

## First we redo the data changes from the 1st phase

In [2]:
# All four tables share one CSV dialect: tab-separated, and only an empty field
# is read as missing (keep_default_na=False switches off pandas' built-in NaN
# markers, na_values='' re-enables just the empty string).
csv_options = dict(sep='\t', keep_default_na=False, na_values='')
connections = pd.read_csv('data/connections.csv', **csv_options)
devices = pd.read_csv('data/devices.csv', **csv_options)
processes = pd.read_csv('data/processes.csv', **csv_options)
profiles = pd.read_csv('data/profiles.csv', **csv_options)

Iterative way to redo the changes:

In [3]:
def get_outliers(column: pd.Series):
    """Return the entries of ``column`` that lie outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``column``. The returned Series keeps the
    original index labels of the outlying entries.
    """
    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    spread = q3 - q1
    below_fence = column < q1 - 1.5 * spread
    above_fence = column > q3 + 1.5 * spread
    return column[below_fence | above_fence]


In [4]:
def iterative_reformat(
    processes_ptr: pd.DataFrame,
    connections_ptr: pd.DataFrame,
    *,
    target: str = 'mwra',
    max_bad_fraction: float = 0.05,
    clip_iqr_factor: float = 2.5,
) -> pd.DataFrame:
    """Merge the processes and connections tables and clean the feature columns.

    The two tables are inner-joined on ``ts``, ``imei`` and ``target``; ``ts`` and
    ``imei`` are then removed. Every remaining feature column is handled as follows:

    * it is dropped when more than ``max_bad_fraction`` of its values are NaN or
      more than ``max_bad_fraction`` are outliers according to ``get_outliers``;
    * otherwise NaNs are replaced by the average of the per-``target``-class means,
      so the imputed value does not favour either class;
    * if the column had any outliers, its values are clipped to
      ``[Q1 - clip_iqr_factor * IQR, Q3 + clip_iqr_factor * IQR]``. This is wider than
      the detection fences on purpose, so only the most extreme values are moved.

    The ``target`` column itself is never dropped, imputed or clipped: running a
    binary label through the IQR logic can discard it or collapse it to one class.

    Args:
        processes_ptr: processes table with ``ts``, ``imei`` and ``target`` columns.
        connections_ptr: connections table with ``ts``, ``imei`` and ``target`` columns.
        target: name of the label column (also used as a join key).
        max_bad_fraction: largest tolerated share of NaNs / outliers per column.
        clip_iqr_factor: IQR multiplier defining the clipping limits.

    Returns:
        A new DataFrame holding ``target`` and the cleaned feature columns. The
        input frames are not modified.
    """
    # Parse the timestamps on copies so the caller's frames stay untouched.
    processes_df = processes_ptr.assign(ts=pd.to_datetime(processes_ptr['ts']))
    connections_df = connections_ptr.assign(ts=pd.to_datetime(connections_ptr['ts']))
    merged = (
        processes_df
        .merge(connections_df, on=['ts', 'imei', target], how='inner')
        .drop(columns=['ts', 'imei'])
    )

    n_rows = merged.shape[0]
    if n_rows == 0:
        # Nothing matched in the join; the fractions below would divide by zero.
        return merged

    to_drop = []
    for column in merged.columns:
        if column == target:
            continue
        # Outliers are detected before imputation, so filled-in values cannot
        # shift the quartiles used for detection.
        column_outliers = get_outliers(merged[column])
        nan_fraction = merged[column].isna().sum() / n_rows
        outlier_fraction = column_outliers.shape[0] / n_rows
        if nan_fraction > max_bad_fraction or outlier_fraction > max_bad_fraction:
            to_drop.append(column)
            continue
        if merged[column].isna().any():
            # Mean of the per-class means: neutral with respect to the target.
            imputed_value = merged.groupby(target)[column].mean().mean()
            # Assign the result back; an in-place fillna on merged[column] acts on
            # a temporary object and may leave `merged` unchanged.
            merged[column] = merged[column].fillna(imputed_value)
        if column_outliers.shape[0]:
            iqr = stats.iqr(merged[column])
            lower_limit = merged[column].quantile(0.25) - clip_iqr_factor * iqr
            upper_limit = merged[column].quantile(0.75) + clip_iqr_factor * iqr
            merged[column] = merged[column].clip(lower=lower_limit, upper=upper_limit)
    return merged.drop(columns=to_drop)

In [42]:
def transorm_dataframe(preprocessor: TransformerMixin, dataframe: pd.DataFrame) -> pd.DataFrame:
    """Fit ``preprocessor`` on ``dataframe`` and return the result as a DataFrame.

    The transformed values are labelled with the column names and the index of
    ``dataframe``, so ``preprocessor`` has to produce one output column per input
    column, in the same order as the input.
    """
    transformed_values = preprocessor.fit_transform(dataframe)
    return pd.DataFrame(
        transformed_values,
        index=dataframe.index,
        columns=dataframe.columns,
    )

# Phase 2-1: Data transformation

## 2-1a & 2-1b
Splitting the data into training and testing sets + transforming data for ML

First we create a combined table for data to work with. As we learnt in the previous phase, we will use only connections and processes tables. Devices and profiles couldn't be connected logically with the other two tables. That's because there were multiple profiles/devices per imei. And it wasn't a fixed amount of profiles/devices per imei either, so we can't just make a column for all locations/usernames/etc. Even if we did that, there wasn't a correlation found between any of the columns in these tables and mwra.

In [6]:
# Merge processes with connections and clean the result (NaNs, outliers).
combined_table = iterative_reformat(
    processes_ptr=processes,
    connections_ptr=connections,
)

Now on to splitting the data into training and testing sets.

In [7]:
# Separate the target (mwra) from the features, then hold out 20% of the rows for testing.
y = combined_table['mwra']
X = combined_table.drop(columns='mwra')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

As we didn't use any non-numerical data, we don't need to do any conversions to numerical. Most of the data had huge cardinality either way which would increase likelihood of overfitting and difficulty of encoding. 

Example for what we would do if we were to use the categorical data from profiles and devices table

In [7]:
# If the cardinality is too high for one-hot encoding, we can hash the values into numbers.
# pd.util.hash_pandas_object is deterministic, whereas Python's built-in hash() is salted
# per interpreter session for strings, so its codes would change on every kernel restart.
mail_encoded = pd.util.hash_pandas_object(profiles['mail'], index=False)
# Distinct values before and after hashing. Stored in a variable because a bare
# expression in the middle of a cell is silently discarded.
mail_cardinality = (profiles['mail'].nunique(), mail_encoded.nunique())
# If one-hot encoding is feasible, we first keep only the part of the location before
# the first '/' (the continent). The .str accessor is vectorised and passes missing
# locations through as NaN instead of raising on a non-string value.
continents = devices["location"].str.split('/').str[0]
continents.head()

0      America
1    Australia
2       Europe
3       Europe
4      America
Name: location, dtype: object

## 2-1c: Data transformation

### Data scaling

First, let's test what each type of scaler does to the data

In [18]:
# Fit each candidate scaler on the training features so their outputs can be compared.
X_train_std_scaled, X_train_robust_scaled, X_train_minmax_scaled = (
    transorm_dataframe(scaler, X_train)
    for scaler in (StandardScaler(), RobustScaler(), MinMaxScaler())
)

In [19]:
# Summary statistics of the training features after StandardScaler.
X_train_std_scaled.describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,6.762962e-16,1.243234e-16,1.726714e-16,4.52399e-16,-5.042004e-16,2.923902e-16,-4.086556e-16,-4.949913e-16,1.323814e-16,-9.554483000000001e-17,...,-2.656261e-16,1.674912e-16,6.791741e-16,3.338313e-16,2.883612e-16,-7.252198000000001e-17,-9.669597000000001e-17,-1.12812e-16,2.325308e-16,-1.574187e-16
std,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,...,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041
min,-3.169838,-2.585017,-3.313403,-3.786887,-3.960686,-3.736212,-3.366769,-3.666513,-3.655651,-3.842422,...,-3.788481,-3.947397,-3.869382,-3.740315,-3.83449,-1.718797,-1.733157,-1.720994,-1.721076,-1.735094
25%,-0.742981,-0.7244385,-0.6628154,-0.7054325,-0.6539208,-0.6923263,-0.6898504,-0.6801022,-0.6847679,-0.6778703,...,-0.742168,-0.6458324,-0.7163378,-0.580312,-0.707736,-0.8581145,-0.8678329,-0.869761,-0.8730573,-0.875094
50%,-0.09404088,-0.1440667,-0.1068292,-0.09852335,0.005657922,0.06395903,0.03113026,-4.794129e-05,-0.004224147,0.01080093,...,-0.04991202,0.08932653,-0.04218215,0.09924735,-0.01768695,-0.01026861,0.007429942,0.004425234,-0.007777796,-0.0009780786
75%,0.717756,0.6565743,0.5503672,0.6661457,0.6679419,0.7187275,0.7008837,0.6791898,0.6705818,0.6831262,...,0.7349707,0.709884,0.7122013,0.6806337,0.7268036,0.8702019,0.8673749,0.8603163,0.8806997,0.8625707
max,3.570484,3.667127,3.605873,4.09165,4.01368,3.399445,3.797596,3.973676,3.844842,3.714689,...,3.972474,2.972356,3.236194,3.385552,3.812436,1.749177,1.728812,1.716978,1.726126,1.730243


In [21]:
# Summary statistics of the training features after RobustScaler.
X_train_robust_scaled.describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,0.064379,0.10432,0.088057,0.071832,-0.00428,-0.045327,-0.022384,3.5e-05,0.003117,-0.007936,...,0.03379,-0.065889,0.029528,-0.078709,0.012329,0.005941,-0.004282,-0.002558,0.004435,0.000563
std,0.684614,0.724136,0.824312,0.729117,0.756539,0.708719,0.719074,0.735707,0.737847,0.734786,...,0.677012,0.737647,0.700044,0.793088,0.697116,0.578621,0.576323,0.578032,0.570228,0.575508
min,-2.105647,-1.767507,-2.643109,-2.689139,-3.000572,-2.693144,-2.443242,-2.697335,-2.694085,-2.831178,...,-2.530953,-2.977558,-2.679101,-3.044986,-2.660647,-0.988551,-1.0031,-0.997308,-0.97693,-0.997958
25%,-0.444255,-0.420251,-0.458287,-0.44249,-0.498977,-0.535972,-0.518417,-0.5003,-0.502117,-0.506005,...,-0.468647,-0.542266,-0.47192,-0.538928,-0.481025,-0.490562,-0.504414,-0.505287,-0.493386,-0.503041
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.555745,0.579749,0.541713,0.55751,0.501023,0.464028,0.481583,0.4997,0.497883,0.493995,...,0.531353,0.457734,0.52808,0.461072,0.518975,0.509438,0.495586,0.494713,0.506614,0.496959
max,2.508683,2.759709,3.0603,3.055001,3.032102,2.363826,2.708257,2.923378,2.839907,2.721453,...,2.723093,2.126573,2.294915,2.606222,2.669932,1.018011,0.992032,0.989871,0.98868,0.996292


In [20]:
# Summary statistics of the training features after MinMaxScaler.
X_train_minmax_scaled.describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,0.47028,0.413461,0.478866,0.480659,0.496677,0.523597,0.469933,0.479898,0.487388,0.508451,...,0.488146,0.570453,0.544556,0.524893,0.501442,0.49562,0.500628,0.500584,0.499268,0.5007
std,0.148367,0.159952,0.14453,0.126932,0.125407,0.140147,0.139585,0.130892,0.13333,0.132331,...,0.128855,0.14452,0.14074,0.140339,0.130777,0.288364,0.288865,0.290881,0.290102,0.288584
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.360051,0.29759,0.383073,0.39112,0.414674,0.426574,0.373644,0.390882,0.396092,0.418751,...,0.392518,0.477122,0.443742,0.443455,0.40889,0.24818,0.249951,0.247597,0.246002,0.248172
50%,0.456328,0.390418,0.463426,0.468153,0.497387,0.532561,0.474278,0.479892,0.486825,0.50988,...,0.481715,0.583362,0.538619,0.53882,0.499129,0.492659,0.502774,0.501871,0.497011,0.500418
75%,0.576767,0.518477,0.558407,0.565211,0.580438,0.624321,0.567762,0.568795,0.576793,0.598846,...,0.582847,0.673042,0.644787,0.620409,0.596487,0.746545,0.751171,0.750824,0.75475,0.749614
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We can see that StandardScaler gives every column a mean of 0, while RobustScaler gives a mean of only roughly 0 (it centres on the median, so it is the median that is exactly 0). The minimum and maximum values produced by these two scalers are relatively close to each other, meaning both could be used at the same time without issues with methods like knn. MinMaxScaler put the data in the range of 0 to 1, which is useful for neural networks and algorithms that use euclidean distance, but the ranges and mean were different from the other two scalers. 

#### When to use which scaler
First, we need to assess when we need to use which scaler. We can use the following rules of thumb:
- StandardScaler: When the data is normally distributed
- MinMaxScaler: When the data is not normally distributed and the data doesn't have outliers
- RobustScaler: When the data is not normally distributed and the data has outliers  

Problems we can encounter with scaling:
- Columns not matching conditions for the scaler
- If we use different scalers on the same data, the columns won't be in comparable ranges

Solution:
- Apply Standard Scaler and Robust Scaler to columns based on whether they are normally distributed
- Apply MinMax Scaler on all the columns to make them comparable

In [22]:
# Split the columns into ones with normal and non-normal distribution.
# The normality test uses the training split only: choosing the preprocessing from the
# full X would leak information about the test rows into the pipeline.
normal_columns = []
non_normal_columns = []
for column in X_train.columns:
    _, p_value = stats.normaltest(X_train[column])
    if p_value > 0.05:
        normal_columns.append(column)
    else:
        non_normal_columns.append(column)

# We combine the mutually exclusive scalers
init_scaler = ColumnTransformer([
    ('standard_scaler', StandardScaler(), normal_columns),
    ('robust_scaler', RobustScaler(), non_normal_columns)
])

# ColumnTransformer returns its output grouped by transformer (all normal columns first,
# then all non-normal ones), while transorm_dataframe labels the result with the column
# order of its input. Passing the frame in that same grouped order keeps values and
# labels aligned; the original column order is restored at the end.
grouped_columns = normal_columns + non_normal_columns
X_train_scaled = transorm_dataframe(init_scaler, X_train[grouped_columns])
X_train_scaled = transorm_dataframe(MinMaxScaler(), X_train_scaled)[X_train.columns]
X_train_scaled.head(10)

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
9005,0.483595,0.441366,0.70209,0.636406,0.48603,0.384492,0.619022,0.310601,0.410976,0.418514,...,0.375617,0.7371,0.739181,0.626091,0.494479,0.771396,0.353118,0.072081,0.786613,0.701407
2969,0.567983,0.330349,0.406478,0.656822,0.504451,0.399848,0.308145,0.173968,0.328807,0.366263,...,0.503285,0.683208,0.434231,0.732879,0.650314,0.568595,0.680306,0.44843,0.81603,0.210317
1814,0.407768,0.653001,0.460639,0.48585,0.620401,0.442087,0.791549,0.69021,0.599075,0.617908,...,0.603602,0.338604,0.439296,0.388662,0.450632,0.369682,0.01306,0.671148,0.772845,0.314295
1329,0.494783,0.436368,0.756157,0.512127,0.595985,0.598577,0.456346,0.164639,0.230139,0.364623,...,0.476818,0.497982,0.690077,0.401531,0.394446,0.197623,0.819457,0.500501,0.238766,0.313109
11936,0.682511,0.477391,0.464776,0.616985,0.512693,0.354695,0.316714,0.431555,0.537392,0.476451,...,0.451656,0.46169,0.655311,0.58403,0.651975,0.818106,0.251199,0.115245,0.230938,0.719118
8275,0.459064,0.614562,0.572466,0.393997,0.55097,0.347811,0.527403,0.352207,0.545393,0.318834,...,0.433061,0.642685,0.731407,0.601039,0.452107,0.981662,0.304753,0.781191,0.011719,0.150772
14095,0.560736,0.483777,0.620579,0.633239,0.492865,0.366477,0.194222,0.366023,0.247459,0.376766,...,0.424913,0.51636,0.706712,0.435621,0.379597,0.455415,0.936982,0.22776,0.149069,0.653988
10345,0.54885,0.323317,0.488591,0.285537,0.626571,0.407907,0.49074,0.307615,0.27378,0.463639,...,0.61633,0.743202,0.485979,0.586254,0.652278,0.503222,0.117929,0.541092,0.386154,0.005807
4130,0.427708,0.476409,0.581966,0.505028,0.494746,0.771965,0.65175,0.237039,0.574927,0.338505,...,0.51656,0.65421,0.659599,0.556275,0.463475,0.378758,0.395199,0.486079,0.171829,0.397431
1819,0.470254,0.547077,0.505392,0.362881,0.639874,0.65476,0.265357,0.286843,0.582914,0.414703,...,0.53368,0.617495,0.460009,0.560442,0.616836,0.2307,0.556081,0.788547,0.067034,0.795465


### Using transformers
Since there are many columns that aren't normally distributed, we could benefit from using Power Transformer or Quantile Transformer.

In [29]:
power_transformed = transorm_dataframe(PowerTransformer(), X_train)
# QuantileTransformer estimates its quantiles from a random subsample when the data has
# more rows than its `subsample` limit, so the seed is fixed to keep the notebook
# reproducible (same seed as the train/test split).
quantile_transformed = transorm_dataframe(
    QuantileTransformer(output_distribution='normal', random_state=69),
    X_train,
)

In [31]:
# Summary statistics of the training features after PowerTransformer.
power_transformed.describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,1.16841e-16,9.255186e-16,-9.105538e-16,6.492444e-16,-3.384359e-16,3.5685420000000005e-17,-3.079306e-16,-5.801758e-16,6.561513000000001e-17,-9.439369e-17,...,-3.539763e-16,-5.20892e-17,-4.581547e-16,2.705185e-16,-1.254745e-16,2.405888e-16,-5.467927e-18,-2.049034e-16,4.8347990000000004e-17,-8.748683000000001e-17
std,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,...,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041
min,-4.163891,-3.513875,-5.113789,-5.082017,-3.837395,-3.259458,-3.282411,-3.778996,-3.787796,-3.778426,...,-4.685564,-3.043914,-4.351662,-2.909413,-3.976129,-2.083549,-2.083021,-2.089294,-2.093688,-2.087517
25%,-0.7150467,-0.6880269,-0.6234693,-0.6811566,-0.6586892,-0.7144958,-0.6940907,-0.676607,-0.6807061,-0.6797892,...,-0.7232777,-0.6910718,-0.7048393,-0.6407408,-0.7037059,-0.788505,-0.8022435,-0.8022434,-0.8059467,-0.8106785
50%,-0.03181553,-0.05241102,-0.02470813,-0.05334653,-0.001441591,0.03429448,0.02504619,0.004858696,0.00154017,0.00816421,...,-0.01266656,0.03879722,-0.02292431,0.03621729,-0.011439,0.08593203,0.09809294,0.09977492,0.0888183,0.09020096
75%,0.7478928,0.7152999,0.6130645,0.6900006,0.6646622,0.7099323,0.6986521,0.6811286,0.6729371,0.6821273,...,0.7502262,0.6979696,0.7198393,0.6633683,0.7289575,0.8688939,0.8659023,0.8606716,0.8779721,0.8619097
max,3.049508,2.903852,2.948925,3.590079,4.109668,3.698497,3.866796,3.916931,3.782406,3.742447,...,3.565548,3.393666,3.08944,4.127903,3.744676,1.577445,1.567718,1.552584,1.558984,1.568818


In [32]:
# Summary statistics of the training features after QuantileTransformer (normal output).
quantile_transformed.describe()

Unnamed: 0,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.browser.provider,p.dogalize,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
count,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,...,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0,12345.0
mean,-0.003531,0.00319,0.006626,-0.002605,0.000926,-0.001641,-0.001902,-0.006671,-0.004588,-0.005029,...,-0.000389,0.007459,0.001001,0.003392,-0.001533,0.003501,6e-06,0.000772,-0.002356,-0.013123
std,1.000144,1.001558,1.017141,0.99689,1.001998,1.005815,0.999724,1.002899,1.005188,1.00128,...,0.998506,1.001913,0.997897,1.001926,1.007411,1.004498,0.999787,1.002697,1.002892,1.00509
min,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
25%,-0.676745,-0.67048,-0.670222,-0.67746,-0.668971,-0.679831,-0.676531,-0.68618,-0.680091,-0.682357,...,-0.679973,-0.674308,-0.673876,-0.671894,-0.678663,-0.675525,-0.670639,-0.671756,-0.675276,-0.691191
50%,-0.001386,0.003086,0.001046,-0.003298,0.005892,-0.00614,0.00092,-0.007098,-0.000399,-0.00539,...,-0.00362,0.003603,0.000377,0.002549,-0.007204,0.005615,-0.002084,-0.001651,-0.003377,-0.018479
75%,0.673066,0.676595,0.677953,0.667418,0.677447,0.681446,0.671023,0.671128,0.674299,0.668959,...,0.67822,0.681275,0.675932,0.679398,0.674288,0.677052,0.675419,0.671533,0.672253,0.664847
max,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,...,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338


In [41]:
# Per-column histograms of both transformations, side by side. This draws one figure
# per feature, so it is switched off by default; set the flag to True to inspect them.
SHOW_TRANSFORM_HISTOGRAMS = False

if SHOW_TRANSFORM_HISTOGRAMS:
    for column in X_train.columns:
        fig, (power_ax, quantile_ax) = plt.subplots(1, 2, figsize=(12, 4))
        sns.histplot(power_transformed[column], ax=power_ax)
        power_ax.set_title(f"{column}: power transformer")
        sns.histplot(quantile_transformed[column], ax=quantile_ax)
        quantile_ax.set_title(f"{column}: quantile transformer")
        plt.show()
        # Release the figure so a loop over many columns does not pile up memory.
        plt.close(fig)

We can see power transformation normalized some of the columns, but the ones that were uniformly distributed are now skewed to the side. The ranges for columns are approximately the same. Quantile transformation made the data normally distributed, also making the data in the columns be in the same range which is useful for some algorithms.

As these aren't all that co-usable, we use them separately without combining them.

# Phase 2-2: Feature Selection
We need to figure out which features help best with predicting the target variable. We can use the following methods.

## Filter Methods

### Variance Threshold

In [77]:
# Keep only the features whose variance on the raw (unscaled) training data exceeds 500.
# Note: the result is an array of the retained feature values, not of column names.
variance_selector = VarianceThreshold(threshold=500).fit(X_train)
high_variance_columns = variance_selector.transform(X_train)
high_variance_columns

array([[94.35106,  8.89805, 63.99106, ...,  7.20809, 78.66131, 70.14069],
       [78.87992, 49.9576 , 98.09531, ..., 44.84301, 81.60302, 21.03168],
       [80.06156,  4.65163, 95.10592, ..., 67.11484, 77.28452, 31.42955],
       ...,
       [19.83578, 31.40051, 10.15434, ..., 85.65207, 10.54911,  2.67848],
       [66.27537, 27.31082, 85.45787, ..., 18.65774, 32.00129, 34.18675],
       [22.98821, 77.07455, 16.27829, ..., 14.29205, 32.80857, 82.23808]])

It's important to keep in mind whether we already scaled the data. The threshold to get intended amount of columns vastly differs based on that.

In [80]:
# Same filter on the scaled training data: the variances are far smaller there,
# so the threshold has to shrink accordingly.
variance_selector_scaled = VarianceThreshold(threshold=0.05).fit(X_train_scaled)
high_variance_columns_scaled = variance_selector_scaled.transform(X_train_scaled)
high_variance_columns_scaled

array([[0.9435046 , 0.08895335, 0.63982556, ..., 0.0720809 , 0.7866131 ,
        0.7014069 ],
       [0.78877677, 0.49956109, 0.9809486 , ..., 0.4484301 , 0.8160302 ,
        0.2103168 ],
       [0.80059442, 0.04648789, 0.95104764, ..., 0.6711484 , 0.7728452 ,
        0.3142955 ],
       ...,
       [0.19827266, 0.31398466, 0.10133122, ..., 0.8565207 , 0.1054911 ,
        0.0267848 ],
       [0.66271788, 0.27308654, 0.85454436, ..., 0.1865774 , 0.3200129 ,
        0.3418675 ],
       [0.2298003 , 0.77073867, 0.16258519, ..., 0.1429205 , 0.3280857 ,
        0.8223808 ]])

We can see we had to lower the threshold by a factor of 10,000 (from 500 to 0.05) to keep the same number of columns.