# Phase 2

In [1095]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import datetime
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

## 2.1 Data preprocessing

In [1096]:
def convert_to_datetime(value):
    """Parse a date string written in one of several known formats.

    The formats are tried in order and the first one that matches wins.

    Args:
        value: Text to parse. Anything that is not a string (for example
            ``None`` or a float ``NaN`` standing in for a missing cell) is
            treated as unparseable.

    Returns:
        A ``datetime`` built with the first matching format, or ``None`` when
        no format matches or ``value`` is not a string.
    """
    # strptime only accepts str and raises TypeError otherwise; missing values
    # should fall through to the same "unparseable" result as bad text.
    if not isinstance(value, str):
        return None

    date_formats = ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y, %H:%M:%S", "%d %b %Y")
    for fmt in date_formats:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            # Text does not match this format; try the next one.
            continue
    return None

def _iqr_bounds(series, factor=1.5):
    """Return the (lower, upper) fences Q1 - factor*IQR and Q3 + factor*IQR of a series."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    return q1 - factor * spread, q3 + factor * spread


def load_and_preprocess_data(connections_path='data/connections.csv', processes_path='data/processes.csv'):
    """Load the connections and processes tables, clean them and merge them.

    Steps, in order:
      1. Read both tab-separated files (only empty fields count as missing).
      2. Parse the ``ts`` column as datetimes.
      3. Drop fully duplicated rows.
      4. Cast the ``mwra`` column to ``int``.
      5. Treat outliers using 1.5 * IQR fences:
         - ``c.dogalize`` and ``p.android.documentsui`` are clipped to the fences,
         - rows whose ``c.android.youtube`` lies outside the fences are removed.
      6. Inner-join processes with connections on ``ts``, ``imei`` and ``mwra``
         and drop the ``ts`` and ``imei`` key columns.

    Args:
        connections_path: Location of the connections file.
        processes_path: Location of the processes file.

    Returns:
        The merged ``DataFrame``: ``mwra`` plus the remaining process and
        connection columns.

    Side effects:
        Prints the shape of the merged frame.
    """
    read_options = dict(sep='\t', keep_default_na=False, na_values='')
    connections = pd.read_csv(connections_path, **read_options)
    processes = pd.read_csv(processes_path, **read_options)

    connections['ts'] = pd.to_datetime(connections['ts'])
    processes['ts'] = pd.to_datetime(processes['ts'])

    connections = connections.drop_duplicates()
    processes = processes.drop_duplicates()

    connections['mwra'] = connections['mwra'].astype(int)
    processes['mwra'] = processes['mwra'].astype(int)

    # Clip extreme c.dogalize values to the fences (rows are kept).
    lower, upper = _iqr_bounds(connections['c.dogalize'])
    connections['c.dogalize'] = connections['c.dogalize'].clip(lower=lower, upper=upper)

    # Remove rows whose c.android.youtube value lies outside the fences.
    lower, upper = _iqr_bounds(connections['c.android.youtube'])
    connections = connections[connections['c.android.youtube'].between(lower, upper)]

    # Clip extreme p.android.documentsui values to the fences (rows are kept).
    lower, upper = _iqr_bounds(processes['p.android.documentsui'])
    processes['p.android.documentsui'] = processes['p.android.documentsui'].clip(lower=lower, upper=upper)

    merged = processes.merge(connections, on=['ts', 'imei', 'mwra'], how='inner')
    merged = merged.drop(columns=['ts', 'imei'])

    print("Dataset:", merged.shape)

    return merged

In [1097]:
# Build the cleaned, merged dataset that the rest of the notebook works on.
data = load_and_preprocess_data()

Dataset: (14878, 31)


In [1098]:
def split_data(data, ratio):
    """Split ``data`` into a leading training part and a trailing test part.

    The split is purely positional: the first ``ratio`` percent of the rows
    (rounded down) become the training subset and the remaining rows become
    the test subset. Rows are not shuffled.

    Args:
        data: Row-sliceable object exposing ``shape`` (e.g. a ``DataFrame``).
        ratio: Training subset percentage, a number between 0 and 100
            (integers and floats are both accepted).

    Returns:
        Tuple ``(training_data, test_data)`` of slices of ``data``.

    Raises:
        ValueError: If ``ratio`` is not within [0, 100].
    """
    if not 0 <= ratio <= 100:
        raise ValueError(f"ratio must be a percentage between 0 and 100, got {ratio!r}")

    # int() keeps the slice index integral even when ratio is a float.
    slice_index = int(data.shape[0] * ratio // 100)
    training_data = data[:slice_index]
    test_data = data[slice_index:]
    return training_data, test_data

In [1099]:
# Use the first 75 % of the rows for training and the remaining rows for testing.
training_data, test_data = split_data(data,75)
print("training: ", training_data.shape)
print("test: ", test_data.shape)

training:  (11158, 31)
test:  (3720, 31)


### Data Normalization

In [1100]:
# Peek at the first rows of the training split before any scaling is applied.
training_data.head()

Unnamed: 0,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.katana,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,0,7.41473,10.17656,14.80917,12.14702,11.5562,8.33912,14.0245,73.80657,0.01502,...,11.65403,10.65335,10.48791,16.9162,5.04564,36.21508,14.24956,4.09319,15.20934,5.11764
1,1,9.40603,6.57378,6.06519,10.56643,16.74062,13.78434,7.57297,18.42794,0.01253,...,10.29551,10.46363,10.98438,15.65637,14.82931,24.83765,57.49911,23.94919,8.82448,44.1357
2,0,13.61225,11.73312,8.99679,10.79425,12.60312,10.78121,11.90788,60.61602,0.00536,...,12.12831,9.53752,6.7808,6.98948,10.92433,3.29441,20.37891,52.55353,49.51037,99.75215
3,1,8.1497,9.53996,8.28249,10.80629,13.9767,12.73047,11.64714,48.0334,0.03681,...,11.06419,11.55759,8.87744,11.84499,15.45902,24.78878,36.40299,25.77644,21.84167,65.17774
4,1,8.20358,8.94156,8.62248,8.33003,13.44049,13.31239,11.40689,43.77363,0.01229,...,12.08457,10.4286,9.82241,12.48869,12.85363,40.98237,74.13316,96.7745,18.48116,90.77304


In [1101]:
# Every column that is scaled / considered for selection: all columns except the target 'mwra'.
feature_columns = [
    # process ("p.") features
    'p.android.chrome',
    'p.android.documentsui',
    'p.android.gm',
    'p.system',
    'p.android.packageinstaller',
    'p.android.settings',
    'p.android.externalstorage',
    'p.android.gms',
    'p.katana',
    'p.browser.provider',
    'p.dogalize',
    'p.simulator',
    'p.google',
    'p.android.vending',
    'p.inputmethod.latin',
    'p.process.gapps',
    'p.notifier',
    'p.olauncher',
    'p.gms.persistent',
    'p.android.defcontainer',
    # connection ("c.") features
    'c.android.youtube',
    'c.dogalize',
    'c.android.gm',
    'c.katana',
    'c.android.chrome',
    'c.raider',
    'c.android.vending',
    'c.UCMobile.intl',
    'c.UCMobile.x86',
    'c.updateassist',
]

In [1102]:
def min_max_scaling(data, feature_columns):
    """Rescale the given columns with a freshly fitted ``MinMaxScaler``.

    The scaler is fitted on ``data`` itself. ``data`` is modified in place
    (its ``feature_columns`` are overwritten) and the same object is returned,
    so pass a copy when the original frame must stay untouched.

    Args:
        data: Frame holding the columns to rescale.
        feature_columns: Names of the columns to rescale.

    Returns:
        ``data`` with the selected columns replaced by their scaled values.
    """
    rescaled_values = MinMaxScaler().fit_transform(data[feature_columns])
    data[feature_columns] = rescaled_values
    return data

In [1103]:
# Scale a copy: min_max_scaling overwrites the feature columns of the frame it receives.
data_to_transform = training_data.copy()
normalized_data = min_max_scaling(data_to_transform, feature_columns)
normalized_data

Unnamed: 0,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.katana,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,0,0.318339,0.490964,0.681877,0.570709,0.428630,0.365279,0.603899,0.735925,0.000164,...,0.563435,0.461205,0.574090,0.840767,0.258522,0.362151,0.142496,0.040932,0.152067,0.051176
1,1,0.427289,0.205080,0.204867,0.496133,0.776338,0.662044,0.273348,0.177613,0.000137,...,0.471048,0.448699,0.601324,0.768300,0.759804,0.248377,0.574991,0.239492,0.088217,0.441357
2,0,0.657424,0.614478,0.364795,0.506882,0.498845,0.498373,0.495452,0.602942,0.000059,...,0.595688,0.387652,0.370735,0.269764,0.559726,0.032944,0.203789,0.525535,0.495088,0.997522
3,1,0.358551,0.440449,0.325828,0.507451,0.590968,0.604608,0.482092,0.476087,0.000403,...,0.523322,0.520811,0.485747,0.549062,0.792068,0.247888,0.364030,0.257764,0.218393,0.651777
4,1,0.361499,0.392965,0.344375,0.390615,0.555005,0.636323,0.469783,0.433141,0.000134,...,0.592714,0.446390,0.537584,0.586089,0.658577,0.409824,0.741332,0.967745,0.184786,0.907730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,0,0.674334,0.735604,0.324217,0.389621,0.406291,0.373495,0.371661,0.598830,0.000016,...,0.597494,0.279603,0.313592,0.230778,0.700315,0.136841,0.770969,0.319299,0.161165,0.671279
11154,0,0.521492,0.407818,0.496179,0.591852,0.273890,0.670576,0.530720,0.501307,0.000010,...,0.499922,0.148470,0.461356,0.334139,0.296442,0.967893,0.072632,0.301471,0.496504,0.501139
11155,1,0.545218,0.718159,0.527567,0.227654,0.535097,0.314264,0.439566,0.312212,0.007759,...,0.512305,0.716588,0.762834,0.788142,0.577898,0.301306,0.513129,0.880274,0.588100,0.623462
11156,0,0.517123,0.767216,0.329142,0.556434,0.258722,0.586705,0.503369,0.449470,0.000009,...,0.707511,0.205644,0.427882,0.251935,0.431770,0.011265,0.934553,0.482591,0.920806,0.372878


### Data Standardization

In [1104]:
def standard_scaling(data, feature_columns):
    """Standardize the given columns with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``data`` itself. ``data`` is modified in place
    (its ``feature_columns`` are overwritten) and the same object is returned,
    so pass a copy when the original frame must stay untouched.

    Args:
        data: Frame holding the columns to standardize.
        feature_columns: Names of the columns to standardize.

    Returns:
        ``data`` with the selected columns replaced by their standardized values.
    """
    standardized_values = StandardScaler().fit_transform(data[feature_columns])
    data[feature_columns] = standardized_values
    return data

In [1105]:
# Scale a copy: standard_scaling overwrites the feature columns of the frame it receives.
data_to_transform = training_data.copy()
standardized_data = standard_scaling(data_to_transform, feature_columns)
standardized_data

Unnamed: 0,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.katana,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,0,-1.242509,0.044947,2.269220,0.799890,-0.454192,-1.130095,1.003581,2.073762,-0.154337,...,0.402588,-0.190757,0.201808,2.204756,-1.903257,-0.456628,-1.242146,-1.579027,-1.200166,-1.547316
1,1,-0.472420,-1.466121,-1.412354,0.202218,2.352231,0.974259,-1.315797,-2.238911,-0.155166,...,-0.135164,-0.259479,0.395243,1.661913,1.865498,-0.850041,0.255222,-0.894925,-1.420549,-0.191975
2,0,1.154240,0.697795,-0.178030,0.288364,0.112525,-0.186327,0.242639,1.046533,-0.157552,...,0.590325,-0.594946,-1.242564,-2.072530,0.361266,-1.594971,-1.029938,0.090584,-0.016216,1.739933
3,1,-0.958277,-0.222054,-0.478779,0.292917,0.856070,0.566982,0.148901,0.066648,-0.147084,...,0.169108,0.136788,-0.425667,0.019642,2.108068,-0.851731,-0.475159,-0.831971,-0.971242,0.538947
4,1,-0.937440,-0.473033,-0.335630,-0.643437,0.565809,0.791870,0.062529,-0.265087,-0.155246,...,0.573011,-0.272168,-0.057486,0.297004,1.104449,-0.291782,0.831119,1.614134,-1.087234,1.428032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,0,1.273765,1.338018,-0.491213,-0.651405,-0.634495,-1.071836,-0.625965,1.014772,-0.158861,...,0.600834,-1.188702,-1.648437,-2.364563,1.418252,-1.235713,0.933728,-0.619965,-1.168763,0.606687
11154,0,0.193435,-0.394527,0.836001,0.969331,-1.703128,1.034755,0.490106,0.261462,-0.159040,...,0.032904,-1.909304,-0.598912,-1.590303,-1.618164,1.637930,-1.484024,-0.681388,-0.011328,0.015685
11155,1,0.361136,1.245814,1.078255,-1.949457,0.405124,-1.491843,-0.149491,-1.199204,0.076691,...,0.104978,1.212628,1.542404,1.810547,0.497892,-0.667019,0.041047,1.312770,0.304818,0.440589
11156,0,0.162551,1.505106,-0.453201,0.685486,-1.825553,0.440030,0.298190,-0.138953,-0.159050,...,1.241210,-1.595122,-0.836668,-2.206083,-0.600731,-1.669932,1.500080,-0.057372,1.453166,-0.429846


### Power transformer

In [1106]:
def power_transform(data, feature_columns):
    """Apply a freshly fitted ``PowerTransformer`` (default settings) to the given columns.

    The transformer is fitted on ``data`` itself. ``data`` is modified in
    place (its ``feature_columns`` are overwritten) and the same object is
    returned, so pass a copy when the original frame must stay untouched.

    Args:
        data: Frame holding the columns to transform.
        feature_columns: Names of the columns to transform.

    Returns:
        ``data`` with the selected columns replaced by their transformed values.
    """
    transformed_values = PowerTransformer().fit_transform(data[feature_columns])
    data[feature_columns] = transformed_values
    return data

In [1107]:
# Transform a copy: power_transform overwrites the feature columns of the frame it receives.
data_to_transform = training_data.copy()
power_transformed_data = power_transform(data_to_transform, feature_columns)
power_transformed_data

Unnamed: 0,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.katana,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,0,-1.286834,0.144762,2.016132,0.817118,-0.456455,-1.130395,1.004431,2.061759,-0.682387,...,0.431485,-0.239972,0.215592,2.518335,-1.910094,-0.350287,-1.257418,-1.774003,-1.205499,-1.718469
1,1,-0.423361,-1.646526,-1.561019,0.248308,2.363645,0.979119,-1.313141,-2.257006,-0.711192,...,-0.098792,-0.308496,0.406821,1.824118,1.859507,-0.777522,0.326940,-0.831179,-1.513158,-0.089817
2,0,1.139414,0.755618,-0.093057,0.332144,0.109969,-0.214376,0.238273,1.045543,-0.796187,...,0.612111,-0.636408,-1.249253,-1.900836,0.363221,-1.823857,-0.991697,0.179609,0.078773,1.571905
3,1,-0.956204,-0.128232,-0.415615,0.336556,0.855615,0.552602,0.144244,0.070442,-0.445008,...,0.203690,0.092706,-0.412141,-0.047903,2.100036,-0.779465,-0.378786,-0.759113,-0.921109,0.584384
4,1,-0.932552,-0.399240,-0.259724,-0.613418,0.564213,0.787132,0.057675,-0.261186,-0.713988,...,0.595545,-0.321098,-0.042379,0.243528,1.103555,-0.183265,0.834646,1.473043,-1.061768,1.326269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,0,1.243779,1.290449,-0.429367,-0.621910,-0.636297,-1.075439,-0.629731,1.014031,-0.844099,...,0.622158,-1.188574,-1.675383,-2.108178,1.415506,-1.256288,0.921741,-0.525961,-1.164769,0.643485
11154,0,0.252138,-0.312870,0.875105,0.973274,-1.698225,1.043142,0.486893,0.264813,-0.850758,...,0.069101,-1.805282,-0.587643,-1.529914,-1.622030,1.486880,-1.600115,-0.592132,0.083403,0.109431
11155,1,0.413780,1.216813,1.084077,-2.132043,0.403071,-1.466538,-0.154523,-1.200096,1.829461,...,0.140481,1.251851,1.521961,2.010562,0.499496,-0.573022,0.128455,1.233268,0.375657,0.497678
11156,0,0.222024,1.421230,-0.387432,0.710469,-1.819414,0.421302,0.294036,-0.134993,-0.851128,...,1.222328,-1.544078,-0.830196,-1.997303,-0.598801,-1.984817,1.387494,0.040193,1.346365,-0.327579


### Quantile transformer

In [1108]:
def quantile_transform(data, feature_columns):
    """Map the given columns to a normal output distribution with ``QuantileTransformer``.

    The transformer is created with ``output_distribution='normal'`` and fitted
    on ``data`` itself. ``data`` is modified in place (its ``feature_columns``
    are overwritten) and the same object is returned, so pass a copy when the
    original frame must stay untouched.

    Args:
        data: Frame holding the columns to transform.
        feature_columns: Names of the columns to transform.

    Returns:
        ``data`` with the selected columns replaced by their transformed values.
    """
    to_normal = QuantileTransformer(output_distribution='normal')
    transformed_values = to_normal.fit_transform(data[feature_columns])
    data[feature_columns] = transformed_values
    return data

In [1109]:
# Transform a copy: quantile_transform overwrites the feature columns of the frame it receives.
data_to_transform = training_data.copy()
quantile_transformed_data = quantile_transform(data_to_transform, feature_columns)
quantile_transformed_data

Unnamed: 0,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.katana,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,0,-1.296282,0.186144,1.878704,0.785029,-0.451520,-1.067684,0.993862,2.080287,-0.262884,...,0.382725,-0.261051,0.210243,2.396129,-1.967325,-0.340771,-1.057840,-1.729047,-1.018526,-1.626672
1,1,-0.362886,-1.701442,-1.607229,0.277178,2.326118,0.948612,-1.268486,-2.229749,-0.323698,...,-0.078272,-0.322279,0.384390,1.872844,1.896141,-0.668532,0.180031,-0.695548,-1.347216,-0.134601
2,0,1.065763,0.697189,-0.081390,0.357208,0.116339,-0.223485,0.201393,1.045976,-0.617586,...,0.545919,-0.625219,-1.239536,-1.800634,0.352908,-1.803040,-0.822168,0.061936,-0.018711,2.792639
3,1,-0.940475,-0.081758,-0.434677,0.361121,0.858892,0.501811,0.111471,0.074017,0.044713,...,0.180118,0.045208,-0.373368,-0.081345,2.215463,-0.670203,-0.356364,-0.637628,-0.770340,0.400650
4,1,-0.914318,-0.363253,-0.266887,-0.594373,0.567933,0.753414,0.022822,-0.261171,-0.329171,...,0.527438,-0.331743,-0.019922,0.226333,1.056338,-0.205888,0.635939,1.843071,-0.888065,1.337379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,0,1.188439,1.186115,-0.448465,-0.604761,-0.646815,-1.015797,-0.623234,1.013240,-1.111702,...,0.555440,-1.123826,-1.697842,-2.052185,1.375083,-1.077791,0.731214,-0.466983,-0.984338,0.453693
11154,0,0.253469,-0.271142,0.919071,0.919269,-1.695296,1.021077,0.455628,0.265890,-1.381364,...,0.071265,-1.785100,-0.546666,-1.438215,-1.683021,1.900432,-1.445398,-0.517692,-0.015258,0.017385
11155,1,0.392248,1.103483,1.115867,-2.199759,0.406098,-1.415764,-0.178883,-1.192142,1.258869,...,0.127304,1.259105,1.511208,2.013224,0.473567,-0.502983,0.016492,1.162090,0.214451,0.325478
11156,0,0.227667,1.311238,-0.405315,0.689250,-1.798114,0.371022,0.251505,-0.138301,-1.411293,...,1.170637,-1.475263,-0.789348,-1.908560,-0.553828,-2.256577,1.518582,-0.046674,1.407237,-0.322213


### Robust Scaling

In [1110]:
def robust_scaling(data, feature_columns):
    """Rescale the given columns with a freshly fitted ``RobustScaler``.

    The scaler is fitted on ``data`` itself. ``data`` is modified in place
    (its ``feature_columns`` are overwritten) and the same object is returned,
    so pass a copy when the original frame must stay untouched.

    Args:
        data: Frame holding the columns to rescale.
        feature_columns: Names of the columns to rescale.

    Returns:
        ``data`` with the selected columns replaced by their scaled values.
    """
    rescaled_values = RobustScaler().fit_transform(data[feature_columns])
    data[feature_columns] = rescaled_values
    return data

In [1111]:
# Scale a copy: robust_scaling overwrites the feature columns of the frame it receives.
data_to_transform = training_data.copy()
robust_scaled_data = robust_scaling(data_to_transform, feature_columns)
robust_scaled_data

Unnamed: 0,mwra,p.android.chrome,p.android.documentsui,p.android.gm,p.system,p.android.packageinstaller,p.android.settings,p.android.externalstorage,p.android.gms,p.katana,...,c.android.youtube,c.dogalize,c.android.gm,c.katana,c.android.chrome,c.raider,c.android.vending,c.UCMobile.intl,c.UCMobile.x86,c.updateassist
0,0,-0.793439,0.138112,1.956396,0.657407,-0.347282,-0.840317,0.700525,1.520385,-0.098505,...,0.303418,-0.202971,0.164876,1.678565,-1.313658,-0.255182,-0.721845,-0.916235,-0.691728,-0.889374
1,1,-0.260950,-0.954491,-1.074273,0.222148,1.772398,0.639026,-0.972174,-1.642254,-0.112807,...,-0.059952,-0.253202,0.300242,1.245138,1.305283,-0.482451,0.136913,-0.521411,-0.817949,-0.105850
2,0,0.863827,0.610165,-0.058179,0.284885,0.080757,-0.176857,0.151745,0.767081,-0.153990,...,0.430275,-0.498402,-0.845895,-1.736587,0.259978,-0.912787,-0.600141,0.047368,-0.013638,1.010986
3,1,-0.596903,-0.054947,-0.305755,0.288200,0.642353,0.352713,0.084143,0.048495,0.026651,...,0.145651,0.036438,-0.274231,-0.066115,1.473847,-0.483427,-0.281969,-0.485077,-0.560615,0.316696
4,1,-0.582495,-0.236422,-0.187915,-0.393706,0.423121,0.510808,0.021853,-0.194777,-0.114186,...,0.418576,-0.262477,-0.016578,0.155341,0.776423,-0.159953,0.467197,0.926674,-0.627048,0.830676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,0,0.946474,1.073090,-0.315991,-0.399508,-0.483464,-0.799362,-0.474679,0.743789,-0.176563,...,0.437377,-0.932391,-1.129925,-1.969757,0.994487,-0.705249,0.526045,-0.362720,-0.673743,0.355856
11154,0,0.199465,-0.179657,0.776571,0.780804,-1.290598,0.681555,0.330215,0.191359,-0.179664,...,0.053615,-1.459094,-0.395468,-1.351559,-1.115545,0.954816,-0.860565,-0.398169,-0.010839,0.014198
11155,1,0.315424,1.006419,0.975994,-1.344823,0.301756,-1.094624,-0.131053,-0.879799,3.888054,...,0.102317,0.822792,1.103025,1.363813,0.354921,-0.376722,0.014081,0.752744,0.170230,0.259835
11156,0,0.178110,1.193905,-0.284699,0.574092,-1.383065,0.263467,0.191808,-0.102279,-0.179837,...,0.870092,-1.229452,-0.561849,-1.843221,-0.408522,-0.956091,0.850854,-0.038023,0.827929,-0.243364


## 2.2 Feature selection

In [1112]:
# Number of features each selector keeps ("pocet" is Slovak for "count").
pocet=1

# Candidate features are the float64 columns of the min-max scaled training data;
# the target 'mwra' is not among them because load_and_preprocess_data casts it to int.
X = normalized_data.select_dtypes(include=['float64'])
y = normalized_data['mwra']

We use the min-max normalized data because the chi-squared test requires non-negative feature values.

### Mutual Information

In [1113]:
# Fit the selector once and reuse it for both the reduced matrix and the
# column names. mutual_info_regression is randomized, so a second independent
# fit could rank the features differently from the one that produced mi_data.
# NOTE(review): 'mwra' looks like a binary label, for which mutual_info_classif
# may be the more appropriate score function - confirm before switching.
mi_selector = SelectKBest(mutual_info_regression, k=pocet)
mi_data = mi_selector.fit_transform(X, y)

selected_features = X.columns[mi_selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['p.android.documentsui'], dtype='object')


### Chi-Squared

In [1114]:
# Fit the chi-squared selector once; the same fitted object yields both the
# reduced matrix and the mask used to look up the selected column names.
chi_selector = SelectKBest(chi2, k=pocet)
chi_data = chi_selector.fit_transform(X, y)

selected_features = X.columns[chi_selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['p.android.documentsui'], dtype='object')


### F-value

In [1115]:
# Fit the F-value selector once; the same fitted object yields both the
# reduced matrix and the mask used to look up the selected column names.
# NOTE(review): 'mwra' looks like a binary label, for which f_classif may be
# the more appropriate score function - confirm before switching.
f_selector = SelectKBest(f_regression, k=pocet)
f_data = f_selector.fit_transform(X, y)

selected_features = X.columns[f_selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['p.android.documentsui'], dtype='object')


### Feature ranking (Poradie)

The selected features form a consistent ranking. When the selection was run for 1, 2, 3, 4, and 5 features, each run kept the features chosen by the previous run and appended one additional feature at the end.

## 2.3 Reproducibility of preprocessing

In [1116]:
# Parameters of the reproducible preprocessing run.
data_ratio = 75  # percentage of rows used as training data
number_of_selected_features = 1
# Transformations: MinMaxScaler(), StandardScaler(), PowerTransformer(), QuantileTransformer(output_distribution='normal')
transformer_function = MinMaxScaler()
# Selections: mutual_info_regression, chi2, f_regression
selection_function = chi2

# Rebuild the dataset and the split from scratch so this cell does not depend
# on frames produced by the exploratory cells.
data = load_and_preprocess_data()
training_data, test_data = split_data(data,data_ratio)

feature_columns = ['p.android.chrome', 'p.android.documentsui', 'p.android.gm', 'p.system', 'p.android.packageinstaller', 'p.android.settings', 'p.android.externalstorage', 'p.android.gms', 'p.katana', 'p.browser.provider', 'p.dogalize', 'p.simulator', 'p.google', 'p.android.vending', 'p.inputmethod.latin', 'p.process.gapps', 'p.notifier', 'p.olauncher', 'p.gms.persistent', 'p.android.defcontainer', 'c.android.youtube', 'c.dogalize', 'c.android.gm', 'c.katana', 'c.android.chrome', 'c.raider', 'c.android.vending', 'c.UCMobile.intl', 'c.UCMobile.x86', 'c.updateassist']

# Scale only the feature columns; remainder='passthrough' keeps every other
# column (here the target 'mwra') unscaled and appends it after the features.
transformations = ColumnTransformer(
    transformers=[
        ('transformer_function', transformer_function, feature_columns)
    ],
    remainder='passthrough'
)

# TODO: the selector is configured but not yet part of the pipeline below.
# Adding it requires fitting on the features and the target separately,
# because SelectKBest needs y and must not see 'mwra' as an input column.
feature_selection = SelectKBest(selection_function, k=number_of_selected_features)

pipeline = Pipeline(steps=[
    ('transformations', transformations)
])

# Fit on the training split only, so that no statistics of the test split
# leak into the fitted transformer.
transformed_data = pipeline.fit_transform(training_data)
print("Transformed data: ", transformed_data)



Dataset: (14878, 31)
Transformed data:  [[0.31833887 0.49096362 0.68187693 ... 0.15206728 0.0511764  0.        ]
 [0.42728888 0.20507992 0.20486743 ... 0.08821672 0.441357   1.        ]
 [0.65742382 0.614478   0.36479472 ... 0.49508815 0.9975215  0.        ]
 ...
 [0.54521784 0.71815943 0.52756749 ... 0.58810011 0.6234618  1.        ]
 [0.51712271 0.76721572 0.32914167 ... 0.92080606 0.3728778  0.        ]
 [0.34552576 0.39172585 0.54230932 ... 0.15183238 0.0725044  0.        ]]


"\nX = transformed_df[feature_columns]\ny = transformed_df['mwra']\ntransformed_data = pipeline.fit_transform(X, y)\n"