# Fáza 3 - Strojové učenie

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load both tab-separated exports and parse their `ts` column into datetimes.
connections = pd.read_csv('data/connections.csv', sep='\t').assign(
    ts=lambda frame: pd.to_datetime(frame['ts'])
)

processes = pd.read_csv('data/processes.csv', sep='\t').assign(
    ts=lambda frame: pd.to_datetime(frame['ts'])
)

# Inner-join on the shared keys, then discard rows that are exact duplicates.
conn_processes = pd.merge(
    connections,
    processes,
    on=['ts', 'imei', 'mwra'],
    how='inner',
).drop_duplicates()

In [3]:
# Separate the `mwra` target from the remaining feature columns.
y = conn_processes['mwra']
X = conn_processes.drop(columns=['mwra'])

# Hold out 20 % of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
def encode_values(df):
    """Return a new frame whose ``ts`` column is cast to ``int64``.

    The frame passed in is not modified: the cast column is attached to a
    copy, so running the pipeline does not rewrite the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``ts`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``ts`` converted via ``astype('int64')``; all
        other columns and the column order are unchanged.

    Raises
    ------
    KeyError
        If ``df`` has no ``ts`` column.
    """
    # `assign` builds a new DataFrame instead of writing into `df`.
    return df.assign(ts=df['ts'].astype('int64'))

def remove_attributes(df):
    """Return ``df`` without the ``ts`` and ``imei`` columns."""
    unwanted = ['ts', 'imei']
    return df.drop(columns=unwanted)

def fill_missing_values(df):
    """Return a copy of ``df`` with numeric NaNs replaced by column medians.

    Each numeric column is filled with its own median, computed from the
    frame passed in. Non-numeric columns are left as they are. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        New frame with missing values in numeric columns imputed.
    """
    # Work on a copy so the caller's frame is never written to.
    filled = df.copy()
    for col in filled.select_dtypes(include=[np.number]).columns:
        filled[col] = filled[col].fillna(filled[col].median())

    return filled

def outlier_replacement(df):
    """Return a copy of ``df`` with numeric columns clipped to their 5%-95% range.

    For every numeric column, values below the 5th percentile are raised to
    it and values above the 95th percentile are lowered to it. Both
    percentiles are computed from the frame passed in. Non-numeric columns
    are left as they are. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be capped.

    Returns
    -------
    pandas.DataFrame
        New frame with the capped numeric columns.
    """
    # Work on a copy so the caller's frame is never written to.
    capped = df.copy()
    for col in capped.select_dtypes(include=[np.number]).columns:
        lower_bound = capped[col].quantile(0.05)
        upper_bound = capped[col].quantile(0.95)

        # One clip call replaces the two separate np.where passes.
        capped[col] = capped[col].clip(lower=lower_bound, upper=upper_bound)

    return capped

In [5]:
# Wrap each preprocessing function as a transformer (input validation off)
# so it can be used as a Pipeline step.
(
    encoder_transformer,
    removal_transformer,
    inputer_transformer,
    outliers_transformer,
) = (
    FunctionTransformer(func=step, validate=False)
    for step in (
        encode_values,
        remove_attributes,
        fill_missing_values,
        outlier_replacement,
    )
)

In [6]:
# Columns routed to StandardScaler.
standard_columns = [
    'c.UCMobile.intl',
    'c.updateassist',
    'c.UCMobile.x86',
    'p.android.defcontainer',
    'p.google',
    'p.android.gms',
    'p.olauncher',
    'p.android.vending',
    'p.browser.provider',
    'p.process.gapps',
    'p.gms.persistent',
    'p.simulator',
]

# Columns routed to MinMaxScaler.
minmax_columns = [
    'p.android.chrome',
    'p.android.documentsui',
    'p.android.gm',
]

# Columns routed to PowerTransformer.
power_columns = [
    'p.notifier',
]

# Columns routed to QuantileTransformer.
quantile_columns = [
    'c.android.gm',
    'c.android.youtube',
    'c.katana',
    'c.dogalize',
    'c.android.chrome',
    'c.android.vending',
    'c.raider',
    'p.system',
    'p.android.externalstorage',
    'p.android.packageinstaller',
    'p.android.settings',
    'p.inputmethod.latin',
    'p.katana',
    'p.dogalize',
]

In [7]:
# Apply a different scaler/transformer to each column group; columns not
# listed in any group are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), standard_columns),
        ('minmax', MinMaxScaler(), minmax_columns),
        ('power', PowerTransformer(method='yeo-johnson'), power_columns),
        # QuantileTransformer subsamples rows at random on large inputs, so
        # seed it (same seed as the train/test split) to keep the fitted
        # quantiles identical across Restart & Run All.
        (
            'quantile',
            QuantileTransformer(output_distribution='normal', random_state=42),
            quantile_columns,
        ),
    ],
    remainder='passthrough'
)

In [8]:
# Preprocessing chain, executed in the order listed.
pipeline = Pipeline(
    [
        ('encoding', encoder_transformer),    # cast `ts` to int64
        ('removing', removal_transformer),    # drop `ts` and `imei`
        ('outliers', outliers_transformer),   # cap numeric columns at 5%/95%
        ('inputer', inputer_transformer),     # median-fill numeric NaNs
        ('preprocessor', preprocessor),       # per-group scaling/transforms
    ]
)

In [9]:
# Fit every pipeline step on the training split and transform it in one pass.
X_train_transformed = pipeline.fit_transform(X_train)
# Reuse the already-fitted pipeline on the test split (no refitting here).
# NOTE(review): the FunctionTransformer steps wrap plain functions, so their
# medians and 5%/95% bounds are computed from whichever frame is passed in,
# i.e. from the test split here, not from the training data.
X_test_transformed = pipeline.transform(X_test)

### 3.1 Jednoduchý klasifikátor na základe závislosti v dátach

### (A)

### (B)

### (C)

### 3.2 Trénovanie a vyhodnotenie klasifikátorov strojového učenia

### (A)

### (B)

### (C)

### (D)

### (E)

### 3.3 Optimalizácia alias hyperparameter tuning

### (A)

### (B)

### (C)

### (D)

### 3.4 Vyhodnotenie vplyvu zvolenej stratégie riešenia na klasifikáciu

### (A)

### (B)

### (C)

### (D)

### (E)