# Automation in ML

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
from math import isnan
from sklearn import preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Missing values

In [None]:
class MissingValues:
    """Namespace for missing-value handling on pandas DataFrames."""

    # Function for handling missing values in the data
    @staticmethod
    def handle(df, missing_num='auto', missing_categ='auto', _n_neighbors=3):
        """Handle missing values in ``df`` and return the resulting frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data. Returned unchanged (same object) when it contains no
            missing values at all.
        missing_num : str or falsy
            Strategy for numerical features:
            ``'auto'`` runs linear-regression imputation followed by KNN
            imputation, ``'linreg'`` / ``'knn'`` run only that step,
            ``'mean'`` / ``'median'`` / ``'most_frequent'`` use a
            ``SimpleImputer`` with that strategy, ``'delete'`` delegates to
            ``_delete``. Any falsy value skips numerical handling.
        missing_categ : str or falsy
            Strategy for categorical features; falsy skips the step.
        _n_neighbors : int
            ``n_neighbors`` passed to ``KNNImputer``.

        Returns
        -------
        pandas.DataFrame
            Frame without all-missing rows, re-indexed from 0, after the
            selected strategies were applied.

        NOTE(review): the helpers ``_lin_regression_impute``, ``_impute`` and
        ``_delete`` are not defined in the visible part of this class. They
        are called here as plain functions (no ``self``), matching the helper
        convention of the other classes in this file - confirm once they are
        added.
        """
        if not df.isna().any().any():
            return df

        # Drop rows containing only missing values. The result of
        # reset_index() must be kept, otherwise the gaps left by the dropped
        # rows stay in the index.
        df = df.dropna(how='all').reset_index(drop=True)

        if missing_num:
            # automated handling of numerical missing values:
            # regression first, then KNN as a second pass
            if missing_num == 'auto':
                df = MissingValues._lin_regression_impute(df, LinearRegression())
                knn = KNNImputer(n_neighbors=_n_neighbors)
                df = MissingValues._impute(df, knn, type='num')
            # linear regression imputation
            elif missing_num == 'linreg':
                df = MissingValues._lin_regression_impute(df, LinearRegression())
            # knn imputation
            elif missing_num == 'knn':
                knn = KNNImputer(n_neighbors=_n_neighbors)
                df = MissingValues._impute(df, knn, type='num')
            # mean, median or mode imputation
            elif missing_num in ('mean', 'median', 'most_frequent'):
                simple = SimpleImputer(strategy=missing_num)
                df = MissingValues._impute(df, simple, type='num')
            # delete missing values
            elif missing_num == 'delete':
                df = MissingValues._delete(df, type='num')

        if missing_categ:
            # TODO: categorical handling is still a placeholder.
            ...

        return df

# Outliers

In [None]:
class Outliers:
    """Namespace for outlier handling on pandas DataFrames."""

    # Function that handles outliers in the data
    @staticmethod
    def handle(df, outliers='winz'):
        """Apply the selected outlier strategy to ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data.
        outliers : str or falsy
            ``'winz'`` delegates to ``_winsorization``, ``'delete'`` delegates
            to ``_delete``. Any other value (including falsy ones) leaves
            ``df`` untouched.

        Returns
        -------
        Whatever the selected helper returns, or ``df`` itself when no
        strategy applies.

        NOTE(review): the helpers below are still placeholders (``...``), so
        the ``'winz'`` and ``'delete'`` modes return ``None`` until they are
        implemented.
        """
        # Helpers are declared without ``self``; call them as plain functions.
        if outliers == 'winz':
            df = Outliers._winsorization(df)
        elif outliers == 'delete':
            df = Outliers._delete(df)
        return df

    @staticmethod
    def _winsorization(df):
        """Placeholder for the winsorization strategy (body not written)."""
        ...

    @staticmethod
    def _delete(df):
        """Placeholder for the outlier-deletion strategy (body not written)."""
        ...

    @staticmethod
    def _compute_bounds(df, feature):
        """Placeholder for the per-feature bound computation (body not written)."""
        ...

# Categorical encoding

In [None]:
class EncodeCateg:
    """Namespace for encoding categorical (non-numeric) features."""

    # 'auto' mode thresholds on the number of unique values per feature
    _ONEHOT_MAX_UNIQUE = 10
    _LABEL_MAX_UNIQUE = 20

    # Function for encoding of categorical features
    # to specify columns set encode_categ to: ['auto', ['col1', 'col2']]
    @staticmethod
    def handle(df, encode_categ=['auto']):
        """Encode the categorical features of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data.
        encode_categ : sequence
            ``encode_categ[0]`` is the mode: ``'auto'``, ``'onehot'``,
            ``'label'`` or a falsy value to skip encoding. An optional
            ``encode_categ[1]`` lists the columns to encode, either by name
            or by integer position in the input frame; without it every
            non-numeric column is considered. The default list is only read,
            never mutated.

        Returns
        -------
        The frame returned by the last helper that ran, or ``df`` itself when
        nothing was encoded.

        Notes
        -----
        * Columns are processed in frame order (deterministic).
        * Requested columns that are numeric or cannot be resolved are
          skipped instead of raising.
        * Columns that pandas can parse as datetimes are skipped.
        * In ``'auto'`` mode a feature is one-hot encoded with at most 10
          unique values, label encoded with at most 20, skipped otherwise.
        * A failing encoder does not abort the run; it is reported through a
          ``RuntimeWarning``.

        NOTE(review): ``_to_onehot`` / ``_to_label`` are still placeholders
        (``...``) and therefore return ``None`` for now.
        """
        import warnings  # local import: only needed to report encoder failures

        mode = encode_categ[0]
        if not mode:
            return df

        # Snapshot of the input columns: positional selections refer to the
        # frame as passed in, not to columns added by earlier encodings.
        input_columns = list(df.columns)
        numeric = set(df.select_dtypes(include=np.number).columns)
        # select non numeric features, keeping the frame's column order
        cols_categ = [col for col in input_columns if col not in numeric]
        categ_lookup = set(cols_categ)

        if len(encode_categ) == 1:
            target_cols = cols_categ  # encode ALL columns
        else:
            target_cols = encode_categ[1]  # encode only specific columns

        for feature in target_cols:
            if feature in categ_lookup:
                column = feature  # given as a column name
            elif (isinstance(feature, (int, np.integer))
                    and -len(input_columns) <= feature < len(input_columns)):
                column = input_columns[feature]  # given as a column position
            else:
                continue  # numeric column name or unknown reference
            if column not in categ_lookup:
                continue  # position pointed at a numeric column

            # skip encoding of datetime features
            if EncodeCateg._parses_as_datetime(df, column):
                continue

            try:
                if mode == 'auto':
                    n_unique = df[column].nunique()
                    if n_unique <= EncodeCateg._ONEHOT_MAX_UNIQUE:
                        df = EncodeCateg._to_onehot(df, column)
                    elif n_unique <= EncodeCateg._LABEL_MAX_UNIQUE:
                        df = EncodeCateg._to_label(df, column)
                    # more unique values than that: leave the feature as is
                elif mode == 'onehot':
                    df = EncodeCateg._to_onehot(df, column)
                elif mode == 'label':
                    df = EncodeCateg._to_label(df, column)
            except Exception as err:
                # Best effort: keep going with the remaining features, but do
                # not hide the failure.
                warnings.warn(
                    f"EncodeCateg: could not encode {column!r}: {err}",
                    RuntimeWarning,
                    stacklevel=2,
                )
        return df

    @staticmethod
    def _parses_as_datetime(df, column):
        """Return True when ``pd.to_datetime`` accepts ``df[column]``."""
        try:
            pd.to_datetime(df[column])
        except Exception:
            # Any parsing/lookup failure simply means "not a datetime column".
            return False
        return True

    @staticmethod
    def _to_onehot(df, feature, limit=10):
        """Placeholder for one-hot encoding of ``feature`` (body not written)."""
        ...

    @staticmethod
    def _to_label(df, feature):
        """Placeholder for label encoding of ``feature`` (body not written)."""
        ...

# Extraction of datetime features in datasets

In [None]:
# Function for extracting datetime values
def convert_datetime(df, extract_datetime='s'):
    """Convert datetime-like columns of ``df`` and extract their components.

    Every non-numeric column that ``pd.to_datetime`` can parse is converted
    in place to datetime, and its components are written to the columns
    ``'Day'``, ``'Month'``, ``'Year'``, ``'Hour'``, ``'Minute'`` and
    ``'Sec'`` down to the requested granularity.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is modified in place and also returned.
    extract_datetime : str
        Finest component to extract, one of ``'D'``, ``'M'``, ``'Y'``,
        ``'h'``, ``'m'``, ``'s'``. Coarser components are always included;
        any other value extracts only ``'Day'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    * If every extracted time component (hour/minute/second) is zero for all
      rows, the source holds dates only and those columns are dropped.
    * The output column names are fixed, so with several datetime columns
      the components of the last one processed win.
    """
    # (granularity code, output column, ``Series.dt`` attribute), coarse -> fine
    parts = (
        ('D', 'Day', 'day'),
        ('M', 'Month', 'month'),
        ('Y', 'Year', 'year'),
        ('h', 'Hour', 'hour'),
        ('m', 'Minute', 'minute'),
        ('s', 'Sec', 'second'),
    )
    codes = [code for code, _, _ in parts]
    depth = codes.index(extract_datetime) + 1 if extract_datetime in codes else 1
    selected = parts[:depth]
    time_cols = [name for _, name, _ in selected if name in ('Hour', 'Minute', 'Sec')]

    # Non-numeric columns in frame order (deterministic, unlike set iteration).
    numeric = set(df.select_dtypes(include=np.number).columns)
    candidates = [col for col in df.columns if col not in numeric]

    for feature in candidates:
        try:
            # Parse once and reuse; pandas infers the format by itself, the
            # deprecated ``infer_datetime_format`` flag is not needed.
            parsed = pd.to_datetime(df[feature])
            components = parsed.dt
        except Exception:
            # feature cannot be converted to datetime
            continue

        df[feature] = parsed
        for _, name, attr in selected:
            df[name] = getattr(components, attr)

        # All-zero time parts carry no information (date-only source): drop them.
        if time_cols and all((df[name] == 0).all() for name in time_cols):
            df.drop(columns=time_cols, inplace=True)

    return df