In [None]:
# We also implemented an IQR-based method for outlier detection
def outliers_iqr(df):
    """Select the values lying outside the 1.5 * IQR fences.

    The quartiles are taken with ``df.quantile``; a value counts as an
    outlier when it is strictly below ``Q1 - 1.5 * IQR`` or strictly above
    ``Q3 + 1.5 * IQR``.

    Args:
        df: A pandas object supporting ``quantile`` and boolean-mask
            indexing (the file calls this with a DataFrame).

    Returns:
        ``df`` indexed by the outlier mask. For a DataFrame the shape is
        kept and non-outlier cells become NaN; for a Series only the
        outlying entries are returned.
    """
    first_quartile, third_quartile = df.quantile(0.25), df.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    too_low = df < (first_quartile - whisker)
    too_high = df > (third_quartile + whisker)
    return df[too_low | too_high]

In [None]:
def replace_outliers_10_90(df, cols):
    """Replace IQR outliers with the column's 10th / 90th percentile.

    For each column in ``cols`` the fences ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed. Values strictly below the lower fence
    are replaced with the column's 10th percentile, values strictly above
    the upper fence with its 90th percentile. All statistics are taken from
    the column as it is before any replacement. NaN values are left as-is
    because they compare False against both fences.

    Args:
        df: DataFrame to modify. The listed columns are overwritten in place.
        cols: Iterable of column labels to process.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in cols:
        values = df[col]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        p10, p90 = values.quantile(0.10), values.quantile(0.90)

        # Assign the whole column instead of writing through chained
        # indexing (df[col][idx] = ...): chained writes raise
        # SettingWithCopy warnings and are silently lost under pandas
        # copy-on-write.
        df[col] = values.mask(values < lower, p10).mask(values > upper, p90)
    return df

In [None]:
def replace_outliers_average(df, cols):
    """Replace IQR outliers with the column's median.

    For each column in ``cols`` the fences ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed, and every value strictly outside them
    is replaced with the column's 0.5 quantile. Note that, despite the
    function name, the replacement value is the median, not the arithmetic
    mean. All statistics are taken from the column as it is before any
    replacement. NaN values are left as-is.

    Args:
        df: DataFrame to modify. The listed columns are overwritten in place.
        cols: Iterable of column labels to process.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in cols:
        values = df[col]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        median = values.quantile(0.50)

        # Assign the whole column instead of writing through chained
        # indexing (df[col][idx] = ...): chained writes raise
        # SettingWithCopy warnings and are silently lost under pandas
        # copy-on-write.
        df[col] = values.mask((values < lower) | (values > upper), median)
    return df

In [None]:
def transform_data1(df_):
    """Encode, impute and de-outlier a copy of ``df_``.

    Steps, in order:
      1. Ordinal-encode the columns 'blood_group', 'race', 'relationship'
         and 'sex' on a deep copy of the input.
      2. Fill missing values with the per-column mean of the numeric columns.
      3. Drop every row holding an IQR outlier in a column whose total
         outlier count is between 1 and 99.
      4. Replace the remaining outliers of 'alt' via
         ``replace_outliers_average`` and those of 'etytr' via
         ``replace_outliers_10_90``.

    Args:
        df_: Input DataFrame; it is not modified.

    Returns:
        The transformed DataFrame.
    """
    # Data encoding
    categorical = ['blood_group', 'race', 'relationship', 'sex']
    encoder = ce.OrdinalEncoder(cols=categorical)
    df = encoder.fit_transform(df_.copy(deep=True))

    # In our project we chose to replace NA values with the mean of each column
    column_means = df.mean(numeric_only=True)
    df = df.fillna(column_means)

    # Remove the rows carrying outliers in attributes with fewer than 100 outliers
    flagged = outliers_iqr(df)
    per_column = flagged.count()
    few_outliers = per_column[(per_column > 0) & (per_column < 100)]
    rows_to_drop = flagged[few_outliers.index.tolist()].dropna(how='all').index
    df = df.drop(rows_to_drop)

    # Non-categorical columns with many outliers ('alt' and 'etytr') keep
    # their rows; the outlying values are replaced instead.
    df = replace_outliers_average(df, ['alt'])
    df = replace_outliers_10_90(df, ['etytr'])
    return df

In [None]:
# Scaling and normalizing train data
def transform_data(df):
    """Power-transform 'alt', then standardise and normalise every column.

    A ``ColumnTransformer`` applies ``PowerTransformer`` to 'alt' and passes
    the remaining columns through; the result is fed to ``StandardScaler``
    and then ``Normalizer``. The pipeline is fitted on ``df`` itself.

    Args:
        df: DataFrame containing an 'alt' column.

    Returns:
        A new DataFrame built from the pipeline output, with 'alt' as the
        first column followed by the other columns in their original order.
        No index is passed to the constructor, so the index of ``df`` is
        presumably not carried over — verify if row alignment with other
        frames matters.
    """
    power_alt = ColumnTransformer(
        transformers=[('alt', PowerTransformer(), ['alt'])],
        remainder='passthrough',
    )
    steps = [
        ('alt', power_alt),
        ('scaler', StandardScaler()),
        ('normalizer', Normalizer()),
    ]
    pipe = Pipeline(steps)

    # The transformed column is listed first, the passthrough columns after
    # it, so move 'alt' to the front and drop its second occurrence.
    ordered = ['alt'] + df.columns.tolist()
    cols = list(dict.fromkeys(ordered))

    transformed = pipe.fit_transform(df)
    return pd.DataFrame(transformed, columns=cols)

In [None]:
class OneRuleClassifier(BaseEstimator, ClassifierMixin):
    """One-rule (frequency table) classifier for one feature and a 0/1 target.

    ``fit`` cuts the feature into quantile bins and stores the bin-by-class
    contingency table. ``predict`` assigns each bin its most frequent class
    and reports how well that rule describes the fitted data.
    """

    def __init__(self):
        # Bin-by-class contingency table; None until fit() has been called.
        self.freq = None

    def fit(self, X, y, bins=10):
        """Build the bin-by-class frequency table.

        Args:
            X: One-dimensional feature values accepted by ``pd.qcut``.
            y: Target labels aligned with ``X``; ``predict`` expects the
                classes to be labelled 0 and 1.
            bins: Number of quantile bins (duplicate bin edges are dropped).

        Returns:
            ``self``, so that calls can be chained.
        """
        binned = pd.qcut(X, bins, labels=False, duplicates='drop')
        self.freq = pd.crosstab(binned, y)
        return self

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 for a zero denominator."""
        return numerator / denominator if denominator else 0.0

    def predict(self):
        """Score the majority-class-per-bin rule on the fitted data.

        Every bin predicts the class with the highest count in that bin
        (ties resolve to the first column, as ``idxmax`` does). Class 1 is
        treated as the positive class.

        Returns:
            Tuple ``(accuracy, precision, recall)``. A metric whose
            denominator is zero (for example precision when no bin predicts
            class 1) is reported as 0.0.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if self.freq is None:
            raise AttributeError(
                "OneRuleClassifier.predict() called before fit()")

        freq = self.freq
        predicts_negative = freq.idxmax(axis=1) == 0
        negatives, positives = freq[0], freq[1]

        tn = negatives[predicts_negative].sum()
        fn = positives[predicts_negative].sum()
        fp = negatives[~predicts_negative].sum()
        tp = positives[~predicts_negative].sum()

        # Guard every ratio: when all bins predict class 0, tp + fp is zero
        # and the unguarded division used to raise ZeroDivisionError.
        accuracy = self._safe_ratio(tp + tn, tp + fp + fn + tn)
        precision = self._safe_ratio(tp, tp + fp)
        recall = self._safe_ratio(tp, tp + fn)
        return accuracy, precision, recall

# Score the one-rule classifier on every feature for a range of bin counts,
# then plot accuracy against bin count and print the best accuracy per feature.
X, y = df_x_train[features], df_y_train
bins = range(1, 1000, 50)
oneRData = {'Bins': bins}
for col in X.columns.tolist():
    accuracies = []
    for bin_cnt in bins:
        OneRule = OneRuleClassifier()
        OneRule.fit(X[col], y, bin_cnt)
        accuracy, precision, recall = OneRule.predict()
        accuracies.append(accuracy)
    oneRData[col] = accuracies

oneRFrame = pd.DataFrame(oneRData)
oneRFrame.plot(x='Bins', subplots=True, figsize=(14, 20), layout=(9, 2))
printmd("**One Rule (Frequency Table) Classifier**")
print("Max accuracy for each feature on train data:")
print(oneRFrame.drop(columns=['Bins']).max())

In [1]:
# Scratch check: range() is lazy, so the cell shows a range object rather than its items.
range(5)

range(0, 5)

In [2]:
# Scratch check: iterating the range materialises its items into a list.
[x for x in range(5)]

[0, 1, 2, 3, 4]

In [9]:
import pandas as pd
# Scratch check: build an interval from 0 to 1 that is closed on the right
# (the value is not kept), then define the global `b`.
pd.Interval(0, 1, closed='right')
b = 4

In [10]:
a = 5

# Scratch check: print `a` only when a module-level name `b` is defined.
b_is_defined = 'b' in globals()
if b_is_defined:
    print(a)

5
