In [194]:
import numpy as np
import random
import pandas as pd

df = pd.read_csv('australian.dat', delimiter=" ")
df2 =df.copy()
size = len(df)
data = {}
diff = []
for column in df.iloc[:, :-1]:
    diff.append([column])
    data[column] = {}

In [195]:
def find_min_max(data):
    return [min(data), max(data)]

def custom_mean(data):
    sum = 0
    for value in data:
        if not isinstance(value, str): sum += value
    return sum/len(data)


def custom_variance(data):
    sum = 0
    mean = custom_mean(data)
    for value in data:
        if not isinstance(value, str): sum += (value - mean) ** 2
    return sum

def count_diff(data):
    return [len(np.unique(data)), np.unique(data)]

def deviation(data):
    mean = custom_mean(data)
    result = 0
    for value in data:
        if not isinstance(value, str): result += (np.square(value - mean))
    return np.sqrt(result/len(data-1))

def classes_deviation(data):
    class_0_data = df[df['CLASS'] == 0]
    class_1_data = df[df['CLASS'] == 1]
    std_class_0 = deviation(class_0_data[data])
    std_class_1 = deviation(class_1_data[data])

    return [std_class_0, std_class_1]

def generate_missing_values(data, selected_class, attributes, percent_missing=0.1):
    data_copy = data.copy()
    selected_data = data_copy[data_copy['CLASS'] == selected_class]
    num_missing = int(len(selected_data) * percent_missing)
    missing_indices = random.sample(selected_data.index.tolist(), num_missing)
    for attr in attributes:
        for index in missing_indices:
            if pd.api.types.is_numeric_dtype(selected_data[attr]):
                mean_value = custom_mean(selected_data[attr])
                data_copy.at[index, attr] = mean_value
            else:
                most_common_value = selected_data[attr].mode().values[0]
                data_copy.at[index, attr] = most_common_value

    return data_copy

def custom_normalization(data, a, b):
    min_val = min(data)
    max_val = max(data)
    normalized_data = [((x - min_val) * (b - a) / (max_val - min_val)) + a for x in data]
    return normalized_data


def custom_std_for_attr(data):
    var = custom_variance(data)
    mean = custom_mean(data)
    return [(x - mean) / var for x in data]

def better_print(data):
    for item in data:
        print(item)
        print("MIN:", data[item]["min"])
        print("MAX:",data[item]["max"])
        print("Unique values:",data[item]["uniqueValuesNumber"])
        print("List of unique values:", "\n" ,data[item]["uniqueValuesList"])
        print("Deviation:",data[item]["deviation"])
        print("Deviation for CLASS 0:",data[item]["Class0Dev"])
        print("Deviation for CLASS 1:",data[item]["Class1Dev"], "\n")



In [196]:
# i = randomMissing()
#
for item in data:
    data[item]["min"] =find_min_max(df[item])[0]
    data[item]["max"] =find_min_max(df[item])[1]
    data[item]["uniqueValuesNumber"] = count_diff(df[item])[0]
    data[item]["uniqueValuesList"] = count_diff(df[item])[1]
    data[item]["deviation"] = deviation(df[item])
    data[item]["Class0Dev"] = classes_deviation(item)[0]
    data[item]["Class1Dev"] = classes_deviation(item)[1]


In [197]:
better_print(data)

A1
MIN: 0
MAX: 1
Unique values: 2
List of unique values: 
 [0 1]
Deviation: 0.46714351368915735
Deviation for CLASS 0: 0.4648839546178204
Deviation for CLASS 1: 0.46984641946593947 

A2
MIN: 13.75
MAX: 80.25
Unique values: 350
List of unique values: 
 [13.75 15.17 15.75 15.83 15.92 16.   16.08 16.17 16.25 16.33 16.5  16.92
 17.08 17.25 17.33 17.42 17.5  17.58 17.67 17.83 17.92 18.   18.08 18.17
 18.25 18.33 18.42 18.5  18.58 18.67 18.75 18.83 18.92 19.   19.17 19.33
 19.42 19.5  19.58 19.67 19.75 20.   20.08 20.17 20.25 20.33 20.42 20.5
 20.67 20.75 20.83 21.   21.08 21.17 21.25 21.33 21.42 21.5  21.58 21.67
 21.75 21.83 21.92 22.   22.08 22.17 22.25 22.33 22.42 22.5  22.58 22.67
 22.75 22.83 22.92 23.   23.08 23.17 23.25 23.33 23.42 23.5  23.58 23.75
 23.92 24.08 24.17 24.33 24.42 24.5  24.58 24.75 24.83 24.92 25.   25.08
 25.17 25.25 25.33 25.42 25.5  25.58 25.67 25.75 25.83 25.92 26.   26.08
 26.17 26.25 26.33 26.5  26.58 26.67 26.75 26.83 26.92 27.   27.17 27.25
 27.33 27.42 27.58 

In [198]:
selected_class = 1
attributes_with_missing_values = ['A3']
df_with_missing = generate_missing_values(df, selected_class, attributes_with_missing_values)
print(df_with_missing)

     A1     A2      A3  A4  A5  A6     A7  A8  A9  A10  A11  A12  A13   A14  \
0     1  22.08  11.460   2   4   4  1.585   0   0    0    1    2  100  1213   
1     0  22.67   7.000   2   8   4  0.165   0   0    0    0    2  160     1   
2     0  29.58   1.750   1   4   4  1.250   0   0    0    1    2  280     1   
3     0  21.67  11.500   1   5   3  0.000   1   1   11    1    2    0     1   
4     1  20.17   8.170   2   6   4  1.960   1   1   14    0    2   60   159   
..   ..    ...     ...  ..  ..  ..    ...  ..  ..  ...  ...  ...  ...   ...   
685   1  31.57  10.500   2  14   4  6.500   1   0    0    0    2    0     1   
686   1  20.67   0.415   2   8   4  0.125   0   0    0    0    2    0    45   
687   0  18.83   9.540   2   6   4  0.085   1   0    0    0    2  100     1   
688   0  27.42  14.500   2  14   8  3.085   1   1    1    0    2  120    12   
689   1  41.00   0.040   2  10   4  0.040   0   1    1    0    1  560     1   

     CLASS  
0        0  
1        0  
2        0  

In [199]:
normalized_df = pd.DataFrame()

for column in df.columns:
    normalized_data = custom_normalization(df[column], -10, 10)
    normalized_df[column] = normalized_data

print(normalized_df)

       A1        A2        A3    A4         A5   A6         A7    A8    A9  \
0    10.0 -7.494737 -1.814286   0.0  -5.384615 -2.5  -8.887719 -10.0 -10.0   
1   -10.0 -7.317293 -5.000000   0.0   0.769231 -2.5  -9.884211 -10.0 -10.0   
2   -10.0 -5.239098 -8.750000 -10.0  -5.384615 -2.5  -9.122807 -10.0 -10.0   
3   -10.0 -7.618045 -1.785714 -10.0  -3.846154 -5.0 -10.000000  10.0  10.0   
4    10.0 -8.069173 -4.164286   0.0  -2.307692 -2.5  -8.624561  10.0  10.0   
..    ...       ...       ...   ...        ...  ...        ...   ...   ...   
685  10.0 -4.640602 -2.500000   0.0  10.000000 -2.5  -5.438596  10.0 -10.0   
686  10.0 -7.918797 -9.703571   0.0   0.769231 -2.5  -9.912281 -10.0 -10.0   
687 -10.0 -8.472180 -3.185714   0.0  -2.307692 -2.5  -9.940351  10.0 -10.0   
688 -10.0 -5.888722  0.357143   0.0  10.000000  7.5  -7.835088  10.0  10.0   
689  10.0 -1.804511 -9.971429   0.0   3.846154 -2.5  -9.971930 -10.0  10.0   

           A10   A11   A12   A13      A14  CLASS  
0   -10.0000

In [200]:
std_df = pd.DataFrame()

for column in df.columns:
    std_data = custom_std_for_attr(df[column])
    std_df[column] = std_data

print(std_df)

           A1            A2        A3        A4        A5        A6        A7  \
0    0.002137 -9.801402e-05  0.000392  0.001831 -0.000361 -0.000253 -0.000083   
1   -0.004505 -9.191926e-05  0.000131  0.001831  0.000067 -0.000253 -0.000267   
2   -0.004505 -2.053832e-05 -0.000176 -0.006016 -0.000361 -0.000253 -0.000126   
3   -0.004505 -1.022494e-04  0.000395 -0.006016 -0.000254 -0.000619 -0.000288   
4    0.002137 -1.177445e-04  0.000200  0.001831 -0.000147 -0.000253 -0.000034   
..        ...           ...       ...       ...       ...       ...       ...   
685  0.002137  1.856422e-08  0.000336  0.001831  0.000709 -0.000253  0.000554   
686  0.002137 -1.125794e-04 -0.000254  0.001831  0.000067 -0.000253 -0.000272   
687 -0.004505 -1.315868e-04  0.000280  0.001831 -0.000147 -0.000253 -0.000277   
688 -0.004505 -4.285132e-05  0.000571  0.001831  0.000709  0.001209  0.000112   
689  0.002137  9.743134e-05 -0.000276  0.001831  0.000281 -0.000253 -0.000283   

          A8        A9     