In [1]:
# import arff, numpy as np
# dataset = arff.load(open('mydataset.arff', 'rb'))
# data = np.array(dataset['data'])

In [2]:
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
data_example_df = arff.loadarff('OpenML/data/1.arff')

In [4]:
data_example_df = pd.DataFrame(data_example_df[0])

In [5]:
data_example_df

Unnamed: 0,family,product-type,steel,carbon,hardness,temper_rolling,condition,formability,strength,non-ageing,...,'s','p',shape,thick,width,len,oil,bore,packing,class
0,b'?',b'C',b'A',8.0,0.0,b'?',b'S',b'?',0.0,b'?',...,b'?',b'?',b'COIL',0.700,610.0,0.0,b'?',b'0',b'?',b'3'
1,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'2',0.0,b'?',...,b'?',b'?',b'COIL',3.200,610.0,0.0,b'?',b'0',b'?',b'3'
2,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'2',0.0,b'?',...,b'?',b'?',b'SHEET',0.700,1300.0,762.0,b'?',b'0',b'?',b'3'
3,b'?',b'C',b'A',0.0,60.0,b'T',b'?',b'?',0.0,b'?',...,b'?',b'?',b'COIL',2.801,385.1,0.0,b'?',b'0',b'?',b'3'
4,b'?',b'C',b'A',0.0,60.0,b'T',b'?',b'?',0.0,b'?',...,b'?',b'?',b'SHEET',0.801,255.0,269.0,b'?',b'0',b'?',b'3'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'3',0.0,b'?',...,b'?',b'?',b'SHEET',1.599,610.0,762.0,b'?',b'0',b'?',b'2'
894,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'3',0.0,b'?',...,b'?',b'?',b'SHEET',1.601,830.0,880.0,b'?',b'0',b'?',b'2'
895,b'?',b'C',b'V',0.0,0.0,b'?',b'S',b'2',0.0,b'?',...,b'?',b'?',b'SHEET',1.599,150.0,762.0,b'?',b'0',b'?',b'2'
896,b'?',b'C',b'A',0.0,85.0,b'T',b'?',b'?',0.0,b'?',...,b'?',b'?',b'COIL',0.400,20.0,0.0,b'?',b'0',b'?',b'U'


In [6]:
data_meta_inf_df = pd.read_csv("OpenML/data.csv")

In [7]:
data_meta_inf_df

Unnamed: 0,id,name,length,target
0,10,lymph,22513,class
1,1000,hypothyroid,276707,binaryClass
2,1001,sponge,41487,binaryClass
3,1002,ipumslasmall,2622855,binaryClass
4,1003,primarytumor,22769,binaryClass
...,...,...,...,...
1193,995,mfeatzernike,934793,binaryClass
1194,996,prnnfglass,10062,binaryClass
1195,997,balancescale,6440,binaryClass
1196,998,analcatdatabondrate,4213,binaryClass


In [8]:
data_meta_inf_df.sort_values(by='id')

Unnamed: 0,id,name,length,target
351,2,anneal,84089,class
442,3,krvskp,489987,class
483,4,labor,8244,class
1097,9,autos,30728,symboling
0,10,lymph,22513,class
...,...,...,...,...
575,41004,junglechesspcsendgamelionelephant,744770,class
576,41005,junglechesspcsendgameratrat,580258,class
577,41006,junglechesspcsendgameratlion,924887,class
578,41007,junglechesspcsendgamelionlion,372703,class


# Meta features extraction

In [9]:
class MetaFeaturesExtractor:
    def __init__(self, structured_data: pd.DataFrame):
        self._meta_features_table = {
                                    "base_meta_features": {
                                        "number_of_objects": {
                                            "method": self.get_number_of_objects
                                            },
                                        "number_of_features": {
                                            "method": self.get_number_of_features
                                            },
                                        "number_of_categorical_features": {
                                            "method": self.get_number_of_categorical_features
                                            },
                                        "number_of_classes": {
                                            "method": self.get_number_of_classes
                                            }
                                        },
                                    "statistical_meta_features": {
                                        "minimum": {
                                            "method": self.get_minimum
                                            },
                                        "maximum": {
                                            "method": self.get_maximum
                                            },
                                        "mean": {
                                            "method": self.get_mean
                                            },
                                        "std": {
                                            "method": self.get_std
                                            },
                                         "asymmetry_coef":{
                                             "method": self.get_asymmetry_coef
                                            },
                                         "excess_coef":{
                                             "method": self.get_excess_coef
                                            }
                                          # Число значений категории.
                                          # Энтропия вероятностей.
                                          # Любая другая функция, которая агрегирует множество чисел в одно, если применить её к распределению вероятностей.
                                        },
                                    "structural_meta_features": {
                                        "vortex_depth": {
                                            "method": self.get_vortexes_depth
                                            },
                                        "regression_coefs": {
                                            "method": self.get_regression_coefs
                                            }
                                        }
                                    }
        self._structured_data = structured_data
        self._meta_object = {}
        self._some_unstructured_meta_info_about_data = {}
        self._preprocess_structured_data()


    def _get_structured_data(self):
        return self._structured_data


    def _update_meta_object(self, meta_feature_name: str, meta_feature_value):
        self._meta_object[meta_feature_name] = meta_feature_value


    def _get_meta_features_table(self):
        return self._meta_features_table


    def _update_meta_feature_value(self, meta_feature_type, meta_feature_name, meta_feature_value):
        self._meta_features_table[meta_feature_type][meta_feature_name]["value"] = meta_feature_value


    def _get_unstructured_meta_info_about_data(self):
        return self._some_unstructured_meta_info_about_data


    def get_meta_object(self):
        return self._meta_object

    #===================================================================================

    def _preprocess_structured_data(self):
        data = self._get_structured_data()
        self._some_unstructured_meta_info_about_data["data_pd_shape"] = data.shape
        self._some_unstructured_meta_info_about_data["data_pd_describe_numerical"] = data.describe()
        self._some_unstructured_meta_info_about_data["data_pd_describe_categorical"] = data.describe(include=["object", "bool"])
        # ETC


    def get_number_of_objects(self) -> float:
        shape = self._get_unstructured_meta_info_about_data()["data_pd_shape"]
        return shape[0]


    def get_number_of_features(self):
        shape = self._get_unstructured_meta_info_about_data()["data_pd_shape"]
        return shape[1]


    def get_number_of_categorical_features(self):
        return self._get_unstructured_meta_info_about_data()["data_pd_describe_categorical"].shape[1]


    def get_number_of_classes(self):
        return self._get_unstructured_meta_info_about_data()["data_pd_describe_categorical"].iloc[:, -1]["unique"]


    def get_minimum(self):
        return self._get_unstructured_meta_info_about_data()["data_pd_describe_numerical"].loc["min"].min()


    def get_maximum(self):
        return self._get_unstructured_meta_info_about_data()["data_pd_describe_numerical"].loc["max"].max()


    def get_mean(self):
        return self._get_unstructured_meta_info_about_data()["data_pd_describe_numerical"].loc["mean"].mean()


    def get_std(self):
        return 8


    def get_asymmetry_coef(self):
        return 9


    def get_excess_coef(self):
        return 10


    def get_vortexes_depth(self):
        return 11


    def get_regression_coefs(self):
        return 12


    #===================================================================================


    def extract_meta_features(self):
        meta_features_table = self._get_meta_features_table()
        for _, meta_feature_type in enumerate(meta_features_table):
            meta_feature_type_table = meta_features_table[meta_feature_type]
            for _, meta_feature_name in enumerate(meta_feature_type_table):
                meta_feature_table = meta_feature_type_table[meta_feature_name]
                # print([meta_feature_type, meta_feature_name, meta_feature_table["method"]])
                meta_feature_value = meta_feature_table["method"]()
                self._update_meta_feature_value(meta_feature_type, meta_feature_name, meta_feature_value)
                self._update_meta_object(meta_feature_name, meta_feature_value)
        return


In [10]:
extractor = MetaFeaturesExtractor(data_example_df)

In [11]:
extractor.extract_meta_features()

In [12]:
print(extractor.get_meta_object())

{'number_of_objects': 898, 'number_of_features': 39, 'number_of_categorical_features': 33, 'number_of_classes': 5, 'minimum': 0.0, 'maximum': 4880.0, 'mean': 348.50426818856744, 'std': 8, 'asymmetry_coef': 9, 'excess_coef': 10, 'vortex_depth': 11, 'regression_coefs': 12}


In [13]:
# knn
# desision tree
# linear regression

In [14]:
# n_train = 150
# n_test = 1000
# noise = 0.1
#
#
# def f(x):
#     x = x.ravel()
#     return np.exp(-(x ** 2)) + 1.5 * np.exp(-((x - 2) ** 2))
#
#
# def generate(n_samples, noise):
#     X = np.random.rand(n_samples) * 10 - 5
#     X = np.sort(X).ravel()
#     y = (
#         np.exp(-(X ** 2))
#         + 1.5 * np.exp(-((X - 2) ** 2))
#         + np.random.normal(0.0, noise, n_samples)
#     )
#     X = X.reshape((n_samples, 1))
#     return X, y
#
#
# X_train, y_train = generate(n_samples=n_train, noise=noise)
# X_test, y_test = generate(n_samples=n_test, noise=noise)
#
# from sklearn.tree import DecisionTreeRegressor
#
# reg_tree = DecisionTreeRegressor(max_depth=5, random_state=17)
#
# reg_tree.fit(X_train, y_train)
# reg_tree_pred = reg_tree.predict(X_test)
#
# plt.figure(figsize=(10, 6))
# plt.plot(X_test, f(X_test), "b")
# # plt.scatter(X_train, y_train, c="b", s=20)
# plt.plot(X_test, reg_tree_pred, "g", lw=2)
# plt.xlim([-5, 5])
# plt.title(
#     "Decision tree regressor, MSE = %.2f"
#     % (np.sum((y_test - reg_tree_pred) ** 2) / n_test)
# )
# plt.show()

In [15]:
from typing import List


def split_data_train_test(dataframe: pd.DataFrame, frac: float = 0.2) -> List[pd.DataFrame]:
    test = dataframe.sample(frac=frac, axis=0)
    train = dataframe.drop(index=test.index)
    return [train, test]


def split_data_X_Y(dataframe: pd.DataFrame) -> List[np.ndarray]:
    X = dataframe.iloc[:, 0:-1]
    Y = dataframe.iloc[:, -1]
    return [X, Y]


def form_data(dataframe: pd.DataFrame) -> List[np.ndarray]:
    [train, test] = split_data_train_test(dataframe)
    [X_train, Y_train] = split_data_X_Y(train)
    [X_test, Y_test] = split_data_X_Y(test)
    return [X_train, Y_train, X_test, Y_test]

In [16]:
data_example_df

Unnamed: 0,family,product-type,steel,carbon,hardness,temper_rolling,condition,formability,strength,non-ageing,...,'s','p',shape,thick,width,len,oil,bore,packing,class
0,b'?',b'C',b'A',8.0,0.0,b'?',b'S',b'?',0.0,b'?',...,b'?',b'?',b'COIL',0.700,610.0,0.0,b'?',b'0',b'?',b'3'
1,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'2',0.0,b'?',...,b'?',b'?',b'COIL',3.200,610.0,0.0,b'?',b'0',b'?',b'3'
2,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'2',0.0,b'?',...,b'?',b'?',b'SHEET',0.700,1300.0,762.0,b'?',b'0',b'?',b'3'
3,b'?',b'C',b'A',0.0,60.0,b'T',b'?',b'?',0.0,b'?',...,b'?',b'?',b'COIL',2.801,385.1,0.0,b'?',b'0',b'?',b'3'
4,b'?',b'C',b'A',0.0,60.0,b'T',b'?',b'?',0.0,b'?',...,b'?',b'?',b'SHEET',0.801,255.0,269.0,b'?',b'0',b'?',b'3'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'3',0.0,b'?',...,b'?',b'?',b'SHEET',1.599,610.0,762.0,b'?',b'0',b'?',b'2'
894,b'?',b'C',b'R',0.0,0.0,b'?',b'S',b'3',0.0,b'?',...,b'?',b'?',b'SHEET',1.601,830.0,880.0,b'?',b'0',b'?',b'2'
895,b'?',b'C',b'V',0.0,0.0,b'?',b'S',b'2',0.0,b'?',...,b'?',b'?',b'SHEET',1.599,150.0,762.0,b'?',b'0',b'?',b'2'
896,b'?',b'C',b'A',0.0,85.0,b'T',b'?',b'?',0.0,b'?',...,b'?',b'?',b'COIL',0.400,20.0,0.0,b'?',b'0',b'?',b'U'


In [17]:
data_example_df = data_example_df.dropna()

In [18]:
X = data_example_df.iloc[:, 0:-1]
Y = data_example_df.iloc[:, -1]

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [20]:
non_numeric_columns = list(X.describe(exclude=[np.number]).columns)

In [21]:
transformer = make_column_transformer(
    (OneHotEncoder(), non_numeric_columns),
    remainder='passthrough')

transformed = transformer.fit_transform(X)
transformed_X = pd.DataFrame(transformed, columns=transformer.get_feature_names())
print(transformed_X.head())

   onehotencoder__x0_b'?'  onehotencoder__x0_b'TN'  onehotencoder__x0_b'ZS'  \
0                     1.0                      0.0                      0.0   
1                     1.0                      0.0                      0.0   
2                     1.0                      0.0                      0.0   
3                     1.0                      0.0                      0.0   
4                     1.0                      0.0                      0.0   

   onehotencoder__x1_b'C'  onehotencoder__x2_b'?'  onehotencoder__x2_b'A'  \
0                     1.0                     0.0                     1.0   
1                     1.0                     0.0                     0.0   
2                     1.0                     0.0                     0.0   
3                     1.0                     0.0                     1.0   
4                     1.0                     0.0                     1.0   

   onehotencoder__x2_b'K'  onehotencoder__x2_b'M'  onehotencod



In [32]:
Y = pd.DataFrame(Y)

In [33]:
list(Y.columns)

['class']

In [38]:
transformer = make_column_transformer(
    (OneHotEncoder(), list(pd.DataFrame(Y).columns)),
    remainder='passthrough')

transformed = transformer.fit_transform(Y)
transformed_Y = pd.DataFrame(transуformed, columns=transformer.get_feature_names_out())
print(transformed_Y.head())

NameError: name 'transуformed' is not defined

In [36]:
transformer.get_feature_names_out()

array(["onehotencoder__class_b'1'", "onehotencoder__class_b'2'",
       "onehotencoder__class_b'3'", "onehotencoder__class_b'5'",
       "onehotencoder__class_b'U'"], dtype=object)

# D_Tree on sample data

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
[X_train, Y_train, X_test, Y_test] = form_data(data_example_df)

In [67]:
non_numeric_columns = list(data_example_df.describe(exclude=[np.number]).columns)

In [68]:
# One-hot encoding multiple columns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from seaborn import load_dataset
import pandas as pd
#
# df = load_dataset('penguins')
# # df = df[['island', 'sex', 'body_mass_g']]
data_example_df = data_example_df.dropna()


In [73]:
data_example_df["class"].

count      898
unique       5
top       b'3'
freq       684
Name: class, dtype: object

In [70]:

transformer = make_column_transformer(
    (OneHotEncoder(), non_numeric_columns),
    remainder='passthrough')

transformed = transformer.fit_transform(data_example_df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names())
print(transformed_df.head())

   onehotencoder__x0_b'?'  onehotencoder__x0_b'TN'  onehotencoder__x0_b'ZS'  \
0                     1.0                      0.0                      0.0   
1                     1.0                      0.0                      0.0   
2                     1.0                      0.0                      0.0   
3                     1.0                      0.0                      0.0   
4                     1.0                      0.0                      0.0   

   onehotencoder__x1_b'C'  onehotencoder__x2_b'?'  onehotencoder__x2_b'A'  \
0                     1.0                     0.0                     1.0   
1                     1.0                     0.0                     0.0   
2                     1.0                     0.0                     0.0   
3                     1.0                     0.0                     1.0   
4                     1.0                     0.0                     1.0   

   onehotencoder__x2_b'K'  onehotencoder__x2_b'M'  onehotencod



In [71]:
transformed_df.columns

Index(['onehotencoder__x0_b'?'', 'onehotencoder__x0_b'TN'',
       'onehotencoder__x0_b'ZS'', 'onehotencoder__x1_b'C'',
       'onehotencoder__x2_b'?'', 'onehotencoder__x2_b'A'',
       'onehotencoder__x2_b'K'', 'onehotencoder__x2_b'M'',
       'onehotencoder__x2_b'R'', 'onehotencoder__x2_b'S'',
       'onehotencoder__x2_b'V'', 'onehotencoder__x2_b'W'',
       'onehotencoder__x3_b'?'', 'onehotencoder__x3_b'T'',
       'onehotencoder__x4_b'?'', 'onehotencoder__x4_b'A'',
       'onehotencoder__x4_b'S'', 'onehotencoder__x5_b'1'',
       'onehotencoder__x5_b'2'', 'onehotencoder__x5_b'3'',
       'onehotencoder__x5_b'5'', 'onehotencoder__x5_b'?'',
       'onehotencoder__x6_b'?'', 'onehotencoder__x6_b'N'',
       'onehotencoder__x7_b'?'', 'onehotencoder__x7_b'P'',
       'onehotencoder__x8_b'?'', 'onehotencoder__x8_b'D'',
       'onehotencoder__x8_b'E'', 'onehotencoder__x8_b'F'',
       'onehotencoder__x8_b'G'', 'onehotencoder__x9_b'1'',
       'onehotencoder__x9_b'2'', 'onehotencoder__x9_b'

In [52]:
transformer.get_feature_names()



['onehotencoder__x0_Adelie',
 'onehotencoder__x0_Chinstrap',
 'onehotencoder__x0_Gentoo',
 'onehotencoder__x1_Biscoe',
 'onehotencoder__x1_Dream',
 'onehotencoder__x1_Torgersen',
 'onehotencoder__x2_32.1',
 'onehotencoder__x2_33.1',
 'onehotencoder__x2_33.5',
 'onehotencoder__x2_34.0',
 'onehotencoder__x2_34.4',
 'onehotencoder__x2_34.5',
 'onehotencoder__x2_34.6',
 'onehotencoder__x2_35.0',
 'onehotencoder__x2_35.1',
 'onehotencoder__x2_35.2',
 'onehotencoder__x2_35.3',
 'onehotencoder__x2_35.5',
 'onehotencoder__x2_35.6',
 'onehotencoder__x2_35.7',
 'onehotencoder__x2_35.9',
 'onehotencoder__x2_36.0',
 'onehotencoder__x2_36.2',
 'onehotencoder__x2_36.3',
 'onehotencoder__x2_36.4',
 'onehotencoder__x2_36.5',
 'onehotencoder__x2_36.6',
 'onehotencoder__x2_36.7',
 'onehotencoder__x2_36.8',
 'onehotencoder__x2_36.9',
 'onehotencoder__x2_37.0',
 'onehotencoder__x2_37.2',
 'onehotencoder__x2_37.3',
 'onehotencoder__x2_37.5',
 'onehotencoder__x2_37.6',
 'onehotencoder__x2_37.7',
 'onehotenc

In [22]:
from sklearn.tree import DecisionTreeClassifier

reg_tree = DecisionTreeClassifier(max_depth=5, random_state=17)

reg_tree.fit(X_train, Y_train)
reg_tree_pred = reg_tree.predict(X_test)

ValueError: could not convert string to float: b'?'

In [7]:
data_example_df

NameError: name 'data_example_df' is not defined