In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC as SVM
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.stats import mode
from scipy.linalg import svd
from scipy import delete
from collections import defaultdict

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Section 1 MDI Methods
The **class importer** gathers all Missing Data Importer we are doing experiment on

Reference: https://github.com/rafaelvalle/MDI/blob/master/missing_data_imputation.py

*   **Case substitution** One observation with missing data is replaced with another non-sampled obser- vation.
*   **Summary statistic** Replace the missing data with the mean, median, or mode of the feature vec- tor. Using a numerical approach directly is not appropriate for nonordinal categorical data.
*    **One-hot** Create a binary variable to indicate whether or not a specific feature is missing.
*    **Hot deck and cold deck** Compute the K-Nearest Neighbors of the observation with missing data and assign the mode of the K-neighbors to the missing data. algorithm.
*    **Prediction Model** Train a prediction model (e.g., random forests) to predict the missing value.
*    **Factor analysis Perform factor analysis** (e.g., principal component analysis (PCA)) on the design matrix, project the design matrix onto the first N eigenvectors and replace the missing values by the values that might be given by the projected design matrix.


In [None]:
class Imputer():
    def __init__(self):
        """
        Attributes
        ----------

        """
    def drop(self, x, missing_data_cond):
        """ Drops all observations that have missing data

        Parameters
        ----------
        x : np.ndarray
            Matrix with categorical data, where rows are observations and
            columns are features
        missing_data_cond : function
            Method that takes one value and returns True if it represents
            missing data or false otherwise.
        """
        return x[np.sum(missing_data_cond(x), axis=1) == 0]

    def replace(self, x, missing_data_cond, in_place=False):

        if in_place:
            data = x
        else:
            data = np.copy(x)

        for col in range(x.shape[1]):
            nan_ids = missing_data_cond(x[:, col])
            val_ids = np.random.choice(np.where(~nan_ids)[0], size = np.sum(nan_ids))
            data[nan_ids, col] = data[val_ids, col]
        return data

    def summarize(self, x, summary_func, missing_data_cond, in_place=False):
        if in_place:
            data = x
        else:
            data = np.copy(x)

        for col in range(x.shape[1]):
            nan_ids = missing_data_cond(x[:, col])
            if np.any(nan_ids):
                val = summary_func(x[~nan_ids, col])
                data[nan_ids, col] = val

        return data

    def one_hot(self, x, missing_data_cond, weighted=False, in_place=False):
        if in_place:
            data = x
        else:
            data = np.copy(x)

        # find missing columns
        _, miss_cols = np.where(missing_data_cond(data))
        miss_cols_uniq = np.unique(miss_cols)

        for miss_col in miss_cols_uniq:
            uniq_vals, indices = np.unique(data[:, miss_col], return_inverse=True)
            one_hot_encoded = np.eye(len(uniq_vals), dtype=int)[indices]
            if weighted:
                one_hot_encoded *= len(uniq_vals)
            data = np.column_stack((data, one_hot_encoded))

        data = np.delete(data, miss_cols_uniq, axis=1)
        return data

    def knn(self, x, k, summary_func, missing_data_cond, cat_cols, weighted=False, in_place=False):
        if in_place:
            data = x
        else:
            data = np.copy(x)
        # first one-hot missing-data columns
        imp = Imputer()
        data_complete = imp.one_hot(data, missing_data_cond, weighted=weighted)
        # Then binarize complete columns
        col = 0
        cat_ids_comp = []
        while col <= max(cat_cols):
            try:
              float(data_complete[0, col])
            except ValueError:
              cat_ids_comp.append(col)
            col += 1

        data_complete = imp.binarize_data(data_complete, cat_ids_comp).astype(float)

        # normalize features
        scaler = StandardScaler().fit(data_complete)
        data_complete = scaler.transform(data_complete)
        # create dict with missing rows and respective columns
        missing = defaultdict(list)
        for x, y in np.argwhere(missing_data_cond(data)):
            missing[x].append(y)
        # create mask to build NearestNeighbors with complete observations only
        mask = np.ones(len(data_complete), dtype = bool)
        mask[np.fromiter(missing.keys(), dtype=int)] = False

        print('Computing k-nearest neighbors')
        nbrs = NearestNeighbors(n_neighbors=k, metric='euclidean').fit(data_complete[mask])
        ids = nbrs.kneighbors(data_complete[list(missing.keys())],return_distance=False)

        def substituteValues(i):
            row = missing.keys()[i]
            cols = missing[row]
            #data[row, cols] = mode(data[mask][ids[i]][:, cols])[0].flatten()
            nn_missing_dat=data[mask][ids[i]][:, cols]
            for missing_col in range(nn_missing_dat.shape[1]):
                data[row, cols[missing_col]]=mode(nn_missing_dat[:,missing_col])[0][0]
        print('Substituting missing values')
        map(substituteValues, range(len(missing)))
        return data

    def predict(self, x, cat_cols, missing_data_cond, clf, inc_miss=True, in_place=False):
      if in_place:
          data = x
      else:
          data = np.copy(x)

      miss_rows, miss_cols = np.where(missing_data_cond(data))
      miss_cols_uniq = np.unique(miss_cols)

      if inc_miss:
          valid_cols = np.arange(data.shape[1])
      else:
          valid_cols = [n for n in range(data.shape[1]) if n not in miss_cols_uniq]

      data_factorized = np.copy(data)

      factor_labels = {}
      for cat_col in cat_cols:
          labels, factors = np.unique(data[:, cat_col], return_inverse=True)
          factor_labels[cat_col] = labels
          data_factorized[:, cat_col] = factors

      data_factorized = data_factorized.astype(int)

      for miss_col in miss_cols_uniq:
          valid_obs = [n for n in range(len(data)) if data[n, miss_col] != '?']
          data_train = data_factorized[:, valid_cols][valid_obs]
          y_train = data_factorized[valid_obs, miss_col]

          # train random forest classifier
          clf.fit(data_train, y_train)

          # Given current feature, find obs with missing vals
          miss_obs_idx = miss_rows[miss_cols == miss_col]
          y_hat = clf.predict(data_factorized[miss_obs_idx, :][:, valid_cols])
          data_factorized[miss_obs_idx, miss_col] = y_hat
      for col, labels in factor_labels.items():
          data[:, col] = factor_labels[col][data_factorized[:, col]]

      return data

    def factor_analysis(self, x, cat_cols, missing_data_cond, threshold=0.9, technique='SVD', in_place=False):
      def _mode(d):
        return mode(d)[0].flatten()

      if in_place:
          data = x
      else:
          data = np.copy(x)

      data_summarized = self.summarize(data, _mode, missing_data_cond)

    # Factorize categorical variables and store encoding
      factor_labels = {}
      for cat_col in cat_cols:
          labels, factors = np.unique(data_summarized[:, cat_col], return_inverse=True)
          factor_labels[cat_col] = labels
          data_summarized[:, cat_col] = factors

      data_summarized = data_summarized.astype(float)
      if technique == 'SVD':
          U, S, Vt = svd(data_summarized, full_matrices=False)
          cumulative_variance = np.cumsum(S) / np.sum(S)
          n_components = np.searchsorted(cumulative_variance, threshold) + 1

        # Compute low rank approximation
          data_approx = U[:, :n_components] @ np.diag(S[:n_components]) @ Vt[:n_components, :]
      else:
          raise ValueError(f"Technique {technique} is not supported")

    # Get missing data indices
      missing_indices = np.argwhere(missing_data_cond(data))

    # Update data given projection
      for col in np.unique(missing_indices[:, 1]):
          obs_indices = missing_indices[missing_indices[:, 1] == col, 0]

        # Clip low rank approximation to be within factor labels
          proj_cats = np.clip(data_approx[obs_indices, col], 0, len(factor_labels[col]) - 1)

        # Round categorical variable factors to int
          proj_cats = proj_cats.round().astype(int)
          data[obs_indices, col] = factor_labels[col][proj_cats]

      return data

  # END of missing data imputation method

  # START of data transformation method
    def factorize_data(self, x, cols, in_place=False):
      if in_place:
          data = x
      else:
          data = np.copy(x)

      factors_labels = {}
      for col in cols:
          labels, factors = np.unique(data[:, col], return_inverse=True)
          factors_labels[col] = labels
          data[:, col] = factors

      return data, factors_labels

    def binarize_data(self, x, cols, miss_data_symbol=None, one_minus_one=True, in_place=False):
      if in_place:
        data = x
      else:
        data = np.copy(x)

      # Calculate the initial number of columns before adding new one-hot encoded columns
      initial_cols = data.shape[1]
      num_rows = data.shape[0]

      # Prepare to drop the original categorical columns after encoding
      new_data = np.empty((num_rows, 0), dtype=int)

      for col in cols:
          uniq_vals, indices = np.unique(data[:, col], return_inverse=True)
          # Create one-hot encoded matrix
          one_hot_encoded = np.eye(len(uniq_vals), dtype=int)[indices]
          if one_minus_one:
              one_hot_encoded = one_hot_encoded * 2 - 1

          new_data = np.column_stack((new_data, one_hot_encoded))

          # Add a column for missing data if necessary
          if miss_data_symbol is not None and miss_data_symbol not in uniq_vals:
              missing_column = -one_minus_one * np.ones((num_rows, 1), dtype=int)
              new_data = np.column_stack((new_data, missing_column))

      # Include non-categorical columns
      non_cat_cols = [n for n in range(initial_cols) if n not in cols]
      non_cat_data = data[:, non_cat_cols]

      # Combine non-categorical data and new one-hot encoded columns
      data = np.column_stack((non_cat_data, new_data))

      return data


# Section 2 Adult Data
The prediction task of the Adult Dataset is to determine whether a person makes over $50,000 a year.


*   N = 48,842
*   14 features (6 countinuous and 8 categorical)



In [None]:
df = pd.read_csv('/content/drive/MyDrive/[Machine Learning Project] Viona & Iris/code/adult-train-raw.txt', delimiter=',', header=None)
df.head()
# From previous literature, column 4 'Education_num' is redundant, and column 14 is the indicator
# This chunk just provides an overview of the dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
x = df.values
x = delete(x, (4, 14), 1)
x[1]

  x = delete(x, (4, 14), 1)


array([50, 'Self-emp-not-inc', 83311, 'Bachelors', 'Married-civ-spouse',
       'Exec-managerial', 'Husband', 'White', 'Male', 0, 0, 13,
       'United-States'], dtype=object)

In [35]:
x[14]

array([40, 'Private', 121772, 'Assoc-voc', 'Married-civ-spouse',
       'Craft-repair', 'Husband', 'Asian-Pac-Islander', 'Male', 0, 0, 40,
       '?'], dtype=object)

In [None]:
imp = Imputer()
missing_data_cond = lambda x: x == '?'
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5

In [None]:
# Imputing with random replacement
data_replace = imp.replace(x, missing_data_cond)

In [None]:
# Imputing with features with feature summary
summ_func = lambda x: mode(x)[0]
data_mode = imp.summarize(x, summ_func, missing_data_cond)

TypeError: Argument `a` is not recognized as numeric. Support for input that cannot be coerced to a numeric array was deprecated in SciPy 1.9.0 and removed in SciPy 1.11.0. Please consider `np.unique`.

In [None]:
# Using one-hot encoding for categorical features
data_onehot = imp.binarize_data(x, cat_cols)

In [33]:
len(data_onehot[14])

107

In [31]:
data_onehot[14] #13

array([40, 121772, 0, 0, 40, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,
       -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1], dtype=object)

In [32]:
data_onehot[1]

array([50, 83311, 0, 0, 13, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1,
       -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, 1, -1, -1], dtype=object)

In [36]:
data_prime = imp.one_hot(x, missing_data_cond)
len(data_prime[14]) #13

76

In [None]:
# predicting missing values using random forest
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)

In [None]:
# predicting missing values using SVM
clf = SVM(
    penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr',
    fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0,
    random_state=None, max_iter=1000)
data_svm = imp.predict(x, cat_cols, missing_data_cond, clf)



In [None]:
# predicting missing values using logistic regression
clf = LogisticRegression(
            penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
            intercept_scaling=1, max_iter = 1000)
data_logistic = imp.predict(x, cat_cols, missing_data_cond, clf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# replace missing data with values obtained after factor analysis
data_facanal = imp.factor_analysis(x, cat_cols, missing_data_cond)

In [None]:
# replace missing data with knn
data_knn = imp.knn(x, n_neighbors, np.mean, missing_data_cond, cat_cols)

Computing k-nearest neighbors
Substituting missing values
