In [1]:
! pip install memory_profiler
%load_ext memory_profiler 

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from functions import *

In [3]:
random_state = 42

In [4]:
# Read in data
X_train, X_test, y_train, y_test = split_tsv('/data/home/bt211037/dissertation/supervised_ML_data.tsv', random_state)  

In [5]:
# Get the column names of the continuous and nominal data
cat, con = cat_con_cols(X_train) 
# Convert categorical cols values from floats to integers - train
X_train[cat] = X_train[cat].astype('Int64')  
# Convert categorical cols values from floats to integers - test
X_test[cat] = X_test[cat].astype('Int64')  

In [6]:
X_train = minmax_scaling(X_train, con)  # Normalisation

## Categorical imputation with extratreesclassifier

In [21]:
%%memit
# Categorical imputation 4000 rows
time = Stopwatch()
X_train_ca_4 = categorical_imputer(X_train[0:4000], cat, random_state)
print(time.stop())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, cat] = imputed_cat


30.97033182310406
peak memory: 6789.85 MiB, increment: 5204.39 MiB


In [8]:
%%memit
# Categorical imputation 40000 rows
time = Stopwatch()
X_train_ca_40 = categorical_imputer(X_train[0:40000], cat, random_state)
print(time.stop())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, cat] = imputed_cat


148.17322739993688
peak memory: 60296.97 MiB, increment: 53596.24 MiB


## Continuous imputation with bayesianridegclassifer

In [9]:
%%memit
# Continuous imputation 4000 rows
time = Stopwatch()
X_train_co_4 = continuous_data(X_train[0:4000], con, random_state)
print(time.stop())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, con] = imputed_con


49.55575369496364
peak memory: 46008.56 MiB, increment: 40.98 MiB


In [None]:
%%memit
# Continuous imputation 40000 rows
time = Stopwatch()
X_train_co_40 = continuous_data(X_train, con, random_state)
print(time.stop())

## Categorical imputation with ridge classfier

In [7]:
from sklearn.linear_model import RidgeClassifier

In [11]:
# Categorical imputation using extratreesclassifier
def r_cat_imputer(df, cat, random_state):

    cat_imputer = IterativeImputer(estimator=RidgeClassifier(),
                                   initial_strategy='most_frequent',
                                   max_iter=10, random_state=random_state,
                                   verbose=0)

    imputed_cat = cat_imputer.fit_transform(df[cat])
    df.loc[:, cat] = imputed_cat
    return df


In [12]:
%%memit
# Categorical imputation 4000 rows
time = Stopwatch()
X_train_ca_4 = r_cat_imputer(X_train[0:4000], cat, random_state)
print(time.stop())

15.078198442002758
peak memory: 1585.36 MiB, increment: 35.46 MiB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, cat] = imputed_cat


In [24]:
%%memit
# Categorical imputation 40000 rows
time = Stopwatch()
X_train_ca_40 = r_cat_imputer(X_train[0:40000], cat, random_state)
print(time.stop())



236.03486201900523
peak memory: 6737.78 MiB, increment: 395.58 MiB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, cat] = imputed_cat
