In [1]:
# We will need to adjust this to fit our case but this is the structure for the Pipeline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import numpy as np

# Define a method to select attributes from a DataFrame
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# Read in the tsv file
df = pd.read_csv('06693-0001-Data.tsv', delim_whitespace=True)

# Make a helper dictionary to keep track of 'response', 'numerical', 'categorical'

var_dict = {
    'response': 'V1144',
    'cat_preds': ['V12', 'V13', 'V4518', 'V5114'],
    'num_preds': []
}

# Make a train-test split:
train_set, test_set = train_test_split(df, random_state=1, stratify=df[var_dict['response']])


#num_attribs = var_dict['num_preds']
cat_attribs = var_dict['cat_preds']

# We will need this later
#num_pipeline = Pipeline([
#    ('selector', DataFrameSelector(num_attribs)),
#    ('power_transf', PowerTransformer(method='yeo-johnson', standardize=False)),
#    ('minmax_scaler', MinMaxScaler()),
#])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('encoder', LabelBinarizer(sparse=False)),
])

full_pipeline = FeatureUnion(transformer_list=[
    #("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

y_train = train_set[var_dict['response']]

# Prepare the training data with the pipeline above
X_train_prepared = cat_pipeline.fit_transform(train_set)
X_train_prepared


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [2]:
X_train_prepared.shape

(6073, 57)

In [3]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train_prepared, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [5]:
X_test_prepared = cat_pipeline.transform(test_set)
X_test_prepared.shape

(2025, 54)

In [6]:
tree_pred = tree_clf.predict(X_test_prepared)

ValueError: Number of features of the model must match the input. Model n_features is 57 and input n_features is 54 

In [15]:
# So I need to catch where the categorical counts are failing --- in training set finding 57 categories 
# among the 5 variables - in the test set - only 54.

In [16]:
from sklearn.metrics import classification_report
y_train_pred = tree_clf.predict(X_train_prepared)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.65      0.66      0.66      2633
           1       0.65      0.15      0.25       284
           5       0.68      0.72      0.70      3151
           9       1.00      0.20      0.33         5

   micro avg       0.67      0.67      0.67      6073
   macro avg       0.75      0.43      0.49      6073
weighted avg       0.67      0.67      0.66      6073



In [17]:
test_set['V1144'].value_counts()

5    1050
0     878
1      95
9       2
Name: V1144, dtype: int64

In [19]:
train_set['V1144'].value_counts()

5    3151
0    2633
1     284
9       5
Name: V1144, dtype: int64

In [20]:
df['V1144'].value_counts()

5    4201
0    3511
1     379
9       7
Name: V1144, dtype: int64

In [21]:
len(df)

8098

In [22]:
len(test_set)

2025

In [23]:
tree_clf.feature_importances_

array([1.02331447e-02, 3.15434186e-03, 7.71145089e-03, 8.67979836e-03,
       8.33834204e-03, 5.28016222e-03, 9.36637063e-03, 5.43801307e-03,
       9.47128780e-03, 1.14095908e-02, 8.30581120e-03, 3.96457022e-03,
       1.39343032e-02, 8.26998899e-03, 9.88882774e-03, 1.32451389e-02,
       8.50359026e-03, 1.08565544e-02, 6.99421507e-03, 1.27360484e-02,
       1.31641023e-02, 1.48412319e-02, 9.42284538e-03, 7.25417172e-03,
       7.52226501e-03, 1.15366278e-02, 1.01213708e-02, 1.05183648e-02,
       1.04464327e-02, 1.38273178e-02, 7.95091605e-03, 8.78513574e-03,
       1.06437854e-02, 1.70850989e-02, 1.07900642e-02, 7.15506579e-03,
       9.56494677e-03, 1.19239746e-02, 1.04321322e-02, 1.05422742e-02,
       4.46089282e-03, 3.48508067e-04, 0.00000000e+00, 3.46986198e-04,
       2.63597955e-02, 3.58153730e-02, 3.23196604e-02, 1.94298254e-02,
       7.48440731e-03, 3.79024322e-01, 3.69949710e-03, 5.93416090e-03,
       7.47670411e-03, 1.50536535e-02, 7.01934112e-02, 0.00000000e+00,
      

In [30]:
sorted(zip(tree_clf.feature_importances_, range(0,X_train_prepared.shape[1])), reverse=True)

[(0.3790243215909741, 49),
 (0.07019341116276466, 54),
 (0.035815372996119005, 45),
 (0.0323196604349827, 46),
 (0.02635979546050285, 44),
 (0.01942982539025851, 47),
 (0.017085098921422008, 33),
 (0.01505365354736591, 53),
 (0.014841231894634248, 21),
 (0.013934303199765954, 12),
 (0.013827317797428037, 29),
 (0.013245138904990073, 15),
 (0.013164102339046052, 20),
 (0.01273604842168633, 19),
 (0.011923974638936026, 37),
 (0.011536627780334149, 25),
 (0.011409590831408472, 9),
 (0.010856554358837927, 17),
 (0.010790064178424662, 34),
 (0.010643785368145418, 32),
 (0.010542274244273305, 39),
 (0.010518364827482213, 27),
 (0.010446432720073172, 28),
 (0.010432132185387294, 38),
 (0.01023314474005016, 0),
 (0.010121370777031182, 26),
 (0.009888827738674806, 14),
 (0.009564946771575888, 36),
 (0.009471287799915231, 8),
 (0.009422845382702466, 22),
 (0.009366370628743301, 6),
 (0.008785135742524357, 31),
 (0.008679798358755, 3),
 (0.008503590260516478, 16),
 (0.008338342041536778, 4),
 (0.

In [32]:
sum(X_train_prepared[:,49])

1600.0

In [33]:
X_train_prepared.shape

(6073, 57)