In [None]:
# Name of the target column: it is removed from the feature table and used as
# the class label when the classifier is fitted.
class_col = 'CLS_Grade_GE10'

# Parameters applied to the DecisionTreeClassifier via set_params().
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a fixed seed, equally good splits can be
# resolved differently from run to run, so the fitted tree (and every
# probability derived from it) would not be reproducible.
run_params = {
    'max_depth': 5,
    'min_samples_leaf': 20,
    'class_weight': 'balanced',
    'random_state': 0,
}

# Columns treated as categorical; each one is expanded into dummy columns
# named "<column>_<value>".
cat_cols = [
    'Subject', 'school', 'sex', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
    'traveltime', 'studytime', 'schoolsup', 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic',
]

In [None]:
import pandas as pd
from sklearn import tree

In [None]:
# used by this notebook (Derivative of data pipeline code)
# code to support processing of dummy variables in notebooks
class dummy_builder:
    """Replace categorical DataFrame columns with dummy (indicator) columns.

    Parameters
    ----------
    prefixes : dict
        Maps each categorical column name to the prefix used when naming its
        dummy columns (``pd.get_dummies`` names them ``<prefix>_<value>``).
    """

    def __init__(self, prefixes):
        self.prefixes = prefixes

    def make_dummies(self, df, dummy_cols):
        """Return ``df`` with each listed column replaced by its dummy columns.

        The dummies for a column are appended on the right and the original
        column is removed. The frame passed in is not modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Table holding the categorical columns.
        dummy_cols : iterable
            Names of the columns to expand. Names absent from ``df`` are
            skipped silently. Columns with at most one distinct value are
            left as they are and a message is printed.

        Returns
        -------
        pandas.DataFrame
            New frame with dummies in place of the expanded columns (``df``
            itself if nothing was expanded).

        Raises
        ------
        Exception
            If a column present in ``df`` has no entry in ``self.prefixes``.
        """
        for c in dummy_cols:
            # there might be some dropped columns
            if c not in df.columns:
                continue

            # `in` instead of dict.has_key(), which does not exist on Python 3
            if c not in self.prefixes:
                raise Exception("prefixes defined in dummies.py do not contain the key:", c)

            if len(df[c].unique()) <= 1:
                # The original called self.g_print, which this class never
                # defines (presumably a leftover from the pipeline code this
                # was derived from); plain print keeps the message visible.
                print(str(c) + " had only one value; not creating dummies")
                continue

            dummies = pd.get_dummies(df[c], prefix=self.prefixes[c])
            # Build a new frame (categorical dropped, dummies appended) so the
            # caller's DataFrame is never mutated.
            df = pd.concat([df.drop(c, axis=1), dummies], axis=1)
        return df

In [None]:
# these are used by other notebooks
def prep_vector_for_classifier(row, categorical_cols=None, feature_cols=None):
    """Turn one undummied row into a 1 x n_features array for the classifier.

    Each categorical entry becomes a single ``"<column>_<value>"`` indicator
    set to 1; every other entry is passed through unchanged. The result is
    then aligned to the dummied training columns: dummy columns not produced
    by this row are filled with 0, and labels that are not in the training
    columns are dropped.

    Parameters
    ----------
    row : pandas.Series or dict
        Maps undummied column name to value.
    categorical_cols : collection, optional
        Names of the categorical columns. Defaults to the notebook-level
        ``cat_cols``.
    feature_cols : sequence, optional
        Column labels, in order, of the data the classifier was fitted on.
        Defaults to the notebook-level ``dummied_data_cols``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, len(feature_cols))``.
    """
    # Fall back to the notebook globals so existing one-argument calls work.
    if categorical_cols is None:
        categorical_cols = cat_cols
    if feature_cols is None:
        feature_cols = dummied_data_cols

    new_values = []
    new_index = []
    # .items() works for both a Series and a plain dict persona;
    # Series.iteritems() was removed in pandas 2.0 and dicts have no
    # iteritems() on Python 3.
    for name, value in row.items():
        if name in categorical_cols:
            new_values.append(1)
            new_index.append('{}_{}'.format(name, value))
        else:
            new_values.append(value)
            new_index.append(name)

    # a bit awkward: the training columns are needed to know which dummy
    # entries must be present as 0
    s = pd.Series(data=new_values, index=new_index).reindex(feature_cols).fillna(0)

    return s.values.reshape(1, -1)

def predict_1(row):
    """Return the fitted classifier's probability of the second class for one row.

    ``row`` is an undummied record; it is converted with
    ``prep_vector_for_classifier`` and scored by the notebook-level ``clf``.
    """
    features = prep_vector_for_classifier(row)
    class_probabilities = clf.predict_proba(features)
    # One input row -> take result row 0. Column 1 is the second class;
    # presumably the "grade >= 10" outcome -- confirm against clf.classes_.
    return class_probabilities[0][1]

In [None]:
# Load the full table, then split it into the class label and the features.
# NOTE(review): data_file is not defined in the visible part of this notebook;
# presumably it is set by whatever runs/includes it -- confirm.
fs = pd.read_csv(data_file)
cls = fs[class_col]
fs_data = fs.drop(labels=class_col, axis=1)
# the combined frame is no longer needed once it has been split
del fs

In [None]:
# Expand the categorical features into dummies; every column uses its own
# name as the dummy prefix.
db = dummy_builder(prefixes=dict(zip(cat_cols, cat_cols)))
data = db.make_dummies(fs_data, cat_cols)
# kept so that single rows can later be aligned to the training column layout
dummied_data_cols = data.columns

# Fit the decision tree on the dummied features.
clf = tree.DecisionTreeClassifier(**run_params)
clf.fit(data, cls)

In [None]:
# Importance of every (dummied) feature in the fitted tree, largest first.
# "undummied" maps each dummy column back to its source column by matching the
# "<column>_" prefix of a known categorical column. Splitting on the first "_"
# would also truncate any non-categorical feature whose own name contains an
# underscore and merge it with unrelated features in the grouped totals.
importances = pd.DataFrame({"dummied": data.columns.values,
                            "undummied": [next((c for c in cat_cols if f.startswith(c + '_')), f)
                                          for f in data.columns.values],
                            "importance": clf.feature_importances_}).sort_values('importance', ascending=False)

# Total importance per original column, restricted to features the tree used.
undummied_nz_importances = (importances[importances.importance > 0][['undummied', 'importance']]
                            .groupby('undummied').sum()
                            .sort_values('importance', ascending=False))

In [None]:
# Personas: named example students, described in terms of the undummied columns.

# "me" is an arbitrary row taken from the loaded feature table.
sample_no = 11
row = fs_data.iloc[sample_no]

personas = {"me": dict(row)}

personas["wild-child"] = {
    'Dalc': 5,
    'Fedu': 1,
    'Fjob': 'other',
    'Grade1': 9,
    'Medu': 2,
    'Mjob': 'services',
    'Pstatus': 'A',
    'Subject': 'Maths',
    'Walc': 5,
    'absences': 20,
    'activities': 'no',
    'address': 'U',
    'age': 17,
    'failures': 2,
    'famrel': 1,
    'famsize': 'GT3',
    'famsup': 'yes',
    'freetime': 2,
    'goout': 5,
    'guardian': 'father',
    'health': 3,
    'higher': 'no',
    'internet': 'yes',
    'nursery': 'yes',
    'paid': 'no',
    'reason': 'home',
    'romantic': 'yes',
    'school': 'GP',
    'schoolsup': 'no',
    'sex': 'F',
    'studytime': 1,
    'traveltime': 1,
}

personas["middle-class"] = {
    'Dalc': 1,
    'Fedu': 1,
    'Fjob': 'teacher',
    'Grade1': 15,
    'Medu': 4,
    'Mjob': 'services',
    'Pstatus': 'T',
    'Subject': 'Maths',
    'Walc': 1,
    'absences': 0,
    'activities': 'yes',
    'address': 'U',
    'age': 17,
    'failures': 0,
    'famrel': 5,
    'famsize': 'LE3',
    'famsup': 'yes',
    'freetime': 2,
    'goout': 1,
    'guardian': 'mother',
    'health': 4,
    'higher': 'yes',
    'internet': 'yes',
    'nursery': 'yes',
    'paid': 'yes',
    'reason': 'reputation',
    'romantic': 'no',
    'school': 'GP',
    'schoolsup': 'no',
    'sex': 'F',
    'studytime': 4,
    'traveltime': 3,
}