In [1]:
import pandas as pd
from sklearn import feature_selection as skfs
from sklearn import preprocessing as skpp
from sklearn import model_selection as ms
import numpy as np
import datetime as dt


# Level 3

In [85]:
def dummify(dataframe, *not_to_dummy, dummy_na=True):
    """Return a copy of ``dataframe`` with its categorical columns one-hot encoded.

    Columns named in ``not_to_dummy`` are set aside before encoding and appended
    unchanged to the right of the encoded columns, so their position in the
    result differs from the input. The input frame is never modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose categorical columns should be replaced by dummy columns.
    *not_to_dummy : str
        Names of columns that must be passed through without encoding.
    dummy_na : bool, default True
        Forwarded unchanged to ``pandas.get_dummies`` (adds a column that
        indicates missing values).

    Returns
    -------
    pandas.DataFrame
        The encoded columns followed by the untouched ``not_to_dummy`` columns.

    Raises
    ------
    KeyError
        If a name in ``not_to_dummy`` is not a column of ``dataframe``.
    """
    # *args arrive as a tuple, and pandas treats a tuple passed to drop() as ONE
    # label rather than a collection of labels, so it must be turned into a list.
    passthrough_names = list(not_to_dummy)

    print("""DUMMIFYING: Creating dummy columns from categorical variables. This is necessary as sci-kit learn models always treat feature
    columns as continuous variables. Therefore, we change categorical variables to binary pseudo-continuous variables
    ({0})""".format(dt.datetime.now()))

    print("""Dummifying (Step1): We make a deep copy of the dataframe. If we did not do this, Python would change our 
    original dataframe, but we don't want this""")
    dummy_tmp1 = dataframe.copy(deep=True)

    print("""Dummifying (Step2): dummy_tmp2: We drop columns from the table that we do not want to be dummified""")
    dummy_tmp2 = dummy_tmp1.drop(columns=passthrough_names, errors='ignore')

    print("""Dummifying (Step3): dummy_tmp3: Transforming all categorical columns of dummy_tmp2 to dummy-variables""")
    dummy_tmp3 = pd.get_dummies(dummy_tmp2, dummy_na=dummy_na)

    print("""Dummifying (Step4): Concatenating (aka. joining)""")
    print("""Dummifying (Step4a): dataframe2: Make a deep copy of the dummy_tmp3 table""")
    dataframe2 = dummy_tmp3.copy(deep=True)
    if passthrough_names:
        # One concat for all pass-through columns instead of one per column;
        # axis must be given by keyword (positional use fails on pandas >= 2.0).
        dataframe2 = pd.concat([dataframe2, dataframe[passthrough_names]], axis=1)

    # Print which columns were transformed: an encoded column disappears from the
    # frame (it is replaced by its dummy columns), so those are the names that
    # were present before get_dummies but are absent afterwards.
    was_encoded = ~dummy_tmp2.columns.isin(dummy_tmp3.columns)
    print('Dummified: {}'.format(dummy_tmp2.columns[was_encoded]))

    return dataframe2

In [86]:
def variance_threshold_select(dataframe, thresh=0.0, na_replacement=-999):
    """Return ``dataframe`` restricted to the columns kept by a variance filter.

    A scikit-learn ``VarianceThreshold`` selector is fitted on the data and the
    columns it does not support are removed. With the default ``thresh=0.0``
    this removes columns that hold a single constant value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to filter. It is not modified.
    thresh : float, default 0.0
        Threshold handed to ``VarianceThreshold``.
    na_replacement : scalar, default -999
        Value substituted for missing entries while fitting only. Note that the
        replacement value takes part in the variance computation, so a column
        with missing values is judged on its filled-in contents.

    Returns
    -------
    pandas.DataFrame
        The selected columns of the original frame; missing values are still
        missing in the result because only the fitted copy was filled.
    """
    print('Transforming: Deleting low variance columns ({0})'.format(dt.datetime.now()))

    # The class lives in sklearn.feature_selection, imported in this file as skfs.
    selector = skfs.VarianceThreshold(threshold=thresh)
    # fillna returns a new frame, so the caller's data is untouched; the fill is
    # needed because the selector is fitted on data without missing values.
    selector.fit(dataframe.fillna(na_replacement))

    # Boolean mask with one entry per column: True for columns to keep.
    keep_mask = selector.get_support(indices=False)
    return dataframe.loc[:, keep_mask]

# Level 2

In [87]:
def read_csv_file(path='Data/dataset.csv', y_value=''):
    """Load a CSV file and optionally split off a target column.

    Parameters
    ----------
    path : str, default 'Data/dataset.csv'
        Location of the CSV file to read.
    y_value : str, default ''
        Name of the target column. When empty, the whole table is returned.

    Returns
    -------
    pandas.DataFrame or tuple
        The full table when ``y_value`` is empty; otherwise ``(x, y)`` where
        ``y`` is the ``y_value`` column and ``x`` is the table without it.
        Unpack with ``x, y = read_csv_file(y_value='your_value')``.

    Raises
    ------
    KeyError
        If ``y_value`` is given but is not a column of the file.
    """
    ##########
    # PRINTING information, before function begins
    print('Your file path is: {}. If you get a file not found error, try to change your file path. Dont forget the .csv at the end of the file'.format(path))
    if y_value:
        print('You have specified a y-column: {}. You get: x-table, y-variable'.format(y_value))
    else:
        print('You have not specified a y-column. You get: table or variable')

    ######
    # MAIN
    # More information on Data Structures: http://pandas.pydata.org/pandas-docs/stable/dsintro.html
    # Read from the caller-supplied path (not a hard-coded location), so the
    # message printed above and the file actually loaded always agree.
    dataframe1 = pd.read_csv(path)

    if not y_value:
        return dataframe1  # No target requested: hand back the whole table

    y = dataframe1[y_value]  # The target column
    x = dataframe1.drop(y_value, axis=1)  # Everything except the target column
    return x, y

In [88]:
def preprocess_x(dataframe, threshold=0.0):
    """Prepare a feature table: encode it with ``dummify``, then filter columns by variance.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw feature table. A deep copy is processed, so the caller's frame is
        left as it was.
    threshold : float, default 0.0
        Passed to ``variance_threshold_select`` as ``thresh``.

    Returns
    -------
    pandas.DataFrame
        The result of ``variance_threshold_select`` applied to the dummified copy.
    """
    working_copy = dataframe.copy(deep=True)
    # Dummies first (see http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html),
    # then the variance filter runs on the encoded columns.
    encoded = dummify(working_copy)
    return variance_threshold_select(encoded, thresh=threshold)

In [91]:
def preprocess_y(column):
    """Encode the target labels in ``column`` with a freshly fitted ``LabelEncoder``.

    The encoder is created, fitted and applied in one step and is not returned,
    so the mapping back to the original labels is not available to the caller.

    Returns
    -------
    The value returned by ``LabelEncoder.fit_transform(column)``.
    """
    encoder = skpp.LabelEncoder()
    return encoder.fit_transform(column)

# Level 1

In [112]:
# Standardized Functions
# Load the table and separate the 'Survived' column as the target.
x, y = read_csv_file(y_value='Survived')
x = preprocess_x(x)
y = preprocess_y(y)
# A fixed random_state makes the shuffle deterministic, so re-running the
# notebook from a fresh kernel yields the same train/test partition.
x_train, x_test, y_train, y_test = ms.train_test_split(x, y, random_state=42)


Your file path is: Data/dataset.csv. If you get a file not found error, try to change your file path. Dont forget the .csv at the end of the file
You have specified a y-column: Survived. You get: x-table, y-variable
Dummifying (2017-05-03 13:01:50.016781)
Dummified: Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')
Transforming: Deleting low variance columns (2017-05-03 13:01:50.045801)
