In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as snsy

# magic word for producing visualizations in notebook
%matplotlib inline

# Load the semicolon-separated demographics table.
# NOTE(review): the file name refers to a CUSTOMERS subset while the variable
# is called `azdias` (general population) -- confirm this is the intended input.
azdias = pd.read_csv("Udacity_CUSTOMERS_Subset.csv", sep=";")


# Load the semicolon-separated feature summary table.
feat_info = pd.read_csv("AZDIAS_Feature_Summary.csv", sep=";")

# Identify missing or unknown data values and convert them to NaNs.
def list_convert(line):
    """Split a bracketed, comma-separated string into a list of raw tokens.

    Every "[" and "]" character is removed and the remainder is split on
    commas. Tokens are returned as strings exactly as they appear (no
    whitespace stripping, no numeric conversion), so "[]" yields [""].
    """
    without_brackets = line.translate(str.maketrans("", "", "[]"))
    return without_brackets.split(",")
# Parse the raw "missing_or_unknown" strings into lists of code tokens.
feat_info["missing_or_unknown"] = feat_info["missing_or_unknown"].apply(list_convert)
# Set the attribute as index so it is easier to handle with the .loc method
feat_info.set_index("attribute", inplace = True)



# Replace every listed missing/unknown code in each azdias column with NaN.
for column in azdias.columns:
    # Code tokens that mark a missing value in this column
    null_values = feat_info.loc[str(column), "missing_or_unknown"]
    for entry in null_values:
        # Some columns do not have an indicator for null values
        if entry == "":
            print("Entry was an empty string: -{}- (should be empty)".format(entry))
            continue
        # Only the int() conversion is expected to fail (e.g. "X", "XX");
        # any other error from pandas is left to surface instead of being
        # swallowed by a bare except.
        try:
            code = int(entry)
        except ValueError:
            print("Entry {} is not a number.".format(entry))
            code = entry
        # Replace the code (numeric, or the raw string token) with NaN
        azdias[column] = azdias[column].replace(code, np.nan)

# Columns removed from the analysis entirely.
# NOTE(review): presumably chosen for their high share of missing values
# (going by the name `nan_columns`) -- confirm.
nan_columns = [
    "AGER_TYP",
    "GEBURTSJAHR",
    "TITEL_KZ",
    "ALTER_HH",
    "HH_EINKOMMEN_SCORE",
]
azdias.drop(columns=nan_columns, inplace=True)
print("Shape of df is: {}".format(azdias.shape))

# How much data is missing in each row of the dataset?
# Vectorised NaN count per row; equivalent to "number of columns minus the
# non-null count" but avoids a Python-level call for every row.
azdias["missing"] = azdias.isna().sum(axis=1)

# Keep only the rows that have no missing value at all.
# NOTE: the helper column "missing" (all zeros here) stays part of df_full.
df_full = azdias.query("missing == 0")
print("Shape is now: {}".format(df_full.shape))

# How many features are there of each data type?
cat_mixed_info = feat_info.query("type == ['categorical', 'mixed']")
cat_columns = list(feat_info.query("type == ['categorical']").index)
# Categorical columns that still need one-hot encoding
one_hot = []

for column in cat_columns:
    # Only the column lookup can raise KeyError, so keep the try body minimal
    try:
        values = df_full[column].unique()
    except KeyError:
        print("The column {} got removed in an earlier step".format(column))
        continue
    print("Column {} has {} unique values".format(column, len(values)))
    # A column holding exactly the values 0 and 1 is already encoded.
    # Compare as a set: unique() returns values in order of first appearance,
    # so a list comparison against [0, 1] would miss columns starting with 1.
    if set(values) != {0, 1}:
        one_hot.append(column)

# One-hot encode the categorical columns collected in `one_hot`, working on a
# copy so that df_full itself is left untouched.
df_oh = pd.get_dummies(df_full.copy(), columns=list(one_hot))

print("Shape of df is now {}".format(df_oh.shape))

# Engineer new variables
# PRAEGENDE_JUGENDJAHRE codes that are flagged as "mainstream"
mainstream = [1, 3, 5, 8, 10, 12, 14]

# 1 for mainstream codes, 0 otherwise. Stored as integers so the column has a
# single numeric dtype (previously the string "1" was mixed with integer 0).
df_oh["MAINSTREAM"] = df_oh["PRAEGENDE_JUGENDJAHRE"].isin(mainstream).astype("int64")

# PRAEGENDE_JUGENDJAHRE code -> generation value
gen_dict = {1: 40, 2: 40, 3: 50, 4: 50, 5: 60, 6: 60, 7: 60, 8: 70, 9: 70,
            10: 80, 11: 80, 12: 80, 13: 80, 14: 90, 15: 90}
# Codes without an entry in gen_dict keep the previous default of 30.
df_oh["GENERATION"] = (
    df_oh["PRAEGENDE_JUGENDJAHRE"].map(gen_dict).fillna(30).astype("int64")
)

# Split each CAMEO_INTL_2015 code into its first character (WEALTH) and the
# remaining characters (LIFE_STAGE).
cameo_intl = df_oh["CAMEO_INTL_2015"]
df_oh["WEALTH"] = cameo_intl.str[:1]
df_oh["LIFE_STAGE"] = cameo_intl.str[1:]

# LP_LEBENSPHASE_GROB is a subset of LP_LEBENSPHASE_FEIN and is therefore dropped
df_oh.drop(columns=["LP_LEBENSPHASE_GROB"], inplace=True)

# Mixed-type features to one-hot encode, in two passes.
mixed_oh_01 = ["GENERATION", "WEALTH", "LIFE_STAGE", "LP_LEBENSPHASE_FEIN"]
mixed_oh_02 = ["WOHNLAGE", "KBA05_BAUMAX", "PLZ8_BAUMAX"]

# First pass: engineered and fine-grained features.
df_oh = pd.get_dummies(df_oh, columns=mixed_oh_01)
# Add a constant all-zero "WOHNLAGE_0.0" column before WOHNLAGE is encoded.
# NOTE(review): presumably added so the feature set lines up with another
# dataset that contains this category -- confirm.
df_oh["WOHNLAGE_0.0"] = 0
# Second pass: building-related features.
df_oh = pd.get_dummies(df_oh, columns=mixed_oh_02)
print("The shape is now {}".format(df_oh.shape))

# The source columns of the engineered features are no longer needed.
drop_cols = ["PRAEGENDE_JUGENDJAHRE", "CAMEO_INTL_2015"]
df_oh.drop(columns=drop_cols, inplace=True)

print("The shape is now {}".format(df_oh.shape))

Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Entry X is not a number.
Entry XX is not a number.
Entry XX is not a number.
Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Entry was an empty string: -- (should be empty)
Shape of df is: (191652, 80)
