In [None]:
import matplotlib.pyplot as plt
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import seaborn as sns

# magic word for producing visualizations in notebook
from webencodings import labels


# load in the data
azdias = pd.read_csv('./data/Udacity_AZDIAS_052018.csv')[:10000]
customers = pd.read_csv('./data/Udacity_CUSTOMERS_052018.csv')[:10000]


In [None]:
azdias.head()

In [None]:
customers.head()


In [None]:
from abc import ABC, abstractmethod
from pandas.core.arrays.categorical import factorize_from_iterable
import pickle


class OpDataFrame(ABC):
    def __init__(self, op_name, ignore_if_failure=False):
        self.op_name = op_name
        self.ignore_if_failure = ignore_if_failure

    @abstractmethod
    def op(self, _df: pd.DataFrame):
        return _df

    def get_info(self):
        if hasattr(self, "field"):
            return f"{self.op_name}: {self.field}"
        return self.op_name

    # def __eq__(self, obj):
    #     return isinstance(obj, OpDataFrame) and (obj.op_name == self.op_name) and hasattr(obj, "field") and hasattr(self, "field") and (obj.field == self.field)


class OpConvertDate(OpDataFrame):
    """
    Get datetime values from the date fields from data frame
    """

    def __init__(self, field):
        super(OpConvertDate, self).__init__(op_name="OpConvertDate")
        self.field = field

    def op(self, _df: pd.DataFrame):
        _df[self.field] = pd.to_datetime(_df[self.field])
        return _df


class OpConvertBoolean(OpDataFrame):
    """
    Convert columns `field` in dataframe `_df` into boolean value.
    """

    def __init__(self, field, t_value='t'):
        super(OpConvertBoolean, self).__init__(op_name="OpConvertBoolean")
        self.field = field
        self.t_value = t_value

    def op(self, _df: pd.DataFrame):
        _df[self.field] = _df[self.field].astype(str) == self.t_value
        return _df


class OpConvertMoneyToFloat(OpDataFrame):
    """
    Convert money with sign $ to float value
    """

    def __init__(self, field):
        super(OpConvertMoneyToFloat, self).__init__(op_name="OpConvertMoneyToFloat")
        self.field = field
        # self.money_sign = money_sign

    def op(self, _df: pd.DataFrame):
        _df[self.field] = _df[self.field].replace('[\$,]', '', regex=True).astype(float)
        return _df


class OpConvertToFloat(OpDataFrame):
    """
    Convert value to float
    """

    def __init__(self, field):
        super(OpConvertToFloat, self).__init__(op_name="OpConvertToFloat")
        self.field = field

    def op(self, _df: pd.DataFrame):
        _df[self.field] = _df[self.field].astype(float)
        return _df


class OpConvertPercentToFloat(OpDataFrame):
    """
    Convert percent values to float (Ex 80% = 0.8)
    """

    def __init__(self, field):
        super(OpConvertPercentToFloat, self).__init__(op_name="OpConvertPercentToFloat")
        self.field = field

    def op(self, _df: pd.DataFrame):
        _df[self.field] = _df[self.field].str.rstrip('%').astype('float') / 100.0
        return _df


class OpConvertToString(OpDataFrame):
    def __init__(self, field):
        super(OpConvertToString, self).__init__(op_name="OpConvertToString")
        self.field = field

    def op(self, _df: pd.DataFrame):
        _df[self.field] = _df[self.field].astype('string')
        return _df


class OpDropColumns(OpDataFrame):
    def __init__(self, fields):
        super(OpDropColumns, self).__init__(op_name="OpDropColumns")
        self.fields = fields

    def op(self, _df: pd.DataFrame):
        _column_deleting = list(set(self.fields) & set(_df.columns))
        _df = _df.drop(labels=_column_deleting, axis=1)

        return _df


class CanSave(object):
    def __init__(self, save_path):
        self.save_path = save_path

    def save(self, data):
        if self.save_path:
            with open(self.save_path, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)


class CanLoad(object):
    def __init__(self, load_path):
        self.load_path = load_path

    def load(self):
        if self.load_path:
            with open(self.load_path, 'rb') as f:
                return pickle.load(f)


class CategoryDummy(object):
    def __init__(self):
        pass

    def _get_index(self, labels, code_name):
        def _get_nan_index():
            for i, v in enumerate(labels):
                try:
                    if isinstance(v,(float, int)) and np.isnan(v):
                        return i
                except Exception as e:
                    print(e)

        if isinstance(code_name, (float, int)):
            if np.isnan(code_name):
                return _get_nan_index()

        for i, v in enumerate(labels):
            if v == code_name:
                return i

        return _get_nan_index()

    def codes_labels(self, series: pd.Series):
        codes, labels = factorize_from_iterable(series)

        codes = codes.copy()
        # Dummy na is default
        codes[codes == -1] = len(labels)
        levels = np.append(labels, np.nan)

        return codes, levels

    def _dummy(self, codes, labels, prefix="_"):
        dummy_cols = [f"{prefix}{label}" for label in labels]

        dummy_mat = np.eye(len(labels), dtype=np.uint8).take(codes, axis=0)
        dummy_mat[codes == -1] = 0

        # drop_first:
        dummy_mat = dummy_mat[:, 1:]
        dummy_cols = dummy_cols[1:]
        return pd.DataFrame(dummy_mat, columns=dummy_cols)

    def dummy_by_specified_labels(self, labels, code_labels, prefix="_"):
        codes = [self._get_index(labels, name) for name in code_labels]
        return self._dummy(codes, labels, prefix)

    def dummy_by_all_codes(self, codes, labels, prefix="_"):
        return self._dummy(codes, labels, prefix)


class OpSaveCategoryColumns(OpDataFrame, CanSave):
    def __init__(self, cat_cols, save_path, list_df_train = None):
        OpDataFrame.__init__(self, op_name="OpSaveCategoryColumns")
        CanSave.__init__(self, save_path=save_path)
        self.cat_cols = cat_cols
        self.list_df_train = list_df_train

    def op(self, _df: pd.DataFrame):
        dummier = CategoryDummy()
        d = {}

        if self.list_df_train is None:
            self.list_df_train = [_df]

        _df_train = pd.concat(self.list_df_train, axis=0)

        for cat_col in self.cat_cols:
            d[cat_col] = dummier.codes_labels(_df_train[cat_col])

        self.save(d)

        return _df


class OpLoadCategoryColumns(OpDataFrame, CanLoad):
    def __init__(self, cat_cols, load_path, drop_orignal_col = True):
        OpDataFrame.__init__(self, op_name="OpLoadCategoryColumns")
        CanLoad.__init__(self, load_path=load_path)

        self.cat_cols = cat_cols
        self.drop_orignal_col = drop_orignal_col

    def op(self, _df: pd.DataFrame):
        dummier = CategoryDummy()
        d = self.load()

        for cat_col in self.cat_cols:
            _, labels = d[cat_col]

            _df_col_cat = dummier.dummy_by_specified_labels(labels, _df[cat_col].values, prefix=f"{cat_col}_")
            _df = pd.concat([_df.reset_index(drop=True), _df_col_cat.reset_index(drop=True)], axis=1)
        if self.drop_orignal_col:
            _df = _df.drop(labels=self.cat_cols, axis=1)

        return _df


class OpGroupbyExpander(OpDataFrame):
    def __init__(self, group_fields, funcs=['mean', 'max', 'min']):
        super(OpGroupbyExpander, self).__init__(op_name="OpGroupbyExpander")
        self.group_fields = group_fields
        # self.compute_field = compute_field
        self.funcs = funcs

    def op(self, _df: pd.DataFrame):
        compute_fields = _df.select_dtypes(include=['float', 'int']).columns

        for compute_field in compute_fields:
            _g_df = _df.groupby(by=self.group_fields).agg(
                {compute_field: self.funcs}
            )

            _df = pd.merge(_df, _g_df, on=self.group_fields)

        return _df


class OpFeatureComposedExpander(OpDataFrame):
    def __init__(self):
        super(OpFeatureComposedExpander, self).__init__(op_name="OpFeatureComposedExpander")
        # self.num_cols = num_cols

    def op(self, _df: pd.DataFrame):

        num_cols = _df.select_dtypes(include=['float', 'int', 'int64'])

        for col_1 in num_cols:
            for col_2 in num_cols:
                _df[f"{col_1}_and_{col_2}"] = _df[col_1] * _df[col_2]

        return _df


class OpPipeLine(OpDataFrame):
    """
    Create pipeline of operation on the dataframe
    """

    def __init__(self, copy=True):
        super(OpPipeLine, self).__init__(op_name="OpPipeLine")
        self.copy = copy
        self.ops = []

    def add_op(self, op):
        self.ops.append(op)

    def remove_op(self, removed_op_name):

        self.ops = [x for x in self.ops if x.op_name != removed_op_name]

    def replace_op(self, new_node_op):
        for i, node_op in enumerate(self.ops):
            if node_op.op_name == new_node_op.op_name and node_op.field == new_node_op.field:
                self.ops[i] = new_node_op

    def add_ops(self, ops):
        self.ops.extend(ops)

    def op(self, _df: pd.DataFrame):
        if self.copy:
            _df = _df.copy()

        for node_op in self.ops:
            try:
                _df = node_op.op(_df)
            except Exception as e:
                if not node_op.ignore_if_failure:
                    print(e)
                    raise Exception(f"Error when doing op: {node_op.get_info()}. Detail error: {e}")
                continue
        return _df

    def get_info(self):
        return " >> ".join([op.get_info() for op in self.ops])


class OpMergeDataFrame(OpDataFrame):
    """
    Merge 2 dataframes
    """

    def __init__(self, _df_main, left_on, right_on, how='inner'):
        super(OpMergeDataFrame, self).__init__(op_name="OpMergeDataFrame")
        self._df_main = _df_main
        self.how = how
        self.left_on = left_on
        self.right_on = right_on

    def op(self, _df: pd.DataFrame):
        return pd.merge(self._df_main, _df, how=self.how, left_on=self.left_on, right_on=self.right_on)


class OpSelectNumericColumnOnly(OpDataFrame):
    def __init__(self):
        super(OpSelectNumericColumnOnly, self).__init__(op_name="OpSelectNumericColumnOnly")

    def op(self, _df: pd.DataFrame):
        choose_cols = ['float', 'int', 'int64', 'bool', 'uint8']
        return _df.select_dtypes(include=choose_cols)


class OpAppliedFieldFunction(OpDataFrame):
    def __init__(self, field, func, op_name, new_field=None):
        OpDataFrame.__init__(self, op_name=op_name)
        self.func = func
        self.field = field
        self.new_field = new_field

    def op(self, _df: pd.DataFrame):
        if self.new_field is None:
            _df[self.field] = _df[self.field].apply(self.func)
        else:
            _df[self.new_field] = _df[self.field].apply(self.func)
        return _df

class OpFillNaWithValue(OpDataFrame):
    def __init__(self, field, default_value):
        OpDataFrame.__init__(self, "OpFillNAWithValue")
        self.field = field
        self.default_value = default_value

    def op(self, _df: pd.DataFrame):
        _df[self.field] = _df[self.field].fillna(self.default_value)

        return _df

class OpFillNaWithMeanValue(OpDataFrame):
    def __init__(self, field):
        OpDataFrame.__init__(self, "OpFillNaWithMeanValue")
        self.field = field


    def op(self, _df: pd.DataFrame):
        _df[self.field] = _df[self.field].fillna(_df[self.field].mean())

        return _df


from datetime import datetime, date


def convert_date_to_month_value(date_string):
    if isinstance(date_string, (float, int)) and np.isnan(date_string):
        return np.nan

    date_value = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S").date()

    return (date(2018, 1, 1) - date_value).days


def convert_year_to_value(value, current_year=2018):
    if np.isnan(value) or value <= 0:
        return np.nan

    if value > current_year:
        return 0

    return current_year - value




convert_pipeline = OpPipeLine()

# convert_pipeline.add_op(OpDropColumns(fields=['Unnamed: 0', 'LNR', 'CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP']))

convert_pipeline.add_op(OpDropColumns(fields=['Unnamed: 0', 'LNR']))

convert_pipeline.add_op(OpAppliedFieldFunction('EINGEFUEGT_AM', convert_date_to_month_value, 'convert_EINGEFUEGT_AM'))
convert_pipeline.add_op(OpAppliedFieldFunction('EINGEZOGENAM_HH_JAHR', lambda c: convert_year_to_value(c, 2018),
                                               'convert_EINGEZOGENAM_HH_JAHR'))
convert_pipeline.add_op(
    OpAppliedFieldFunction('GEBURTSJAHR', lambda c: convert_year_to_value(c, 2018), 'convert_GEBURTSJAHR'))
convert_pipeline.add_op(
    OpAppliedFieldFunction('AGER_TYP', lambda c: np.nan if c == -1 or c == 0 else c, 'convert_AGER_TYP'))

# cat_cols = consider_category_columns(azdias, threshold=0.05)
cat_cols = set(azdias.columns) - {'Unnamed: 0', 'LNR', 'EINGEFUEGT_AM', 'EINGEZOGENAM_HH_JAHR', 'GEBURTSJAHR', 'KBA13_ANZAHL_PKW'}

# cat_cols = list(set(cat_cols) - {'EINGEFUEGT_AM', 'EINGEZOGENAM_HH_JAHR', 'GEBURTSJAHR'})

cat_path = './cat_present.txt'

convert_pipeline.add_op(OpSaveCategoryColumns(cat_cols=cat_cols, save_path=cat_path, list_df_train=[customers, azdias]))
convert_pipeline.add_op(OpLoadCategoryColumns(cat_cols=cat_cols, load_path=cat_path))

# convert_pipeline.add_op(OpAppliedFieldFunction('GEBURTSJAHR', bin_year, 'convert_BIN_YEAR', 'BIN_YEAR'))


convert_pipeline.add_op(OpFillNaWithMeanValue('EINGEFUEGT_AM'))
convert_pipeline.add_op(OpFillNaWithMeanValue('GEBURTSJAHR'))
convert_pipeline.add_op(OpFillNaWithMeanValue('KBA13_ANZAHL_PKW'))
convert_pipeline.add_op(OpFillNaWithMeanValue('EINGEZOGENAM_HH_JAHR'))


formatted_customers = convert_pipeline.op(customers)
formatted_azdias = convert_pipeline.op(azdias)

# print(formatted_customers.head())
#
# print(formatted_azdias.head())


print(f"Shapes Azdias: {formatted_azdias.shape}, Customer: {formatted_customers.shape}")


In [None]:
from sklearn.decomposition import PCA


def pca_decomposition(_df_train):
    pca = PCA()
    pca = pca.fit(_df_train)
    return pca, pca.transform(_df_train)


pca, Xt = pca_decomposition(formatted_azdias)

Xt_cus = pca.transform(formatted_customers.drop(labels=['CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP'], axis=1))


In [None]:
def visualize_2D():
    plot = plt.scatter(Xt[:,0], Xt[:,1], c='yellow')
    plt.scatter(Xt_cus[:,0], Xt_cus[:,1], c='green')
    plt.legend(handles=plot.legend_elements()[0], labels=['General', 'Customer'])
    plt.savefig(f'./images/visualize_2D.png', dpi=100, bbox_inches='tight')
    plt.show()

visualize_2D()


In [None]:

def visualize_3D():
    ax = plt.axes(projection='3d')
    ax.scatter3D(Xt[:,0], Xt[:,1], Xt[:,2], color='yellow', alpha=0.3)
    ax.scatter3D(Xt_cus[:,0], Xt_cus[:,1], Xt_cus[:,2], color='green', alpha=0.3)
    plt.savefig(f'./images/visualize_3D.png', dpi=100, bbox_inches='tight')
    plt.show()

visualize_3D()

# plot = plt.scatter(Xt[:,0], Xt[:,1], c='blue')
# plt.scatter(Xt_cus[:,0], Xt_cus[:,1], c='yellow')
# plt.legend(handles=plot.legend_elements()[0], labels=['General', 'Customer'])
# plt.show()


In [None]:
# formatted_customers['CUS'] = formatted_customers['CUSTOMER_GROUP'].apply(lambda x: 1 if x == "MULTI_BUYER" else 2)
#
# formatted_customers['CUS'].value_counts()
#
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
#
# # feature extraction
# model = LogisticRegression(solver='lbfgs')
# rfe = RFE(model, n_features_to_select=30)
# X = formatted_customers.drop(labels=['CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP'], axis=1).values
# y = formatted_customers['CUS'].to_numpy()
# fit = rfe.fit(X, y)
# print("Num Features: %d" % fit.n_features_)
# print("Selected Features: %s" % fit.support_)
# print("Feature Ranking: %s" % fit.ranking_)


#





In [None]:
def visualize_dist_side_by_side(_df_general, _df_customers, col):
    figure, (ax1, ax2) = plt.subplots(1, 2)
    # figure_name = plt.figure(figsize=(15, 8))

    figure.set_figheight(5)
    figure.set_figwidth(10)

    ax1.set_title(f"General: {col}")
    sns.distplot(_df_general[col], ax=ax1)

    ax2.set_title(f"Customer: {col}")
    sns.distplot(_df_customers[col], ax=ax2)

    plt.savefig(f'./images/visualize_dist_side_by_side_{col}.png', dpi=100, bbox_inches='tight')


visualize_dist_side_by_side(formatted_azdias, formatted_customers, 'GEBURTSJAHR')



In [None]:

def visualize_bar_size_by_side(_df_general, _df_customers, col):
    figure, (ax1, ax2) = plt.subplots(1, 2)

    figure.set_figheight(5)
    figure.set_figwidth(10)

    ax1.set_title(f"General: {col}")
    _df_general[col].value_counts().sort_index().plot(kind='bar', ax=ax1, color='steelblue')
    # sns.barplot(_df_general.groupby(col)['LNR'].count(), ax=ax1)
    #
    ax2.set_title(f"Customer: {col}")
    _df_customers[col].value_counts().sort_index().plot(kind='bar', ax=ax2, color='steelblue')

    plt.savefig(f'./images/visualize_bar_size_by_side_{col}.png', dpi=100, bbox_inches='tight')


visualize_bar_size_by_side(azdias, customers, 'AKT_DAT_KL')


In [None]:
def visualize_coff(_df):
    f, ax = plt.subplots(figsize=(12, 8))
    corr = _df.corr()
    hm = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="coolwarm", fmt='.2f',
                     linewidths=.05)
    f.subplots_adjust(top=0.93)
    t = f.suptitle('Attributes Correlation Heatmap', fontsize=14)


visualize_coff(formatted_azdias)



In [None]:
# K-means

from sklearn.cluster import KMeans

model = KMeans(n_clusters=10)
Xt_kmeans = model.fit(Xt)


In [None]:
def visualize_categories(_Xt):
    fig, ax = plt.subplots(figsize=(8, 6))
    plt.scatter(_Xt[:,0], _Xt[:,1],  marker = 'o',
                c=Xt_kmeans.labels_,
                s=80, alpha=0.5)
    # plt.scatter(centroids.iloc[:,0], centroids.iloc[:,1],
    #             marker = 's', s=200, c=[0, 1, 2],
    #             cmap = customcmap)
    ax.set_xlabel(r'x', fontsize=14)
    ax.set_ylabel(r'y', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.savefig(f'./images/visualize_categories.png', dpi=100, bbox_inches='tight')
    plt.show()


visualize_categories(Xt)

In [None]:
# Predict the categories into customers dataset

x_cus_predict = model.predict(Xt_cus)

unique, counts = np.unique(x_cus_predict, return_counts=True)

print(np.asarray((unique, counts)).T)

In [None]:

def get_potential_cus(_Xt, categories, _df):
    x_potential = model.predict(_Xt)
    indies = np.argwhere(np.isin(x_potential, categories)).ravel()
    return _df.iloc[indies]

get_potential_cus(Xt, [0, 3, 9], azdias)

get_potential_cus(Xt, [3], azdias)


In [None]:
mailout_train = pd.read_csv('./data/mailout_train.csv')

mailout_train.head()

formated_mailout_train = convert_pipeline.op(mailout_train)

In [None]:
formated_mailout_train['RESPONSE'].value_counts()

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

def xy_split(_df_train, drop_cols, label_cols):
    X = _df_train.drop(labels=drop_cols, axis=1).values
    y = _df_train[label_cols].values

    if len(y.shape) == 2 and y.shape[1] == 1:
        y = np.ravel(y)

    return X, y

def split_data(X, y, test_size=0.2):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    return X_train, X_test, y_train, y_test

def random_over_sampling(X, y):
    sampler = RandomOverSampler()
    return sampler.fit_resample(X, y)


X, y = xy_split(formated_mailout_train, ['RESPONSE'], ['RESPONSE'])
X, y = random_over_sampling(X, y)
X_train, X_test, y_train, y_test = split_data(X, y)




In [53]:
# RANDOM FOREST CLASSIFICATION
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

clf = RandomForestClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test,y_pred)

print(report)



              precision    recall  f1-score   support

           0       0.92      0.98      0.95      8461
           1       0.98      0.92      0.95      8511

    accuracy                           0.95     16972
   macro avg       0.95      0.95      0.95     16972
weighted avg       0.95      0.95      0.95     16972



In [54]:
# PREDICT TEST DATA

mailout_test = pd.read_csv('./data/mailout_test.csv')
formated_mailout_test = convert_pipeline.op(mailout_test)



  exec(code_obj, self.user_global_ns, self.user_ns)
  f"X has feature names, but {self.__class__.__name__} was fitted without"


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [59]:
y_test_pred = clf.predict(formated_mailout_test)

print(f"There are: {len(y_test_pred[y_test_pred == 1])} customers (which response is 1) in {mailout_test.shape[0]} records")


  f"X has feature names, but {self.__class__.__name__} was fitted without"


There are: 893 customers (which response is 1) in 42833 records


In [58]:
len(test1[test1 == 1])

41940