In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the CSV and discard the 'Channel' and 'Region' columns in one chained
# expression, so re-running this cell always rebuilds the same frame.
customer_df = (
    pd.read_csv('Wholesale_customers_data.csv')
    .drop(columns=['Channel', 'Region'])
)

In [4]:
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape has to be printed explicitly for it to show up.
print(customer_df.shape)
print(customer_df)

     Fresh   Milk  Grocery  Frozen  Detergents_Paper  Delicatessen
0    12669   9656     7561     214              2674          1338
1     7057   9810     9568    1762              3293          1776
2     6353   8808     7684    2405              3516          7844
3    13265   1196     4221    6404               507          1788
4    22615   5410     7198    3915              1777          5185
5     9413   8259     5126     666              1795          1451
6    12126   3199     6975     480              3140           545
7     7579   4956     9426    1669              3321          2566
8     5963   3648     6192     425              1716           750
9     6006  11093    18881    1159              7425          2098
10    3366   5403    12974    4400              5977          1744
11   13146   1124     4523    1420               549           497
12   31714  12319    11757     287              3881          2931
13   21217   6208    14982    3095              6707          

# Correlation and Redundancy

I claim that there is correlation and redundancy in the `customer_df` table. What I mean by this is that some features can be predicted, at least approximately, from the other features (for example, because they are close to linear combinations of them).

Let's examine redundancy by dropping a feature and seeing if the other features can predict it.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [11]:
def calculate_r_2_for_feature(data, feature, test_size=0.25, random_state=None):
    """Score how well one column can be predicted from the remaining columns.

    ``feature`` is removed from ``data`` and used as the regression target.
    A ``KNeighborsRegressor`` with default settings is fitted on a random
    training split of the remaining columns and scored on the held-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the predictors and the target column.
    feature : str
        Name of the column to predict from all the other columns.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.25, the value that
        used to be hard-coded here.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour (a different split on every call); pass an int
        to make a single score reproducible.

    Returns
    -------
    float
        Whatever ``regressor.score`` returns on the held-out split (the
        coefficient of determination R^2 for scikit-learn regressors).
    """
    predictors = data.drop(columns=feature)
    target = data[feature]

    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    regressor = KNeighborsRegressor()
    regressor.fit(X_train, y_train)

    return regressor.score(X_test, y_test)

In [12]:
# Single trial: score for predicting 'Detergents_Paper' from the other columns.
calculate_r_2_for_feature(data=customer_df, feature='Detergents_Paper')

0.86506646635978857

In [13]:
# One trial per column. Each label is derived from the column name, so it
# cannot drift from (or be misspelled relative to) the feature being scored.
for feature_name in ('Delicatessen', 'Detergents_Paper', 'Fresh',
                     'Frozen', 'Grocery', 'Milk'):
    print("{:24} {}".format(
        feature_name + ": ",
        calculate_r_2_for_feature(customer_df, feature_name),
    ))

Delicatessen:            0.15586554770022232
Degergents_paper:        0.6399811022083208
Fresh:                   0.25319390497631455
Frozen:                  0.040778056045101874
Grocery:                 0.7768672121473145
Milk:                    0.6297862446302848


But these scores are subject to randomness: `train_test_split` shuffles the rows differently on every call, so each run gives a different score. Let's repeat the whole procedure many times and take the average.

In [9]:
def mean_r2_for_feature(data, feature, n_trials=100):
    """Average the score of ``calculate_r_2_for_feature`` over repeated trials.

    Each call to ``calculate_r_2_for_feature`` yields one score; repeating it
    and averaging smooths out the run-to-run variation between trials.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed through unchanged to ``calculate_r_2_for_feature``.
    feature : str
        Passed through unchanged to ``calculate_r_2_for_feature``.
    n_trials : int, optional
        Number of independent trials to average. Defaults to 100, the value
        that used to be hard-coded here.

    Returns
    -------
    float
        Arithmetic mean of the ``n_trials`` scores.

    Raises
    ------
    ValueError
        If ``n_trials`` is smaller than 1 (the mean of zero scores would
        silently be NaN).
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    scores = [calculate_r_2_for_feature(data, feature) for _ in range(n_trials)]
    return np.array(scores).mean()

In [10]:
# Averaged score for every column, printed in the same order and with the
# same "<name>: " labels as before.
for column in ['Delicatessen', 'Detergents_Paper', 'Fresh',
               'Frozen', 'Grocery', 'Milk']:
    averaged = mean_r2_for_feature(customer_df, column)
    print("{:24} {}".format(column + ": ", averaged))

Delicatessen:            -0.01606296011321544
Detergents_Paper:        0.7666783320492304
Fresh:                   0.026939800816412653
Frozen:                  0.06289045092703353
Grocery:                 0.7309199813434732
Milk:                    0.45167851780219714


In [None]:
# Second pass over the averaged scores: same columns, same order, same layout.
row_template = "{:24} {}"
feature_names = ('Delicatessen', 'Detergents_Paper', 'Fresh',
                 'Frozen', 'Grocery', 'Milk')
for name in feature_names:
    print(row_template.format(name + ": ", mean_r2_for_feature(customer_df, name)))

#### Discussion

What does this tell us?

## Visualize Redundancy

Study the correlation of the data.

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted while the plot is being drawn.
start = time.perf_counter()
sns.pairplot(customer_df, kind='reg')
print(time.perf_counter() - start)

In [None]:
# Pairwise correlations between the columns of customer_df.
corr = customer_df.corr()

# Flag every cell on or above the main diagonal; seaborn leaves masked cells
# blank, so each pair of columns is shown once and the diagonal is hidden.
mask = np.zeros_like(corr)
upper_triangle = np.triu_indices_from(mask, 0)
mask[upper_triangle] = True

with sns.axes_style("white"):
    ax = sns.heatmap(
        corr,
        mask=mask,
        square=True,
        annot=True,
        cmap='RdBu',
        fmt='+.3f',
    )
    plt.xticks(rotation=45, ha='center')