In [6]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import seaborn as sns
import datetime as dt

# import required libraries for clustering
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

## Step 1: Exploring and Understanding the DataSet

In [8]:
# Load the retail transactions CSV into a DataFrame.
# NOTE(review): the path below is an absolute location on one machine, so the
# notebook only runs where the file exists at exactly this path.
df = pd.read_csv('/Users/lluisarull/Desktop/DSDM/Machine_Learning/Assignment_5/retail_data - retail_data.csv')
# Bare expression as the last line of the cell so the notebook renders the frame.
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,09-12-2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,09-12-2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,09-12-2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,09-12-2011 12:50,4.15,12680.0,France


In [9]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


### Defining helper functions and important variables

In [11]:
def check_categoricals_and_binaries(df, max_dif_cat_considered):
  """Split the columns of ``df`` into binary and categorical candidates,
  based only on how many distinct values each column holds.

  No dtype check is performed: a column is classified purely by the length of
  ``Series.unique()``, which counts NaN as one of the distinct values.

  Inputs:
    df: the DataFrame to inspect.
    max_dif_cat_considered: exclusive upper bound on the number of distinct
      values for a column to be treated as categorical.

  Output:
    A tuple ``(binaries, categoricals)`` where ``binaries`` is the list of
    column names with exactly 2 distinct values and ``categoricals`` is a dict
    mapping column name -> number of distinct values for the columns having
    more than 2 and fewer than ``max_dif_cat_considered`` distinct values.
  """
  # Count the distinct values once per column, keeping the column order.
  distinct_counts = {name: len(df[name].unique()) for name in df.columns}

  binaries = [name for name, count in distinct_counts.items() if count == 2]
  categoricals = {
      name: count
      for name, count in distinct_counts.items()
      if 2 < count < max_dif_cat_considered
  }
  return binaries, categoricals


def check_outliers(df, vars_without_outliers):
  """Flag outliers in every column of ``df`` that is not listed in
  ``vars_without_outliers`` (typically the categorical and binary vars).

  A value is an outlier when it falls below Q1 - 1.5*IQR or above
  Q3 + 1.5*IQR, with Q1/Q3 the 25th/75th percentiles of the column.

  Inputs:
    df: DataFrame whose remaining columns must be numeric.
    vars_without_outliers: collection of column names to skip.

  Output:
    A dict keyed by column name, containing ONLY the columns that have at
    least one outlier. Each value is a dict with:
      "indexes":  boolean Series (same index as ``df``), True on outlier rows.
      "quantity": number of outlier rows in that column.
  """
  outliers = dict()
  for column in df.columns:
    if column in vars_without_outliers:
      continue

    # NOTE: np.quantile propagates NaN, so a column with missing values gets
    # NaN fences and ends up with no flagged rows.
    q25, q75 = np.quantile(df[column], [0.25, 0.75])

    IQR = q75 - q25
    lower, upper = q25 - IQR*1.5, q75 + IQR*1.5
    outliers_index = (df[column] < lower) | (df[column] > upper)

    # The previous check used len() of the mask, which is the number of rows
    # and not the number of True values, so outlier-free columns were kept.
    if not outliers_index.any():
      continue

    outliers[column] = {
        "indexes" : outliers_index,
        "quantity" : outliers_index.sum(axis = 0)
    }
  return outliers

def outliers_printer(df, vars_list):
  """Draw one seaborn boxplot per variable in ``vars_list`` on a single
  figure laid out as a three-column grid of subplots.

  Inputs:
    df: DataFrame holding the columns to plot.
    vars_list: iterable of column names of ``df``, plotted in order.

  Output:
    None. The plots are drawn on a new matplotlib figure.
  """
  columns = list(vars_list)

  # Keep the original 3x3 layout for up to nine variables and add rows beyond
  # that: plt.subplot raises when the index exceeds n_rows * n_cols.
  n_cols = 3
  n_rows = max(3, -(-len(columns) // n_cols))

  # Height grows with the number of rows so each panel keeps its size;
  # for three rows this is the original (20, 10).
  plt.figure(figsize = (20, 10 * n_rows / 3))
  for i, col in enumerate(columns, start=1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(df[col])

def careful_standardization(df, not_standardize_list):
  """Standardize every column of ``df`` except those matched by
  ``not_standardize_list``.

  A column is left untouched when any entry of ``not_standardize_list`` is a
  substring of its name. This lets one entry cover a whole family of dummy
  columns: 'neigborhoodcode_' protects neigborhoodcode_1, neigborhoodcode_2...
  Typical entries are the target, binary variables and dummy prefixes.

  Inputs:
    df: DataFrame with string column names.
    not_standardize_list: list of column names or name fragments to exclude.
      An empty list means every column is standardized.

  Output:
    A new DataFrame with the same index as ``df``. The standardized columns
    come first, followed by the untouched columns with their original values.
    If no column is left to standardize, a copy of the untouched columns is
    returned.
  """
  standardize_vars_cols = []
  dummy_vars_and_y = []

  for column in df.columns:
    # any() over an empty exclusion list is False, so each column is always
    # classified (the old flag variable was never set in that case).
    if any(element in column for element in not_standardize_list):
      dummy_vars_and_y.append(column)
    else:
      standardize_vars_cols.append(column)

  # Nothing to scale: skip the scaler instead of fitting it on zero columns.
  if not standardize_vars_cols:
    return df[dummy_vars_and_y].copy()

  # scaler
  scaler = StandardScaler().fit(df[standardize_vars_cols])
  scaled_features = scaler.transform(df[standardize_vars_cols])
  df_scaled = pd.DataFrame(scaled_features, index = df.index, columns = standardize_vars_cols)
  if dummy_vars_and_y:
    df_scaled[dummy_vars_and_y] = df[dummy_vars_and_y]

  return df_scaled


def CV_model(X , y, kf, model, metric, the_greater_the_better = True):
  """Cross-validate ``model`` and return the best fold together with the
  mean score over all folds.

  Inputs:
    X: 2-D feature set (anything ``np.asarray`` turns into a 2-D array).
    y: target values, indexable by position after ``np.asarray``.
    kf: splitter exposing ``split(X)`` that yields (train_index, test_index).
    model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
      place on every fold.
    metric: callable ``metric(y_true, y_pred)`` returning a number.
    the_greater_the_better: True when a larger metric is better (e.g. r2),
      False when a smaller one is better (e.g. mse).

  Output:
    ``(final_model, final_X_test, final_y_test, final_y_pred, mean_metric)``
    where the first four items belong to the fold with the best metric and
    ``mean_metric`` is the average of the metric over all folds.
    ``final_model`` is a snapshot of the estimator as fitted on that fold.

  Raises:
    ValueError: if no fold could be selected (the splitter yielded nothing or
      every metric value was NaN).
  """
  # Local import keeps this definition self-contained.
  import copy

  # Plain arrays instead of np.matrix: np.matrix is deprecated and recent
  # scikit-learn estimators refuse it.
  X_values = np.asarray(X)
  y_values = np.asarray(y)

  metrics_list = []
  # Infinite sentinels so the first comparable score always wins, whatever
  # the scale of the metric (negative r2, mse above 1000, ...).
  best_metric = -np.inf if the_greater_the_better else np.inf
  best_fold = None

  for train_index, test_index in kf.split(X):

    X_train, X_test = X_values[train_index], X_values[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    metric_val = metric(y_test, y_pred)
    metrics_list.append(metric_val)

    if the_greater_the_better:
      improved = metric_val > best_metric
    else:
      improved = metric_val < best_metric

    if improved:
      # Snapshot the estimator: the same object is refit on the next fold,
      # so keeping a bare reference would always return the last fit.
      best_fold = (copy.deepcopy(model), X_test, y_test, y_pred)
      best_metric = metric_val

  if best_fold is None:
    raise ValueError("CV_model could not select a best fold: no splits were produced or every metric value was NaN")

  final_model, final_X_test, final_y_test, final_y_pred = best_fold
  return final_model, final_X_test, final_y_test, final_y_pred, np.mean(metrics_list)

In [12]:
# Count the distinct values in each column (pandas leaves NaN out of the count by default).
df.nunique()

InvoiceNo      25900
StockCode       4070
Description     4211
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

Subset of the data where CustomerID is NaN
