# Preparing and exploring data to apply a supervised learning algorithm to highlight customer segmentation.

In [1]:
# Imported Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures

# custom imports
import summarize

In [2]:
# Project-local data-acquisition module (its source is not part of this notebook).
import acquire as a

# Load the dataset into `df`; the raw columns and dtypes are inspected in the cells below.
df = a.acquire_data()

## Looking over raw data information
   * The datatypes need to be changed
   * The names should be pythonic and in lowercase

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


## Quick view of the stats description for data.

In [4]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


## Building prep function

In [5]:
# Project-local preparation module (its source is not part of this notebook).
import prepare as p

# Rebind `df` to the prepared frame. Per the outputs below, the columns come back
# in snake_case and invoice_date becomes the DatetimeIndex.
# NOTE(review): this overwrites the raw frame, so re-running this cell presumably
# requires re-running the acquire cell first — confirm against prepare.prepare_data.
df = p.prepare_data(df)

In [6]:
df.head(3)

Unnamed: 0_level_0,invoice_no,stock_code,description,quantity,unit_price,customer_id,country
invoice_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-12-01 08:26:00,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,17850.0,United Kingdom
2010-12-01 08:26:00,536365,71053,WHITE METAL LANTERN,6,3.39,17850.0,United Kingdom
2010-12-01 08:26:00,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2.75,17850.0,United Kingdom


## Names are now pythonic, invoice_date is now set as index

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 541909 entries, 2010-12-01 08:26:00 to 2011-12-09 12:50:00
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   invoice_no   541909 non-null  object 
 1   stock_code   541909 non-null  object 
 2   description  540455 non-null  object 
 3   quantity     541909 non-null  int64  
 4   unit_price   541909 non-null  float64
 5   customer_id  406829 non-null  float64
 6   country      541909 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [8]:
df.describe()

Unnamed: 0,quantity,unit_price,customer_id
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


## Going to reevaluate the missing values in the dataset  

In [9]:
def missing_values(df):
    """Summarize the missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'Missing Values' (number of null entries) and
        'Percentage Missing (%)' (that number divided by ``len(df)``, times 100).
    """
    # per-column tally of null cells
    null_tally = df.isna().sum()

    # same tally expressed as a share of all rows, on a 0-100 scale
    null_share = null_tally.div(len(df)).mul(100)

    return pd.DataFrame({
        'Missing Values': null_tally,
        'Percentage Missing (%)': null_share,
    })

In [10]:
p.missing_values(df)

Unnamed: 0,Missing Values,Percentage Missing (%)
invoice_no,0,0.0
stock_code,0,0.0
description,1454,0.268311
quantity,0,0.0
unit_price,0,0.0
customer_id,135080,24.926694
country,0,0.0


* Again, customer_id and description have missing values:
    * 0.27% of description values are missing.
    * 24.93% of customer_id values are missing.

## To handle this, I will edit the feature values to flag the missing/unknown information

* Decided to drop the rows with a missing description, using the proportion thresholds passed to `p.handle_missing_values` (not IQR, which applies to outliers rather than missing values)

In [11]:
# NOTE(review): the two proportion thresholds are interpreted inside
# prepare.handle_missing_values (not shown here). Judging by the summary and
# df.info() output that follow, the 1,454 rows without a description are dropped
# while rows lacking only customer_id are kept — confirm against prepare.py.
df = p.handle_missing_values(df, prop_required_column=.25, prop_required_row=0.95)

In [12]:
p.missing_values(df)

Unnamed: 0,Missing Values,Percentage Missing (%)
invoice_no,0,0.0
stock_code,0,0.0
description,0,0.0
quantity,0,0.0
unit_price,0,0.0
customer_id,133626,24.724723
country,0,0.0


* Alternatively, the commented-out code below could have been used to fill the NaNs in customer_id with a 'Missing' label

In [13]:
# # DataFrame with NaNs filled in the 'customer_id' column
# df = df.copy()
# df['customer_id'].fillna('Missing', inplace=True)

## I am going to fill NaNs for unknown customers with 0.

In [14]:
df['customer_id'].isna().sum()

133626

In [15]:
# Fill unknown customers with the sentinel id 0 and assign the result back.
# Calling fillna(inplace=True) on the `df['customer_id']` selection is chained
# assignment: pandas 2.2 warns about it, and under Copy-on-Write (the default in
# pandas 3) it would leave `df` unchanged.
df['customer_id'] = df['customer_id'].fillna(0)

In [16]:
## Now that I have filled the missing customer IDs, re-check the missing-value summary

In [17]:
p.missing_values(df)

Unnamed: 0,Missing Values,Percentage Missing (%)
invoice_no,0,0.0
stock_code,0,0.0
description,0,0.0
quantity,0,0.0
unit_price,0,0.0
customer_id,0,0.0
country,0,0.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 540455 entries, 2010-12-01 08:26:00 to 2011-12-09 12:50:00
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   invoice_no   540455 non-null  object 
 1   stock_code   540455 non-null  object 
 2   description  540455 non-null  object 
 3   quantity     540455 non-null  int64  
 4   unit_price   540455 non-null  float64
 5   customer_id  540455 non-null  float64
 6   country      540455 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 33.0+ MB


In [19]:
df['customer_id'].value_counts()

customer_id
0.0        133626
17841.0      7983
14911.0      5903
14096.0      5128
12748.0      4642
            ...  
13270.0         1
17763.0         1
17291.0         1
15668.0         1
15562.0         1
Name: count, Length: 4373, dtype: int64

## Testing my prepare function on freshly acquired data, using the steps worked out above.

In [2]:
import acquire as a
import prepare as p

# Acquire the data and run it straight through the prepare step in one expression.
df = p.prepare_data(a.acquire_data())

# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0_level_0,invoice_no,stock_code,description,quantity,unit_price,customer_id,country
invoice_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-12-01 08:26:00,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,17850.0,United Kingdom
2010-12-01 08:26:00,536365,71053,WHITE METAL LANTERN,6,3.39,17850.0,United Kingdom
2010-12-01 08:26:00,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2.75,17850.0,United Kingdom
2010-12-01 08:26:00,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.39,17850.0,United Kingdom
2010-12-01 08:26:00,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,3.39,17850.0,United Kingdom


## Function is working appropriately

In [3]:
# yay no missing values! wooo!
p.missing_values(df)

Unnamed: 0,Missing Values,Percentage Missing (%)
invoice_no,0,0.0
stock_code,0,0.0
description,0,0.0
quantity,0,0.0
unit_price,0,0.0
customer_id,0,0.0
country,0,0.0


## Testing wrangle function

In [4]:
# Project-local wrangle module (its source is not part of this notebook).
import wrangle as w

# wrangle_data() is called with no arguments and its result replaces `df`.
# NOTE(review): presumably it wraps the acquire + prepare steps — confirm in wrangle.py.
df = w.wrangle_data()

* Wrangle function operational

In [5]:
p.missing_values(df)

Unnamed: 0,Missing Values,Percentage Missing (%)
invoice_no,0,0.0
stock_code,0,0.0
description,0,0.0
quantity,0,0.0
unit_price,0,0.0
customer_id,0,0.0
country,0,0.0


## Now let's visualize some data