In [1]:
# imported libs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering

# local imports
import wrangle as w
import summarize as s

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# Exploration Working Notebook

# Generate dataframes and split data for exploration

In [None]:
# Unpack the nine objects returned by the project-local w.wrangle_data() helper.
# NOTE(review): presumably three raw splits, their scaled counterparts, and three
# supporting frames (aggregated, per-customer, cleaned) — confirm against wrangle.py.
train, val, test, train_scaled, val_scaled, test_scaled, new_df, df_customers, df = w.wrangle_data()

In [None]:
# (rows, columns) of each of the three splits, in train / val / test order
tuple(subset.shape for subset in (train, val, test))

# Cleaned & Prepped DataFrame

In [None]:
# Column names, dtypes, and non-null counts of df (the last object returned by w.wrangle_data())
df.info()

In [None]:
# Number of missing values in each column of df
df.isna().sum()

In [None]:
# Peek at the first row only
df.iloc[:1]

# Aggregated and grouped DataFrame


In [None]:
# Structure first (info() prints as a side effect), preview last: only the final
# expression of a cell is rendered, so a bare `new_df` placed above info()
# displays nothing. Showing head() also avoids dumping the whole frame.
new_df.info()

new_df.head()

In [None]:
# Run the project-local summarize helper on the training split
# (what it reports is defined in summarize.py, not in this notebook).
s.summarize(train)

In [None]:
# Column names, dtypes, and non-null counts of the training split
train.info()

In [None]:
# How many training rows fall in each invoice month
train['invoice_month_name'].value_counts()

In [None]:
# Keep the per-month row counts for plotting
month_counts = train['invoice_month_name'].value_counts()

In [None]:
month_order = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]

# Align the counts to calendar order rather than frequency order
sorted_month_counts = month_counts.reindex(index=month_order)

# One bar per month
plt.bar(sorted_month_counts.index, sorted_month_counts.to_numpy())
plt.xticks(rotation=30)
plt.show()


In [None]:
# How many training rows fall on each day name
train['invoice_day_name'].value_counts()

In [None]:
# Raw month-name column of the training split
train['invoice_month_name']

In [None]:
# First five rows of the training split
train.head(n=5)

In [None]:
# Summary statistics, one row per column
train.describe().transpose()

In [None]:
# Preview only the quantity and price columns
price_cols = ['quantity', 'unit_price', 'total_price']
train[price_cols].head()

In [None]:
# (rows, columns) of each scaled split, in train / val / test order
tuple(subset.shape for subset in (train_scaled, val_scaled, test_scaled))

In [None]:
# First three rows of the scaled training split
train_scaled.iloc[:3]

In [None]:
# Column names, dtypes, and non-null counts of the scaled training split
train_scaled.info()

In [None]:
# Summary statistics of the scaled training split, one row per column
train_scaled.describe().transpose()

In [None]:
# NOTE(review): exact duplicate of the preceding cell — candidate for removal.
train_scaled.describe().T

In [None]:
# Column names of the scaled training split as a plain list
list(train_scaled.columns)

In [None]:
# First five rows of the scaled training split
train_scaled.iloc[:5]

## New features below based on features present:

In [None]:
# Frequency of each distinct total_price value
train['total_price'].value_counts()

In [None]:
# First five total_price values
train['total_price'].head()

## Can we see patterns, find signals in the data?

## What features are driving the outcome?

## Are there other features we can construct that have stronger relationships?

## Visualization and statistical testing

## I want to walk away from exploration with modeling strategies (feature selection, algorithm selection, evaluation methods, etc.).

## Let's take a look at the distributions of all features in the data

## What is the distribution of scaled numerical features?

In [None]:
col = list(train_scaled.columns)

# One histogram per numeric column; non-numeric columns are reported and skipped
for col_name in col:
    # dtype kinds b/i/u/f/c cover bool, signed/unsigned int, float, and complex
    if train_scaled[col_name].dtype.kind not in 'biufc':
        print(f"Cannot create histogram. The data type of {col_name} is not numeric.")
        continue

    plt.figure()
    plt.title(f'Distribution of {col_name}')
    sns.histplot(data=train_scaled, x=col_name)
    plt.show()

## Categorical data

## What is the distribution of categorical features?

In [None]:
# One count plot per non-numeric column; numeric columns are reported and skipped.
# Relies on `col` (the list of train_scaled column names) from the histogram cell.
for col_name in col:
    # dtype kinds b/i/u/f/c cover bool, signed/unsigned int, float, and complex
    if train_scaled[col_name].dtype.kind in 'biufc':
        print(f"Cannot create countplot. The data type of {col_name} is numeric.")
        continue

    plt.figure()
    plt.title(f'Count of categories in {col_name}')
    sns.countplot(data=train_scaled, x=col_name)
    plt.xticks(rotation=30)  # tilt the category labels so they do not overlap
    plt.show()

# What is the correlation between all numerical features?

In [None]:
# Grid of pairwise relationships between the columns of the scaled training split
# (corner=False keeps both triangles of the grid).
sns.pairplot(data = train_scaled, corner=False)

# what is the `country` count distributions within this data?

In [None]:
# Rows per country as a two-column frame. The index and the count column are
# named explicitly so the 'country' / 'count' labels used below exist on every
# pandas version: before pandas 2.0, value_counts().reset_index() produced the
# columns 'index' / 'country' instead.
country_cnts = (
    train_scaled['country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

# Keep only countries with more than 1,000 rows
country_cnts = country_cnts[country_cnts['count'] > 1000]

In [None]:
# Figure size for the horizontal bar chart
plt.figure(figsize=(15, 10))

# Horizontal bars: one per country, length = row count
ax = sns.barplot(data=country_cnts, x='count', y='country', palette='bright')

# Drop the left and bottom spines
sns.despine(left=True, bottom=True)

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.annotate(
        f'{bar_length:.0f}',
        (bar_length, bar_middle),
        ha='left',
        va='center',
        fontsize=10,
        color='black',
    )

# The counts are printed on the bars, so the x axis carries no labels or ticks
ax.set_xticklabels([])
ax.set_xticks([])

# Hide the y tick marks but keep their labels
ax.tick_params(axis='y', which='both', left=False)

ax.set_title('Country Distribution')

# Upper-case the country names on the y axis
upper_labels = [tick.get_text().upper() for tick in ax.get_yticklabels()]
ax.set_yticklabels(upper_labels)

ax.set_ylabel('Country', rotation=0, labelpad=30)
ax.set_xlabel('')
plt.tight_layout()
plt.show()

* Most of the data relates to the United Kingdom

In [None]:
# Column names of the scaled training split, as a reminder before plotting
train_scaled.columns.tolist()

## How does `quantity` vary with `customer_id` by country?

In [None]:
# relplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure in the output and
# does not size the plot. The size is set through height/aspect instead
# (height 6 with aspect 10/6 gives a 10x6 plotting area).
grid = sns.relplot(
    data=train_scaled,
    x='quantity',
    y='customer_id',
    hue='country',
    height=6,
    aspect=10 / 6,
)
grid.set(title='Quantity vs. Customers')
grid.set_axis_labels('Quantity', 'Customer ID')
plt.show()

## how about `unit_price` and `customer_id`?

In [None]:
# relplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure in the output and
# does not size the plot. The size is set through height/aspect instead
# (height 6 with aspect 10/6 gives a 10x6 plotting area).
grid = sns.relplot(
    data=train_scaled,
    x='unit_price',
    y='customer_id',
    height=6,
    aspect=10 / 6,
)
grid.set(title='Unit Price vs. Customers')
grid.set_axis_labels('Unit Price', 'Customer ID')
plt.show()

## Scatter plot of  `total_price` and `quantity`

In [None]:
# Scatter of total_price against quantity on the unscaled training split,
# drawn at matplotlib's default figure size
scatter_ax = sns.scatterplot(data=train, x='total_price', y='quantity')
scatter_ax.set_title('Is there a relationship between Total Price and Quantity?')
scatter_ax.set_xlabel('Total Price')
scatter_ax.set_ylabel('Quantity')
plt.show()

# -----------------------------------------------------------------

# How would scaling and normalizing features help in visualizing the distributions that exist within this dataset?

Splitting the data and then scaling the train subset gave a clearer view of the distributions in the data.

# Code added to prep function to handle negative values:

In [None]:
# df['quantity'] = df['quantity'].apply(lambda x: max(x, 0))
    
# df['unit_price'] = df['unit_price'].apply(lambda x: max(x, 0))

# -----------------------------------------------------------------

## I'm going to bin quantities:

In [None]:
# Bin edges: five bins of width 10 up to 50, then one wide bin up to 10,000
quantity_edges = [0, 10, 20, 30, 40, 50, 10000]
train['quantity_bin'] = pd.cut(train['quantity'], bins=quantity_edges)

train.head()

## How does quantity vary from total_price?

In [None]:
# Bar per quantity bin; bar height is seaborn's default estimate of total_price
bin_ax = sns.barplot(data=train, x='quantity_bin', y='total_price')
bin_ax.set_title('Is there a relationship between Total Price and Quantity?')
bin_ax.set_xlabel('Quantity')
bin_ax.set_ylabel('Total Price')
plt.show()

In [None]:
# Re-check the training split's columns and dtypes now that quantity_bin has been added
train.info()

In [None]:
# Column names of the training split as a plain list
list(train.columns)

In [None]:
# # Calculate Recency, Frequency, and MonetaryValue
# recency = train.groupby('customer_id')['invoice_date'].max()
# frequency = train.groupby('customer_id')['invoice_no'].count()
# monetary_value = train.groupby('customer_id')['total_price'].sum()

# # Add these features to the DataFrame
# train['Recency'] = train['customer_id'].map(recency)
# train['Frequency'] = train['customer_id'].map(frequency)
# train['MonetaryValue'] = train['customer_id'].map(monetary_value)

In [None]:
# Drop invoice_date from the training split. errors='ignore' makes the cell safe
# to re-run: without it, a second execution raises KeyError because the column
# is already gone.
train = train.drop(columns='invoice_date', errors='ignore')

In [None]:
# Bin 'quantity' and attach readable labels in a single pd.cut call
quantity_edges = [0, 10, 20, 30, 40, 50, 10000]
bin_labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51+']
train['quantity_bin'] = pd.cut(train.quantity, bins=quantity_edges, labels=bin_labels)

# Per-customer purchase features, broadcast back onto every row of that customer.
# They are built here because the only other place they appear is a commented-out
# cell, so selecting them from train raised a KeyError.
# NOTE(review): assumes train carries 'customer_id', 'invoice_no' and 'total_price'
# (the columns named in that commented-out cell) — confirm.
purchases = train.groupby('customer_id')
train['Frequency'] = purchases['invoice_no'].transform('count')
train['MonetaryValue'] = purchases['total_price'].transform('sum')

# Recency is left out: 'invoice_date' is dropped from train earlier in the notebook,
# and K-means needs a numeric value (e.g. days since last purchase), not a raw date.
segment_features = ['Frequency', 'MonetaryValue', 'quantity_bin']
X = train[segment_features]

# One-hot encode the binned quantity; the numeric columns pass through unchanged
X = pd.get_dummies(X)

# Apply K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)

# Assign a cluster label to every row
train['Cluster'] = kmeans.labels_

# Per-cluster means of the numeric columns only; without numeric_only=True,
# pandas >= 2.0 raises on the non-numeric columns in train.
cluster_analysis = train.groupby('Cluster').mean(numeric_only=True)

# Develop marketing strategies or operational improvements based on cluster analysis


## At this point I need to select a few features to look at further. I am going to examine my new_df dataframe, which consists of invoice_date, invoice_no, and total_price.

In [None]:
# First five rows of new_df (the seventh object returned by w.wrangle_data())
new_df.head()

In [None]:
# Re-split using new_df. This rebinds train / val / test, so from here on those
# names no longer refer to the splits returned by w.wrangle_data().
train, val, test = w.train_val_test(new_df)

In [None]:
# Columns available in the new training split
train.columns

## Narrowing down the features I want to explore.

In [None]:
# Columns to explore, grouped by how they will be treated
numericals = ['invoice_date', 'total_price', 'quantity']
categoricals = ['invoice_no']

# Numeric columns first, then the categorical ones
explore_cols = [*numericals, *categoricals]

In [None]:
# Show the combined list of columns selected for exploration
explore_cols

## creating histograms on train[columns] numericals specifically

In [None]:
# Plot only the numeric columns that are actually present in train
hist_cols = [name for name in numericals if name in train.columns]

# One panel per column. A fixed 1x2 grid made zip() silently skip the third
# entry of `numericals`. squeeze=False keeps `axs` an array even for one panel.
fig, axs = plt.subplots(1, len(hist_cols), figsize=(20, 6), squeeze=False)
axs = axs.ravel()

# The loop variable is not called `col`, so the `col` list of column names
# defined earlier in the notebook is not overwritten.
for col_name, ax in zip(hist_cols, axs):
    ax.hist(train[col_name])
    ax.set_title(f'Distribution of {col_name}')
plt.show()

* Normality at first glance is pretty apparent here.
* invoice_date: right_tailed, data is very normalized between 0 to 150, then uniform
* total_price: Heavily concentrated around 0

In [None]:
# Pairwise plots for the columns selected for exploration.
# NOTE(review): explore_cols includes 'quantity'; confirm that train, which was
# split from new_df, actually has that column — otherwise this selection raises KeyError.
sns.pairplot(data=train[explore_cols])
plt.show()

Initial observations based on the pair plot:

* total_price and invoice_no seem to have some relationship, but otherwise the data is concentrated near zero

  # We need to remove existing outliers that are in the pairplot.

  * This means I have to revisit how the prep function is applied.

In [None]:
# Re-split after revisiting the prep step. The frame is passed explicitly, as in
# the earlier call to this helper; the bare call relied on a default argument
# that nothing in this notebook shows to exist.
# NOTE(review): confirm new_df is the intended input once the prep function is updated.
train, val, test = w.train_val_test(new_df)

In [None]:
# Columns of the re-split training data
train.columns

In [None]:
# Learn each column's min/max from train, then rescale train with them
scaler = MinMaxScaler()
scaler.fit(train)
normalize_train = scaler.transform(train)

In [None]:
# Display the scaled values produced by scaler.fit_transform(train)
normalize_train