In [1]:
# imported libs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures

# local imports
import wrangle as w

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# Exploration Working Notebook

In [2]:
import wrangle as w

In [None]:
# Unpack the nine objects returned by w.wrangle_data(): three splits, three
# *_scaled frames (presumably the scaled versions of those splits — confirm in
# wrangle.py), plus new_df, df_customers and df.
train, val, test, train_scaled, val_scaled, test_scaled, new_df, df_customers, df = w.wrangle_data()
# (rows, columns) of each unscaled split.
train.shape, val.shape, test.shape

In [None]:
# cleaned dataframe: column dtypes and non-null counts
df.info()

In [None]:
# Number of missing values per column in df.
df.isnull().sum()

In [None]:
# First rows of df.
df.head()

In [None]:
# aggregated and grouped by dataframe
# Only the final expression of a cell is displayed, so the former bare `new_df`
# line did nothing; preview the first three rows instead of the whole frame.
new_df.head(3)

In [None]:
# Column dtypes and non-null counts of the training split.
train.info()

In [None]:
# First rows of the training split.
train.head()

In [None]:
# Summary statistics of the training split, transposed so each column is a row.
train.describe().T

In [None]:
# Side-by-side look at quantity, unit_price and total_price for the first rows.
train[['quantity', 'unit_price', 'total_price']].head()

In [None]:
# (rows, columns) of each *_scaled split.
train_scaled.shape, val_scaled.shape, test_scaled.shape

In [None]:
# First three rows of train_scaled.
train_scaled.head(3)

In [None]:
# Summary statistics of train_scaled, transposed so each column is a row.
train_scaled.describe().T

In [None]:
# NOTE(review): same call as the preceding cell; one of the two is likely redundant.
train_scaled.describe().T

In [None]:
# First rows of train_scaled.
train_scaled.head()  

## New features below based on features present:

In [None]:
# How often each distinct total_price value occurs in the training split.
train.total_price.value_counts()

In [None]:
# First values of total_price in the training split.
train.total_price.head()

## Can we see patterns, find signals in the data?

## What features are driving the outcome?

## Are there other features we can construct that have stronger relationships?

## Visualization and statistical testing

## I want to walk away from exploration with modeling strategies (feature selection, algorithm selection, evaluation methods, etc.).

## Let's take a look at the distributions of all features in the data

## What is the distribution of scaled numerical features?

In [None]:
# plt and sns come from the import cell at the top of the notebook; they are
# not re-imported here so the notebook's dependencies stay in one place.

# Columns of train_scaled stored as float64 or int64 (same selection as before).
numeric_cols = train_scaled.select_dtypes(include=['float64', 'int64']).columns

# One histogram per numeric column, each drawn on its own explicit figure/axes.
for col in numeric_cols:
    fig, ax = plt.subplots()
    sns.histplot(data=train_scaled, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    fig.tight_layout()
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

## Categorical data

## What is the distribution of categorical features?

In [None]:
# Boolean mask marking the object-dtype columns of train_scaled.
is_object_col = train_scaled.dtypes == 'object'

# Draw one count histogram per object-dtype column, each on a fresh figure.
for col in train_scaled.columns[is_object_col]:
    plt.figure()
    sns.histplot(x=col, data=train_scaled)
    plt.title(f'Distribution of {col}')
    plt.show()

# What is the correlation between all numerical features?

In [None]:
# Pairwise plots for the columns of train_scaled; corner=False keeps the full
# grid (both triangles) rather than only the lower one.
sns.pairplot(data = train_scaled, corner=False)

# What is the distribution of `country` counts within this data?

In [None]:
# Rows per country in train_scaled, as a two-column frame: 'country', 'count'.
# The index and the counts are named explicitly because the names produced by
# value_counts() differ between pandas versions; relying on the implicit names
# makes the 'count' lookup below fail with KeyError on pandas < 2.0.
country_cnts = (
    train_scaled['country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

# Keep only countries with more than 1000 rows so the chart stays readable.
country_cnts = country_cnts[country_cnts['count'] > 1000]

In [None]:
# Horizontal bar chart of the row count for each country in country_cnts.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=country_cnts, x='count', y='country', palette='bright', ax=ax)

# Drop the left and bottom spines for a cleaner look.
sns.despine(left=True, bottom=True)

# Write each bar's value just past its right-hand end, vertically centred.
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.annotate(
        f'{bar_length:.0f}',
        (bar_length, bar_middle),
        ha='left',
        va='center',
        fontsize=10,
        color='black',
    )

# The values are printed on the bars, so the x axis needs no labels or ticks.
ax.set_xticklabels([])
ax.set_xticks([])

# Hide the y tick marks but keep their labels.
ax.tick_params(axis='y', which='both', left=False)

ax.set_title('Country Distribution')

# Upper-case the country names shown on the y axis.
upper_labels = [tick.get_text().upper() for tick in ax.get_yticklabels()]
ax.set_yticklabels(upper_labels)

ax.set_ylabel('Country', rotation=0, labelpad=30)
ax.set_xlabel('')
fig.tight_layout()
plt.show()

* The data consists largely of records relating to the United Kingdom.

## How does `quantity` vary with `customer_id`?

In [None]:
# Scatter of quantity against customer_id on an explicitly sized axes.
# sns.relplot is figure-level and always builds its own figure, so pairing it
# with plt.figure(figsize=...) left an empty extra figure and ignored the
# requested size; the axes-level scatterplot draws on the axes created here.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='quantity', y='customer_id', data=train_scaled, ax=ax)
ax.set_title('Quantity vs. Customers')
ax.set_xlabel('Quantity')
ax.set_ylabel('Customer ID')
plt.show()

## How does `unit_price` vary with `customer_id`?

In [None]:
# Scatter of unit_price against customer_id on an explicitly sized axes.
# sns.relplot is figure-level and always builds its own figure, so pairing it
# with plt.figure(figsize=...) left an empty extra figure and ignored the
# requested size; the axes-level scatterplot draws on the axes created here.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='unit_price', y='customer_id', data=train_scaled, ax=ax)
ax.set_title('Unit Price vs. Customers')
ax.set_xlabel('Unit Price')
ax.set_ylabel('Customer ID')
plt.show()

## Scatter plot of `total_price` and `quantity`

In [None]:
# total_price (x) against quantity (y) for the unscaled training split,
# drawn at the default figure size.
ax = sns.scatterplot(data=train, x='total_price', y='quantity')
ax.set(
    title='Is there a relationship between Total Price and Quantity?',
    xlabel='Total Price',
    ylabel='Quantity',
)
plt.show()

In [None]:
# total_price (x) against quantity (y) for train_scaled,
# drawn at the default figure size.
ax = sns.scatterplot(data=train_scaled, x='total_price', y='quantity')
ax.set(
    title='Is there a relationship between Total Price and Quantity?',
    xlabel='Total Price',
    ylabel='Quantity',
)
plt.show()

# -----------------------------------------------------------------

# How would scaling and normalizing features help in visualizing the distributions that exist within this dataset?

Splitting the data and then scaling the train subset gave a clearer view of how the data is distributed.

# Code added to prep function to handle negative values:

In [None]:
# Reference only (not executed here): each line replaces values below zero
# with 0 and leaves all other values unchanged.
# df['quantity'] = df['quantity'].apply(lambda x: max(x, 0))
    
# df['unit_price'] = df['unit_price'].apply(lambda x: max(x, 0))

# -----------------------------------------------------------------

## I'm going to bin quantities:

In [None]:
# Bucket quantity into ranges for the bar plot that follows.
# Two changes keep rows from silently turning into NaN:
#   * include_lowest=True puts quantity == 0 in the first bin (pd.cut bins are
#     left-open by default, so 0 used to fall outside every bin);
#   * an open-ended last edge replaces the hard-coded 10000 cap, so larger
#     quantities land in the top bin.
quantity_edges = [0, 10, 20, 30, 40, 50, float('inf')]
train['quantity_bin'] = pd.cut(train['quantity'], bins=quantity_edges, include_lowest=True)

train.head()

## How does total_price vary with quantity?

In [None]:
# Bar chart of total_price for each quantity bin of the training split.
ax = sns.barplot(data=train, x='quantity_bin', y='total_price')
ax.set(
    title='Is there a relationship between Total Price and Quantity?',
    xlabel='Quantity',
    ylabel='Total Price',
)
plt.show()

## At this point I need to select a few features to look at further. I am going to look at my new_df dataframe, which consists of invoice_date, invoice_no, and total_price.

In [None]:
# First rows of new_df.
new_df.head()

In [None]:
# Split new_df into three parts with w.train_val_test.
# NOTE(review): this rebinds train/val/test, so the splits unpacked from
# w.wrangle_data() earlier are no longer reachable under these names and the
# cells above will not behave the same if re-run after this one.
train, val, test = w.train_val_test(new_df)

In [None]:
# Column names of the new training split.
train.columns

## Narrowing feature on what I want to explore.

In [None]:
# Columns to explore, grouped by how they are treated in the plots below.
numericals = ['invoice_date', 'total_price']
categoricals = ['invoice_no']

# Numerical columns first, then the categorical ones.
explore_cols = [*numericals, *categoricals]

In [None]:
# Show the combined list of columns selected for exploration.
explore_cols

## creating histograms on train[columns] numericals specifically

In [None]:
# One histogram per entry of `numericals`, laid out side by side in one row.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
for ax, col in zip(axs, numericals):
    ax.hist(train[col])
    ax.set(title=f'Distribution of {col}')
plt.show()

* Normality at first glance is pretty apparent here.
* invoice_date: right-tailed; the data looks roughly normal between 0 and 150, then uniform
* total_price: Heavily concentrated around 0

In [None]:
# Pairwise plots restricted to the columns listed in explore_cols.
sns.pairplot(data=train[explore_cols])
plt.show()

Initial observations based on the pair plot:

* total_price and invoice_no seem to have some relationship, but otherwise the data is concentrated near zero

  # We need to remove existing outliers that are in the pairplot.

  * This means I have to reevaluate the function application prepping.

In [None]:
# NOTE(review): normalize_df is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel unless the name is created
# elsewhere — confirm which frame it should be and add the cell that builds it.
# Like the earlier re-split, this rebinds train/val/test.
train, val, test = w.train_val_test(normalize_df)

In [None]:
# Column names of the training split produced by the cell above.
train.columns

In [None]:
# Fit a min-max scaler on the training split, then apply it to that same split.
# `scaler` stays bound to the fitted instance.
scaler = MinMaxScaler().fit(train)
normalize_train = scaler.transform(train)

In [None]:
# Display the scaled training values returned by scaler.fit_transform.
normalize_train