# 1.0 Importing Libraries
# Import Python Libraries

In [None]:
# Essential Configuration
%matplotlib inline
%config InlineBackend.figure_formats = {'retina', 'svg'}

# Core Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.gridspec as gridspec
from matplotlib import colors as mcolors
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# Statistical and Data Manipulation
from scipy.stats import linregress
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from collections import Counter

# Scikit-learn Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MeanShift, estimate_bandwidth
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Visualization Helpers for Clustering
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Display Settings
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)        # Adjust the display width to accommodate all columns

# Seaborn Default Settings
# The rcParams override must be passed through the `rc` keyword. The first
# positional parameter of sns.set() is `context`; when a dict is supplied
# there, seaborn uses it as the plotting context as-is and `font_scale` is
# not applied to the font sizes.
sns.set(
    style='ticks',
    color_codes=True,
    font_scale=0.8,
    rc={"figure.figsize": (8, 6)},
)


: 

# 2.0 Loading the Dataset

In [None]:
# Read the e-commerce dataset from CSV with pandas and preview its first rows
# (DataFrame.head() with no argument returns the first five rows).
csv_path = 'Dataset_ecommerce.csv'
df = pd.read_csv(csv_path)
df.head()

# Dataset Description:

1. InvoiceNo: Code representing each unique transaction.
2. StockCode: Code uniquely assigned to each distinct product.
3. Description: Description of each product.
4. Quantity: The number of units of a product in a transaction.
5. InvoiceDate: The date and time of the transaction.
6. UnitPrice: The unit price of the product in sterling.
7. CustomerID: Identifier uniquely assigned to each customer.
8. Country: The country of the customer.

# 2.0 Data Understanding

In [None]:
# Check the data shape: a (number of rows, number of columns) tuple
df.shape

In [None]:
# Check data information: prints a concise per-column summary of the frame
# (column names, non-null counts and dtypes)
df.info()

In [None]:
# Check duplicate values
# Count the fully duplicated rows once and reuse the result in the message,
# rather than scanning the whole frame twice.
duplicate_count = df.duplicated().sum()

# Displaying the number of duplicate rows
print(f"The dataset contains {duplicate_count} duplicate rows that need to be removed.")

In [None]:
# Count the missing (NaN/None) entries in each column of the dataset
df.isna().sum()

In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# displayed rounded to two decimal places
summary_stats = df.describe(include='all')
summary_stats.round(2)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Gather every non-null product description as a string and concatenate them,
# separated by single spaces, into the text the cloud is built from
descriptions = df['Description'].dropna().astype(str)
text = ' '.join(descriptions)

# Build an 800x400 word cloud on a white background using the viridis colormap
cloud_options = dict(width=800, height=400, background_color='white', colormap='viridis')
wordcloud = WordCloud(**cloud_options).generate(text)

# Draw the cloud on a 10x6 figure with the axes hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # Turn off axis
ax.set_title('Word Cloud for Product Descriptions', fontsize=16)
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 'Country' is a categorical column whose values can span several words
# (e.g. "Cote d'Ivoire"). Joining the values into one string and tokenizing it
# would break such names into separate words, so the cloud is built from the
# number of rows per country instead, which keeps every country name intact.
country_counts = df['Country'].dropna().astype(str).value_counts().to_dict()

# Generate the word cloud, sizing each country by its row count
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
).generate_from_frequencies(country_counts)

# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Turn off axis
plt.title('Word Cloud for Country', fontsize=16)
plt.show()


# Comments:

1. Quantity: The average quantity of products in a transaction is approximately 50.53. Quantities range from a minimum of 1 to a maximum of 100.
2. UnitPrice: The average unit price of the products is approximately 50.58. The unit price also spans a wide range, from 1 to 100.
3. CustomerID: There are 406829 non-null entries, indicating missing values in the dataset which need to be addressed. The Customer IDs range from 12346 to 18287, helping in identifying unique customers.
4. InvoiceNo: There are 25900 unique invoice numbers, indicating 25900 separate transactions. The most frequent invoice number is 573585, appearing 1114 times, possibly representing a large transaction or an order with multiple items.
5. StockCode: There are 9000 unique stock codes representing different products.
The most frequent stock code is SC2014, appearing 96 times in the dataset.
6. Description: There are 10 unique product descriptions. The most frequent product description is "Sports Equipment", appearing 54765 times.
7. Country: The transactions come from 28 different countries, with the largest share of the transactions (approximately 3.6%) originating from Cote d'Ivoire.

# Other Observations:
1.   The data has 541,909 rows and 8 features.
2.   Only "CustomerID" has missing values (135,080 rows). These rows will be removed from the data.
3.   No duplicated values were observed in the data.
4.   Unique values: the data contains 28 countries, 22,190 invoices, and 9,000 stock codes.
5.   The data types consist of 5 objects, 2 floats, and 1 integer.
6.   The most common terms in 'Description' and 'Country' are 'Sports Equipment' and 'Cote d'Ivoire', respectively.


# 2.0 Loading the Dataset