In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

# Load the Amazon UK product listings and preview the first rows.
# The path is relative to the notebook's working directory.
csv_path = "amz_uk_price_prediction_dataset.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

In [None]:
# Part 1: Understanding Product Categories
# Business Question: What are the most popular product categories on Amazon UK, and how do they compare in terms of listing frequency?

# 1. Frequency Tables:

# Generate a frequency table for the product category.
# value_counts() sorts by count in descending order, so head(n) yields the n most listed categories.
category_counts = df['category'].value_counts()

# Keep only the 15 largest categories; the visualization cell below reuses `category_frq`.
category_frq = category_counts.head(15)
display(category_frq)

# Which are the top 5 most listed product categories?
# The names are derived from the counts instead of being typed by hand, so the
# answer always lists exactly five categories and stays in sync with the data.
top5_names = [str(name) for name in category_counts.head(5).index]
if len(top5_names) > 1:
    top5_text = ", ".join(top5_names[:-1]) + " and " + top5_names[-1]
else:
    # Degenerate case: zero or one category in the data.
    top5_text = "".join(top5_names)
print(f"The top {len(top5_names)} most listed categories are {top5_text}.")

In [None]:
# 2. Visualizations:

# Bar chart: distribution of products across the top categories held in `category_frq`.
# Each bar is coloured by its own category (hue == x) and the legend is moved to the upper right.
top_categories = category_frq.index
listing_counts = category_frq.values
ax = sns.barplot(
    x=top_categories,
    y=listing_counts,
    hue=top_categories,
    palette="Set3",
    legend=True,
)
sns.move_legend(ax, "upper right", fontsize='small')

# Axis labels first, then tilt the category names so they do not overlap.
ax.set_xlabel("category")
ax.set_ylabel("number of articles")
plt.xticks(rotation=45, ha='right')

plt.show()

# Pie chart: proportions within the same subset of top categories.
# Does any category dominate the listings?
pie_options = dict(
    autopct='%.1f%%',
    startangle=45,
    colors=sns.color_palette("Set3"),
    pctdistance=0.85,
)
category_frq.plot.pie(**pie_options)
plt.ylabel('')  # drop the default y-label that pandas adds to pie charts
plt.show()
print("The category 'Sport & Outdoors' dominates the listing by far, making up for 76% of products.")

In [None]:
# Part 2: Delving into Product Pricing
# Business Question: How are products priced on Amazon UK, and are there specific price points or ranges that are more common?

# 1. Measures of Centrality:

# Mean, median and mode of the product price, shown in that order.
prices = df['price']
display(prices.mean(), prices.median(), prices.mode())

# What's the average price point of products listed? How does this compare with the most common price point (mode)?
print("The average price of all products listed is 89,24 GBP, while the median is only 19,09 GBP. This indicates that there are very high priced items listed, that pull the average price up.")


In [None]:
# 2. Measures of Dispersion:

# Variance, standard deviation, range and interquartile range of the product price.
prices = df['price']
price_range = prices.max() - prices.min()
upper_quartile = prices.quantile(0.75)
lower_quartile = prices.quantile(0.25)

display(prices.var())
display(prices.std())
display(price_range)
display(upper_quartile - lower_quartile)

# How varied are the product prices? Are there any indicators of a significant spread in prices?
print("I would say that the spread between prices is high. The std shows that the prices deviate around the mean by 346 GBP on average. I would consider that a big spread. Also the range of 100000 GBP indicates a veray big gap between lowest and highest price point. What is interesting, is that the interquartile range is only 36. I assume that is beacuse the extreme values are above the 75% quartile.")


In [None]:
# 3. Visualizations:

# Histogram of product prices: is there a price range where most products fall?
# If the diagram is hard to read, think about why that is and how it could be solved.
prices = df['price']
sns.histplot(x=prices, bins=100, color="salmon")
plt.show()
print("The problem is, that the vast majority of prices fall into one bin but since the range is so bit you don't really see any other bins. The smaller you make the bins the more spread you can see but that will make the bars thinner and thinner. The solution would be to only look at specific range of prices.")

# Box plot of product prices: shows the spread and potential outliers.
sns.boxplot(data=df, x='price', color="lightblue")
plt.show()
print("There seems to be a long tail of higher prices but two significant outliers around 80000 GBP and 100000 GBP.")

In [None]:
# Part 3: Unpacking Product Ratings
# Business Question: How do customers rate products on Amazon UK, and are there any patterns or tendencies in the ratings?

# 1. Measures of Centrality:

# Mean, median and mode of the rating over all products.
all_stars = df['stars']
display(all_stars.mean(), all_stars.median(), all_stars.mode())

# Frequency of each rating value.
display(all_stars.value_counts())

# Keep only rows with a rating above 0; later cells reuse `filtered_df`.
filtered_df = df[all_stars > 0.0]
rated_stars = filtered_df['stars']
for statistic in (rated_stars.mean(), rated_stars.median(), rated_stars.mode()):
    display(statistic)

# How do customers generally rate products? Is there a common trend?
print("The median and mode are 0, which may mean that customers didn't rate the articles at all as it's not possible to give 0 stars. Assuming that 0 stars mean no ratings, filtering out the 0 values will give a more accurate picture. Without 0 star ratings, the average rating is 4.3 and the median is 4.4, which indicates that the articles are generally rated very well above 4 stars once the customers rate an article")


In [None]:
# 2. Measures of Dispersion:

# Variance, standard deviation and interquartile range of the product rating,
# computed on `filtered_df` (rows with a rating above 0).
stars_variance = filtered_df["stars"].var()
stars_std = filtered_df["stars"].std()
stars_iqr = filtered_df.stars.quantile(0.75) - filtered_df.stars.quantile(0.25)

display(stars_variance)
display(stars_std)
display(stars_iqr)

# Are the ratings consistent, or is there a wide variation in customer feedback?
# The variance is the square of the std, so whenever the std lies between 0 and 1
# the variance is the smaller of the two. That is expected, not a sign of an error.
print("The variance is smaller than the std, which is expected here: the variance is the std squared, and squaring a value between 0 and 1 makes it smaller. The numbers together with the interquartile range indicate that the ratings are fairly consistent and that most customers rate between 4 and 5 stars.")


In [None]:
# 3. Shape of the Distribution:

# Calculate the skewness and kurtosis for the rating column
# (on `filtered_df`, i.e. rows with a rating above 0).
skewness_rating = filtered_df['stars'].skew()
kurtosis_rating = filtered_df['stars'].kurtosis()

# A bare expression is only rendered when it is the last statement of a cell;
# the print below comes last, so the values must be displayed explicitly.
display(skewness_rating)
display(kurtosis_rating)

# Are the ratings normally distributed, or do they lean towards higher or lower values?
# pandas' kurtosis() uses Fisher's definition (excess kurtosis), so a normal
# distribution scores 0 and the value is compared against 0, not 3.
# NOTE(review): the interpretation below assumes the signs the author observed
# (negative skewness, positive kurtosis) - re-check if the data changes.
print(f"The negative value of skewness ({skewness_rating:.2f}) indicates that the distribution is left-skewed. This means that the tail on the left side (lower ratings) is longer than the right side. This suggests that there are a significant number of ratings that are low, acting as outliers and pulling the mean down. Kurtosis: pandas reports excess kurtosis, for which a normal distribution scores 0. The kurtosis value of {kurtosis_rating:.2f} is well above 0, which indicates that the rating distribution has heavier tails and a sharper peak compared to a normal distribution. This means that there are more outliers (extreme values) than one would expect in a normally distributed set.")

In [None]:
# 4. Visualizations:

# Histogram of the product ratings in `filtered_df` (ratings above 0):
# is there a specific rating that is more common?
rated_stars = filtered_df['stars']
sns.histplot(x=rated_stars, bins=50, color="salmon")
plt.show()
print("The most common rating besides no rating is 4.5.")