In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import pearsonr

%matplotlib inline

In [None]:
# Load the price-prediction dataset CSV into the frame that the later cells analyse.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
DATASET_CSV = "/Users/merryxmm7/Documents/GitHub/lab-eda-univariate/amz_uk_price_prediction_dataset.csv"
df_as = pd.read_csv(DATASET_CSV)

In [None]:
# Show the loaded frame as the cell output for a first look at its contents.
df_as

In [None]:
# 1. Crosstab analysis between category and isBestSeller.
# Each row is a category; each column counts the products with that isBestSeller value.
crosstab_result = pd.crosstab(index=df_as['category'], columns=df_as['isBestSeller'])

# Order the rows from the category with the most products overall to the one with the fewest.
category_totals = crosstab_result.sum(axis=1)
category_order = category_totals.sort_values(ascending=False).index
crosstab_result_sorted = crosstab_result.loc[category_order]


In [None]:
# Stacked bar chart of best-seller share within the five largest categories.
# The plotted frame is derived here from crosstab_result_sorted so the cell runs
# top-to-bottom on a fresh kernel (the name plotted before was not defined yet
# at this point in the notebook).
top_5_by_count = crosstab_result_sorted.head(5)

# Normalise each row to sum to 1 so the bars agree with the 'Proportion' axis label.
top_5_proportions = top_5_by_count.div(top_5_by_count.sum(axis=1), axis=0)

ax = top_5_proportions.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.legend(title='Best-Seller Status')
plt.title('Top 5 Categories and Best-Seller Status')
plt.xlabel('Categories')
plt.ylabel('Proportion')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Convert the counts to within-category proportions so that categories of
# different sizes can be compared on how prevalent best-seller status is.
row_sums = crosstab_result.sum(axis="columns")  # number of products per category

# Each cell divided by its row total, so every row sums to 1.
proportions = crosstab_result.div(row_sums, axis="index")

# Largest best-seller share first (sorted on the column labelled True).
sorted_proportions = proportions.sort_values(by=True, ascending=False)
sorted_proportions
# Author's reading of the output: Grocery has the largest share of best sellers relative to its number of products.

In [None]:
# 2. Statistical tests
# Chi-square test of independence on the category x isBestSeller contingency table.
from scipy.stats import chi2_contingency

# The call returns four values; only the statistic and the p-value are used below.
chi2_statistic, chi2_p_value, _dof, _expected = chi2_contingency(crosstab_result)

(chi2_statistic, chi2_p_value)

In [None]:
# Since the p-value is effectively 0 (below any usual significance level), the association between category and best-seller status is statistically significant.
# Therefore we reject the null hypothesis and conclude that category and best-seller status are not independent.

In [None]:
# 2.1 Cramér's V
from scipy.stats.contingency import association

# Degree of association for the category x isBestSeller table, using the "cramer" method.
cramers_v = association(crosstab_result, method="cramer")
cramers_v
# Author's reading: the association is weak, though still statistically significant.

In [None]:
# 3. Visualisations
# Stacked counts per isBestSeller value for the five largest categories
# (crosstab_result_sorted is ordered by row total, so head(5) is the top 5).
top_5 = crosstab_result_sorted.head(5)

ax = top_5.plot(kind="bar", stacked=True)
# Title and axis labels so the figure can be read on its own.
ax.set_title("Best-Seller Counts for the 5 Largest Categories")
ax.set_xlabel("Category")
ax.set_ylabel("Number of products")
ax.legend(title="isBestSeller")
plt.show()

In [None]:
# Part 2
# 2.0 Preliminary Step: Remove outliers in product prices
def tukeys_test_outliers(data, k=1.5):
    """Return the values of ``data`` that Tukey's IQR rule flags as outliers.

    A value is flagged when it lies strictly below ``Q1 - k * IQR`` or strictly
    above ``Q3 + k * IQR``, where Q1 and Q3 are the 0.25 and 0.75 quantiles of
    ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to screen. Missing values are never flagged because
        comparisons against NaN evaluate to False.
    k : float, default 1.5
        Fence multiplier applied to the IQR.

    Returns
    -------
    pandas.Series
        The subset of ``data`` outside the fences, with its original index.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    # Fences beyond which a value counts as an outlier
    lower_bound = Q1 - k * IQR
    upper_bound = Q3 + k * IQR

    # Outliers lie OUTSIDE the fences. The earlier test
    # (data > lower_bound) | (data < upper_bound) was true for every
    # non-missing value, so nothing was actually filtered.
    is_outlier = (data < lower_bound) | (data > upper_bound)

    return data[is_outlier]

In [None]:
# Apply the Tukey helper to the price column and display what it returns.
df_prices = df_as.loc[:, "price"]
outliers = tukeys_test_outliers(df_prices)
outliers

In [None]:
# Violin plot of price for the 5 categories with the highest median price.
# The selection is computed once and reused: previously `top_5_categories`
# held 20 categories and went unused while the plot re-derived its own top 5.
top_5_categories = df_as.groupby('category')['price'].median().nlargest(5).index
df_top_5_categories = df_as[df_as['category'].isin(top_5_categories)]

sns.violinplot(data=df_top_5_categories, x='category', y='price', palette="coolwarm")
plt.title('Price Distribution for the 5 Categories with the Highest Median Price')
plt.xlabel('Category')
plt.ylabel('Price')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Violin plot of price for the 20 categories with the most products.
category_counts = df_as['category'].value_counts()
top_20_categories = category_counts.nlargest(20).index

# Keep only the rows whose category is one of those 20.
in_top_20 = df_as['category'].isin(top_20_categories)
df_top_20_categories = df_as[in_top_20]

sns.violinplot(x='category', y='price', data=df_top_20_categories, palette="coolwarm")
plt.title('Violin Plot of Price Distribution for Top 20 Categories')
plt.xlabel('Category')
plt.ylabel('Price')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Number of products per category.
category_counts = df_as['category'].value_counts()

# Select the top 20 categories based on count.
# (The literal here used to be 10, contradicting this comment and the variable names.)
top_20_categories = category_counts.nlargest(20).index

# Filter the DataFrame to include only products from the top 20 categories
df_top_20_categories = df_as[df_as['category'].isin(top_20_categories)]


In [None]:
# Bar chart of price by category for the filtered frame; labels are rotated
# vertically so the category names on the x axis do not overlap.
sns.barplot(x='category', y='price', data=df_top_20_categories, palette="coolwarm")
plt.xticks(rotation=90)

In [None]:
# Average price for the 10 categories with the most products, highest average first.
category_counts = df_as['category'].value_counts()
top_10_categories = category_counts.nlargest(10).index

# Keep only the rows that belong to those 10 categories.
is_top_10 = df_as['category'].isin(top_10_categories)
df_top_10_categories = df_as[is_top_10]

# Mean price per category, sorted descending; reset_index() turns the result
# back into a frame with 'category' and 'price' columns for plotting.
average_prices = (
    df_top_10_categories
    .groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

sns.barplot(x='category', y='price', data=average_prices, palette="coolwarm")
plt.xticks(rotation=90)


In [None]:
# Mean price per category across the whole dataset, largest mean first.
average_prices_all_categories = (
    df_as
    .groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)

# Index label (the category) at which the mean price is largest.
highest_average_price_category = average_prices_all_categories.idxmax()
highest_average_price_category

In [None]:
# Box plot of product ratings for the 10 categories with the most products.
category_counts = df_as['category'].value_counts()

# Select the top 10 categories based on count
top_10_categories = category_counts.nlargest(10).index

# Filter the DataFrame to include only products from the top 10 categories
df_top_10_categories = df_as[df_as['category'].isin(top_10_categories)]

sns.boxplot(data=df_top_10_categories, x='category', y='stars')
plt.title('Distribution of Product Ratings for Top 10 Categories')
plt.xlabel('Category')
plt.ylabel('Rating')
plt.xticks(rotation=45)
plt.show()

# Ratings are read from the 'stars' column, the same one plotted above and used
# by the correlation cells; the earlier lookup used a 'rating' column that no
# other cell in this notebook references.
highest_median_rating_category = df_as.groupby('category')['stars'].median().idxmax()
print("The product category with the highest median rating is:", highest_median_rating_category)


In [None]:
# 3. Correlation between price and rating.
# Pearson is the default method of Series.corr; it is spelled out here for clarity.
correlation_coefficient = df_as['price'].corr(df_as['stars'], method='pearson')
correlation_coefficient
# Author's reading: the coefficient is negative, i.e. higher prices tend to go with lower ratings.

In [None]:
# Pearson correlation between price and rating, together with its p-value.
correlation_coefficient, p_value = pearsonr(df_as['price'], df_as['stars'])

print("Correlation Coefficient:", correlation_coefficient)
print("P-value:", p_value)

# Report significance at the 5% level.
verdict = (
    "There is a significant correlation between product price and its rating."
    if p_value < 0.05
    else "There is no significant correlation between product price and its rating."
)
print(verdict)


In [None]:
# Scatter plot with rating on the x axis and price on the y axis, one point per row.
sns.scatterplot(x='stars', y='price', data=df_as)
plt.title('Relationship between Product Rating and Price')
plt.xlabel('Rating')
plt.ylabel('Price')
plt.show()

In [None]:

# Correlation heatmap
# Restrict to numeric/boolean columns before correlating: in pandas >= 2.0
# DataFrame.corr() raises if the frame contains columns that cannot be
# converted to float (presumably the text columns such as 'category').
numeric_columns = df_as.select_dtypes(include=["number", "bool"])

sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
plt.title('Correlation Heatmap')
plt.show()

# QQ plot to examine normality of product prices



In [None]:
# QQ plot of the price column against a normal distribution.
# `stats` is scipy.stats, imported in the notebook's import cell; without that
# import this cell fails with NameError.
stats.probplot(df_as['price'], dist="norm", plot=plt)
plt.title('QQ Plot for Product Prices')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')
plt.show()