In [10]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats.contingency import association
from scipy.stats import chi2_contingency
import statsmodels.api as sm


%matplotlib inline

In [11]:
# Location of the dataset. The AMZ_UK_DATASET_PATH environment variable can
# override it so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used.
url = os.environ.get(
    'AMZ_UK_DATASET_PATH',
    '/Users/javi/Desktop/dsfvfvdv/amz_uk_price_prediction_dataset.csv',
)
# Load the CSV into the DataFrame used by the rest of the notebook
df = pd.read_csv(url)

# Crosstab Analysis


In [12]:
# Contingency table: one row per category, one column per isBestSeller value
crosstab_result = pd.crosstab(index=df['category'], columns=df['isBestSeller'])

# Order the categories by their count in the True column, largest first
sorted_crosstab_result = crosstab_result.sort_values(True, ascending=False)
print(sorted_crosstab_result)

# No category clearly stands out. Grocery, Health & Personal Care, and Power & Hand Tools have a best-seller share of roughly 4–6%, which is not enough on its own to be meaningful; further investigation is needed.

isBestSeller                  False  True 
category                                  
Grocery                        9008    556
Health & Personal Care         9017    552
Baby                          14709    341
Sports & Outdoors            835935    330
Power & Hand Tools             8353    306
...                             ...    ...
Downhill Ski Boots             1284      0
Digital Frames                  774      0
Motorbike Chassis               107      0
Motorbike Drive & Gears         250      0
eBook Readers & Accessories     246      0

[296 rows x 2 columns]


# Statistical Tests

# Chi-square test

In [13]:
# Chi-square test of independence on the category x isBestSeller table;
# only the first two values of the result (statistic and p-value) are kept
chi2_result = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value = chi2_result[0], chi2_result[1]
(chi2_statistic, chi2_p_value)


(36540.20270061387, 0.0)

In [37]:
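# Interpretation: the chi-square statistic is very large and the p-value is effectively 0, so we reject independence: there is a statistically significant association between df['category'] and df['isBestSeller']. The test alone does not tell us how strong that association is; an effect-size measure such as Cramér's V is needed for that.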


# Cramér's V

In [14]:
# Cramér's V for the category x isBestSeller contingency table
cramers_v = association(observed=crosstab_result, method="cramer")
cramers_v


0.1222829439760564

In [None]:
# Interpretation: a Cramér's V of 0.122 suggests a weak association between the two variables. Although the association is statistically significant, it is not strong in practical terms.

# Visualization stacked bar chart

In [None]:
# Stacked bar chart of the category x isBestSeller counts
crosstab_result.plot.bar(stacked=True)
plt.xticks(ha='right', rotation=45)
plt.show()

# Part 2: Exploring Product Prices and Ratings Across Categories and Brands

# Preliminary Step: Remove outliers in product prices.

In [None]:
def tukeys_test_outliers(data, k=1.5):
    """Return the values of ``data`` that fall outside Tukey's fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to test; must support ``quantile`` and boolean indexing.
    k : float, default 1.5
        Fence multiplier. The default of 1.5 reproduces the original behaviour.

    Returns
    -------
    pandas.Series
        The elements of ``data`` strictly below the lower fence or strictly
        above the upper fence, keeping their original index labels.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    # Fences beyond which a value counts as an outlier
    lower_bound = Q1 - k * IQR
    upper_bound = Q3 + k * IQR

    # Strict comparisons: values exactly on a fence are not outliers
    return data[(data < lower_bound) | (data > upper_bound)]

# Run Tukey's test on the 'price' column to collect its outlying values
data_series = df.loc[:, 'price']
outliers = tukeys_test_outliers(data=data_series)




# Violin Plots

In [None]:
# Keep only the rows whose price is not one of the Tukey outlier values
is_price_outlier = df['price'].isin(outliers)
filtered_df = df.loc[~is_price_outlier]

# The 20 categories with the most rows in the outlier-free data
category_counts = filtered_df['category'].value_counts()
top_20_categories = category_counts.index[:20]

in_top_20 = filtered_df['category'].isin(top_20_categories)
filtered_df_top_20 = filtered_df.loc[in_top_20]

# Violin plot of the 'price' distribution for each of those categories
sns.violinplot(x='category', y='price', data=filtered_df_top_20, palette="coolwarm")
plt.xticks(ha='right', rotation=45)
plt.show()

Which product category tends to have the highest median price? Don't filter here by top categories.

In [None]:
# The categories with the highest median price are Men, Fragrances and Sports & Outdoors, in that order.

# Bar charts

In [None]:
# Row count per category in the outlier-free data
category_counts = filtered_df['category'].value_counts()

# The ten categories with the most rows
top_10_categories = category_counts.index[:10]

# Restrict the outlier-free data to those ten categories
in_top_10 = filtered_df['category'].isin(top_10_categories)
filtered_df_top_10 = filtered_df.loc[in_top_10]

# Bar plot of the average 'price' per category, using the "coolwarm" palette
sns.barplot(x='category', y='price', data=filtered_df_top_10, palette="coolwarm")
plt.xticks(ha='right', rotation=45)

# Render the figure
plt.show()


In [None]:
# The highest average price is commanded by Sports & Outdoors.

# Box Plots

In [None]:
# Keep only the rows with a star rating above 0
has_rating = filtered_df_top_10['stars'] > 0
filtered_df_top_10 = filtered_df_top_10.loc[has_rating]

# Box plot of the spread of 'stars' within each 'category', using the "coolwarm" palette
sns.boxplot(x='category', y='stars', data=filtered_df_top_10, palette="coolwarm")
plt.xticks(ha='right', rotation=45)
plt.show()


In [None]:
# Handmade Clothing, Shoes & Accessories has the highest median rating.

In [33]:
# Rows of filtered_df with a star rating above 0 (used by later cells)
filtered_df_stars_gt_0 = filtered_df[filtered_df['stars'] > 0]
# Pearson correlation coefficient between 'price' and 'stars' on filtered_df,
# i.e. still including the rows whose star rating is 0
correlation = filtered_df['price'].corr(filtered_df['stars'])
# The stray `p` that followed this line was removed: it is undefined, so it
# raised NameError and prevented the correlation from being displayed
correlation


-0.07767299878181444

In [None]:
#The Pearson correlation coefficient of -0.07767299878181444 suggests a weak negative linear relationship between Amazon prices and the stars of the product. This indicates that there is a slight tendency for products with higher prices to have slightly lower star ratings on Amazon, and vice versa. However, the strength of this relationship is quite weak, implying that other factors may play a more significant role in determining product ratings. Further analysis is required

# Part 3: Investigating the Interplay Between Product Prices and Ratings

In [20]:
# Pearson correlation coefficient between 'price' and 'stars', rows with stars > 0 only
rated_prices = filtered_df_stars_gt_0['price']
rated_stars = filtered_df_stars_gt_0['stars']
correlation = rated_prices.corr(rated_stars, method='pearson')
correlation



0.006744508067157907

In [None]:
# If we filter out the 0-star ratings (which this data frame counts as a rating, even though on Amazon they do not count towards a product's actual star rating), we can see there is almost no correlation between stars and prices.

In [21]:
# Spearman rank correlation between 'price' and 'stars' on the full df
all_prices = df['price']
all_stars = df['stars']
correlation = all_prices.corr(all_stars, method='spearman')
correlation



-0.13316425462433876

In [None]:
# The Spearman correlation coefficient of -0.1332 indicates a weak negative monotonic relationship between Amazon prices and the stars of the product. This suggests that there is a slight tendency for products with higher prices to have slightly lower star ratings on Amazon, and vice versa, but the relationship is weak. As with the Pearson correlation, this implies that other factors may have a more significant impact on product ratings. Further analysis is required

In [22]:
# Spearman rank correlation between 'price' and 'stars', rows with stars > 0 only
rated_prices = filtered_df_stars_gt_0['price']
rated_stars = filtered_df_stars_gt_0['stars']
correlation = rated_prices.corr(rated_stars, method='spearman')
correlation


0.024659617205380468

In [None]:
# The same stands for the Spearman correlation. There is no direct correlation between price and stars

# Visualisation

# Scatter plot

In [None]:
# Scatter plot of 'stars' against 'price' for the entire df
sns.scatterplot(x='price', y='stars', data=df)

# Rotate the x-axis tick labels so they stay readable
plt.xticks(ha='right', rotation=45)

# Render the figure
plt.show()

In [None]:
# Scatter plot of 'stars' against 'price' for filtered_df
sns.scatterplot(x='price', y='stars', data=filtered_df)
plt.xticks(ha='right', rotation=45)
plt.show()

In [None]:
# Scatter plot of 'stars' against 'price' for the rows with stars > 0
sns.scatterplot(x='price', y='stars', data=filtered_df_stars_gt_0)
plt.xticks(ha='right', rotation=45)
plt.show()



In [None]:
# Regardless of the price, the ratings are spread all over the place, hinting at no correlation between star ratings and prices.

In [26]:
# Numeric columns with fewer than 20 distinct values are treated as potentially categorical
numeric_columns = filtered_df_stars_gt_0.select_dtypes("number")
is_low_cardinality = numeric_columns.nunique() < 20
potential_categorical_from_numerical = numeric_columns.loc[:, is_low_cardinality]
potential_categorical_from_numerical

0
1
2
3
4
...
2443639
2443642
2443644
2443649
2443650


# Heatmap

In [None]:

text_columns = filtered_df_stars_gt_0.select_dtypes("object")
numeric_columns = filtered_df_stars_gt_0.select_dtypes("number")

# Categorical view: object columns plus the low-cardinality numeric columns
df_categorical = pd.concat([text_columns, potential_categorical_from_numerical], axis=1)

# Numerical view: numeric columns minus the ones treated as categorical
df_numerical = numeric_columns.drop(columns=potential_categorical_from_numerical.columns)
correlation_matrix = df_numerical.corr()

# Annotated heatmap of the pairwise correlations
plt.figure(figsize=(18, 15))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap for Selected Numerical Variables")
plt.show()

# QQ Plot

In [None]:
# Q-Q plots of 'price' to check whether it follows a normal distribution, for
# filtered_df, filtered_df_stars_gt_0 and the full df (in that order)
for price_series in (filtered_df['price'], filtered_df_stars_gt_0['price'], df['price']):
    sm.qqplot(price_series, line='s')

# BONUS

In [None]:
# Row count per category on the full df (outliers included).
# The duplicate, discarded `df['category'].value_counts()` call was removed:
# it had no effect and only repeated the computation.
category_counts = df['category'].value_counts()

# The 20 categories with the most rows
top_20_categories = category_counts.head(20).index

filtered_df_top_20 = df[df['category'].isin(top_20_categories)]

# Violin plot of the 'price' distribution for each of those categories
sns.violinplot(data=filtered_df_top_20, x='category', y='price', palette="coolwarm")

plt.xticks(rotation=45, ha='right')

plt.show()


In [None]:
# It is quite hard to read the violin plot in this format. The outliers stretch the plot, and the image does not have a high enough resolution to zoom in and make out anything useful.

In [None]:
# BAR CHART
# Row count per category on the full df
category_counts = df['category'].value_counts()

# The ten categories with the most rows
top_10_categories = category_counts.index[:10]

# Restrict the full df to those ten categories
in_top_10 = df['category'].isin(top_10_categories)
filtered_df_top_10 = df.loc[in_top_10]

# Bar plot of the average 'price' per category, using the "coolwarm" palette
sns.barplot(x='category', y='price', data=filtered_df_top_10, palette="coolwarm")
plt.xticks(ha='right', rotation=45)
plt.show()

In [None]:
#BOX PLOTS
# Keep only the rows with a star rating above 0
has_rating = filtered_df_top_10['stars'] > 0
filtered_df_top_10 = filtered_df_top_10.loc[has_rating]

# Box plot of the spread of 'stars' within each 'category', using the "coolwarm" palette
sns.boxplot(x='category', y='stars', data=filtered_df_top_10, palette="coolwarm")
plt.xticks(ha='right', rotation=45)
plt.show()
# Handmade Clothing, Shoes & Accessories still has the highest median rating, followed by Birthday Gifts. We could start comparing how much revenue each section has, and also try to understand why Make-up, Sports & Outdoors, and Manicure & Pedicure Products have the lowest Q3 rating.

In [35]:
# CORRELATION COEFFICIENTS:
# Rows of the full df (outliers kept) with a star rating above 0; used by later cells
filtered_df_stars_gt_0 = df[df['stars'] > 0]
# Pearson correlation coefficient between 'price' and 'stars' on the full df:
# zero-star rows and price outliers are both included. The printed label used
# to say "filtered stars", which did not match this computation.
correlation = df['price'].corr(df['stars'])
print('The correlation with zero-star ratings and price outliers included is: ', correlation)

The correlation of filtered stars and outliers is:  -0.12490673262148386


In [None]:
#The Pearson correlation coefficient of -0.1249 suggests a weak negative linear relationship between Amazon prices and the stars of the product. This indicates that there is a slight tendency for products with higher prices to have slightly lower star ratings on Amazon, and vice versa. However, the strength of this relationship is quite weak, implying that other factors may play a more significant role in determining product ratings. Further analysis is required

In [None]:
# Scatter plot of 'stars' against 'price' for the full df
sns.scatterplot(x='price', y='stars', data=df)
plt.xticks(ha='right', rotation=45)
plt.show()

In [None]:
# As we can see, the interpretation is the same as before, but it is interesting to note that this is the best example of "expensive != good stars": the most expensive product has poor reviews, and most of the expensive products have poor reviews.

In [None]:
#HEATMAP
# Recompute the low-cardinality numeric columns from the current
# filtered_df_stars_gt_0. The previous value of
# potential_categorical_from_numerical was built from an earlier version of
# that frame, so reusing it made pd.concat(axis=1) align on a different index
# and relied on a stale column selection.
numeric_columns = filtered_df_stars_gt_0.select_dtypes("number")
potential_categorical_from_numerical = numeric_columns.loc[:, numeric_columns.nunique() < 20]

# Categorical view: object columns plus the low-cardinality numeric columns
df_categorical = pd.concat([filtered_df_stars_gt_0.select_dtypes("object"), potential_categorical_from_numerical], axis=1)

# Numerical view: numeric columns minus the ones treated as categorical
df_numerical = numeric_columns.drop(columns=potential_categorical_from_numerical.columns)
correlation_matrix = df_numerical.corr()

# Setting up the matplotlib figure with an appropriate size
plt.figure(figsize=(18, 15))

# Drawing the heatmap for the numerical columns
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")

plt.title("Correlation Heatmap for Selected Numerical Variables")
plt.show()

In [None]:
# There is not a great deviation between the heatmaps, at least not enough to change any conclusions.

In [None]:
#QQ PLOT
# Q-Q plots of 'price' for filtered_df_stars_gt_0 and for the full df (in that order)
for price_series in (filtered_df_stars_gt_0['price'], df['price']):
    sm.qqplot(price_series, line='s')

In [None]:
# There are no correlations amongst price and stars.