In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm

%matplotlib inline

In [None]:
# Source CSV for the whole analysis.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until it is pointed at a local copy of the file.
DATASET_CSV = "C:/Users/evacv/Documents/bootcamp Data analysis/archive/amz_uk_price_prediction_dataset.csv"

# Load the raw data; the cells below read from `df`.
df = pd.read_csv(DATASET_CSV)

### Part 1: Analyzing Best-Seller Trends Across Product Categories
**Objective**: Understand the relationship between product categories and their best-seller status.
1. **Crosstab Analysis**:
Create a crosstab between the product `category` and the `isBestSeller` status.
Are there categories where being a best-seller is more prevalent?

In [None]:
# Contingency table of counts: one row per `category` value, one column per
# `isBestSeller` value.
category_col = df['category']
best_seller_col = df['isBestSeller']
crosstab_result = pd.crosstab(index=category_col, columns=best_seller_col)

# Display the table as the cell output.
crosstab_result

2. **Statistical Tests**:
    - Conduct a Chi-square test to determine if the best-seller distribution is independent of the product category.
    - Compute Cramér's V to understand the strength of association between best-seller status and category.

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between `category` and `isBestSeller`,
# run on the contingency table of counts held in `crosstab_result`.
chi2_statistic, chi2_p_value, _, _ = chi2_contingency(crosstab_result)

# Cramér's V measures the strength of the association on a 0-1 scale:
#     V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
# where n is the total number of observations in the table.
n_observations = crosstab_result.to_numpy().sum()
min_dimension = min(crosstab_result.shape) - 1

# A table with a single row or column has no association to measure; report
# NaN instead of dividing by zero.
if min_dimension > 0 and n_observations > 0:
    cramers_v = np.sqrt(chi2_statistic / (n_observations * min_dimension))
else:
    cramers_v = np.nan

# Show the test statistic, its p-value and the effect size together.
chi2_statistic, chi2_p_value, cramers_v

3. **Visualizations**:
	- Visualize the relationship between product categories and the best-seller status using a stacked bar chart.

In [None]:
# Stacked bar chart of best-seller vs. non-best-seller counts per category.
#
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, so a
# preceding plt.figure(figsize=...) would only leave an empty figure behind and
# its size would be ignored. Create the Axes explicitly and draw onto it.
fig, ax = plt.subplots(figsize=(10, 6))
crosstab_result.plot(kind='bar', stacked=True, ax=ax)

ax.set_title('Relación entre Categorías de Productos y Estado de Mejor Vendedor')
ax.set_xlabel('Categoría de Producto')
ax.set_ylabel('Cantidad de Productos')
plt.xticks(rotation=45)  # acts on `ax`, the current Axes
ax.legend(title='isBestSeller', loc='upper right')
plt.show()

### Part 2: Exploring Product Prices and Ratings Across Categories and Brands

**Objective**: Investigate how different product categories influence product prices.

0. **Preliminary Step: Remove outliers in product prices.**

	For this purpose, we can use the IQR (Interquartile Range) method. Products priced below the first quartile minus 1.5 times the IQR or above the third quartile plus 1.5 times the IQR will be considered outliers and removed from the dataset. The next steps will be done with the dataframe without outliers.
	
	*Hint: you can check the last Check For Understanding at the end of the lesson EDA Bivariate Analysis for a hint on how to do this.*






In [None]:
# IQR rule for price outliers: a price is kept when it lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], bounds included.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences below/above which a price counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of rows whose price falls inside the fences (inclusive).
price_within_fences = df['price'].between(lower_bound, upper_bound)
df_no_outliers = df[price_within_fences]
df_no_outliers

1. **Violin Plots**:
    - Use a violin plot to visualize the distribution of `price` across different product `categories`. Keep only the 20 categories with the most products (by count) for better visualization.
    - Which product category tends to have the highest median price? Don't filter here by top categories.


In [None]:
# Price distribution per category for the 20 most frequent categories.
#
# Part 2 is meant to run on the outlier-free frame (`df_no_outliers`, built by
# the IQR step), so both the category ranking and the plotted prices come from
# it rather than from the raw `df`.
top_categories = df_no_outliers['category'].value_counts().head(20).index
df_top_categories = df_no_outliers[df_no_outliers['category'].isin(top_categories)]

fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='category', y='price', data=df_top_categories, ax=ax)
ax.set_title('Distribution of Price Across Different Product Categories')
ax.set_xlabel('Product Category')
ax.set_ylabel('Price')
plt.xticks(rotation=90)  # category names are long; rotate so they do not overlap
plt.show()





In [None]:
# Median price per category over ALL categories (no top-N filter), computed on
# the outlier-free frame as required for Part 2.
median_price_by_category = df_no_outliers.groupby('category')['price'].median()

# Label of the category whose median price is the largest.
highest_median_category = median_price_by_category.idxmax()
print("Product category with the highest median price:", highest_median_category)

2. **Bar Charts**:
    - Create a bar chart comparing the average price of products for the top 10 product categories (based on count).
    - Which product category commands the highest average price? Don't filter here by top categories.

In [None]:
# Average price for the 10 most frequent categories.
#
# Part 2 runs on the outlier-free frame (`df_no_outliers`), so the ranking by
# count and the averages are both taken from it instead of the raw `df`.
top_10_categories = df_no_outliers['category'].value_counts().nlargest(10).index
df_top_10_categories = df_no_outliers[df_no_outliers['category'].isin(top_10_categories)]

# Mean price per category, highest first so the bars read left to right.
average_prices = (
    df_top_10_categories
    .groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)

# Draw on an explicit Axes so the figure size is guaranteed to apply.
fig, ax = plt.subplots(figsize=(10, 6))
average_prices.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Average Price of Products for Top 10 Product Categories')
ax.set_xlabel('Product Category')
ax.set_ylabel('Average Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Mean price per category over ALL categories (no top-N filter), computed on
# the outlier-free frame as required for Part 2.
average_price_by_category = df_no_outliers.groupby('category')['price'].mean()

# Find the category with the highest average price, and that price.
category_with_highest_avg_price = average_price_by_category.idxmax()
highest_avg_price = average_price_by_category.max()

print("Product category with the highest average price:", category_with_highest_avg_price)
print("Highest average price:", highest_avg_price)

3. **Box Plots**:
    - Visualize the distribution of product `ratings` based on their `category` using side-by-side box plots. Keep only the 10 categories with the most products (by count) for better visualization.
    - Which category tends to receive the highest median rating from customers? Don't filter here by top categories.

In [None]:
# Rating distribution for the 10 most frequent categories.
#
# This is still a Part 2 step, which is specified to run on the outlier-free
# frame (`df_no_outliers`), so it is used here instead of the raw `df`.
top_10_categories = df_no_outliers['category'].value_counts().nlargest(10).index
df_top_10_categories = df_no_outliers[df_no_outliers['category'].isin(top_10_categories)]

# Side-by-side box plots, one box per category.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='category', y='stars', data=df_top_10_categories, palette='Set3', ax=ax)
ax.set_title('Distribution of Product Ratings Across Top 10 Product Categories')
ax.set_xlabel('Product Category')
ax.set_ylabel('Rating')
plt.xticks(rotation=45)
plt.show()

# Category with the highest median rating, over ALL categories (no top-N filter).
median_stars_by_category = df_no_outliers.groupby('category')['stars'].median()
highest_median_rating_category = median_stars_by_category.idxmax()
print("Category that tends to receive the highest median stars:", highest_median_rating_category)

### Part 3: Investigating the Interplay Between Product Prices and Ratings

**Objective**: Analyze how product ratings (`stars`) correlate with product prices.

1. **Correlation Coefficients**:
    - Calculate the correlation coefficient between `price` and `stars`.
    - Is there a significant correlation between product price and its rating?
	


In [None]:
# Pearson correlation (the pandas default) between product price and rating.
price_values = df['price']
star_values = df['stars']
correlation_coefficient = price_values.corr(star_values, method='pearson')

# Show the correlation coefficient.
print("Correlation coefficient between price and stars:", correlation_coefficient)

# Author's interpretation of the result: the Pearson coefficient between price
# and rating came out at approximately -0.125. A negative coefficient indicates
# an inverse relationship between the two variables, i.e. as price goes up
# ratings tend to go down, and vice versa. The magnitude is relatively small,
# however, which points to only a weak correlation between price and rating.

2. **Visualizations**:
    - Use a scatter plot to visualize the relationship between product rating and price. What patterns can you observe?
    

In [None]:
# Scatter plot of price (y) against rating (x), one point per row of `df`.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='stars', y='price', ax=ax)
ax.set_title('Relationship between Product Rating and Price')
ax.set_xlabel('Product Rating')
ax.set_ylabel('Price')
plt.show()

- Use a correlation heatmap to visualize correlations between all numerical variables.
  

In [None]:
# Keep only the int64 / float64 columns; correlations are computed on these.
numeric_dtypes = ['int64', 'float64']
df_numerical = df.select_dtypes(include=numeric_dtypes)

# Pairwise correlation matrix of the numerical columns.
correlation_matrix = df_numerical.corr()

# Large canvas so the annotated cells stay readable.
fig, ax = plt.subplots(figsize=(18, 15))

# Heatmap with the coefficient written inside each cell.
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)

ax.set_title("Correlation Heatmap for Numerical Variables")
plt.show()


  - Examine if product prices typically follow a normal distribution using a QQ plot. 

In [None]:
# QQ plot of `price` against theoretical normal quantiles; line='s' adds the
# standardized reference line. sm.qqplot creates and returns its own figure.
qq_figure = sm.qqplot(df['price'], line='s')

# Label the single Axes that the QQ plot was drawn on.
qq_axes = qq_figure.axes[0]
qq_axes.set_title("QQ Plot for Product Prices")
qq_axes.set_xlabel("Theoretical Quantiles")
qq_axes.set_ylabel("Sample Quantiles")
plt.show()