In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the marketing-campaign CSV (path is relative to the working directory)
wa_marketing_data = pd.read_csv(filepath_or_buffer='WA_Marketing-Campaign.csv')

# Preview the top five rows as this cell's output
wa_marketing_data.head(5)

In [None]:
# Number of null entries in each column
missing_values = wa_marketing_data.isna().sum()

# dtype pandas assigned to each column, to spot columns parsed as an unexpected type
data_types = wa_marketing_data.dtypes

# Side-by-side summary with one row per column of the dataset
analysis_result = pd.concat(
    [missing_values.rename('Missing Values'), data_types.rename('Data Type')],
    axis=1,
)
analysis_result

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns;
# the quartiles are spelled out explicitly and match describe()'s defaults
numerical_stats = wa_marketing_data.describe(percentiles=[0.25, 0.5, 0.75])
numerical_stats

In [None]:
# How many rows fall into each 'MarketSize' category (most frequent first)
market_size_counts = wa_marketing_data.loc[:, 'MarketSize'].value_counts()
market_size_counts

In [None]:
# Two vertically stacked panels, one histogram per column
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))

# (column, bar colour) for the top and the bottom panel respectively
histogram_specs = [('AgeOfStore', 'skyblue'), ('SalesInThousands', 'salmon')]

for panel, (column, bar_color) in zip(ax, histogram_specs):
    panel.hist(wa_marketing_data[column], bins=20, color=bar_color, edgecolor='black')
    panel.set_title(f'Distribution of {column}')
    panel.set_xlabel(column)
    panel.set_ylabel('Frequency')

# Keep titles and axis labels of the two panels from overlapping
fig.tight_layout()
plt.show()

In [None]:
# Mean sales for every (week, promotion) pair, flattened back into ordinary columns
weekly_sales = (
    wa_marketing_data
    .groupby(['week', 'Promotion'])['SalesInThousands']
    .mean()
    .reset_index()
)

# One line per promotion showing how its average sales move from week to week
sales_fig, sales_ax = plt.subplots(figsize=(12, 7))
for promo in weekly_sales['Promotion'].unique():
    promo_rows = weekly_sales.loc[weekly_sales['Promotion'] == promo]
    sales_ax.plot(
        promo_rows['week'],
        promo_rows['SalesInThousands'],
        label=f'Promotion {promo}',
        marker='o',
    )

sales_ax.set_title('Average Sales Over Time by Promotion')
sales_ax.set_xlabel('Week')
sales_ax.set_ylabel('Average Sales (in Thousands)')
sales_ax.legend()
sales_ax.grid(True, which='both', linestyle='--', linewidth=0.5)
# Put a tick on every week that occurs in the data
sales_ax.set_xticks(weekly_sales['week'].unique())
plt.show()

In [None]:
# Mean sales for every (week, promotion, market size) combination
weekly_sales_market = (
    wa_marketing_data
    .groupby(['week', 'Promotion', 'MarketSize'])['SalesInThousands']
    .mean()
    .reset_index()
)

# Line colour encodes the promotion, marker shape encodes the market size
promo_colors = {1: 'red', 2: 'blue', 3: 'green'}
market_markers = {'Small': 'o', 'Medium': '^', 'Large': 's'}

plt.figure(figsize=(14, 8))
promotion_ids = weekly_sales_market['Promotion'].unique()
market_sizes = weekly_sales_market['MarketSize'].unique()
for promo in promotion_ids:
    is_promo = weekly_sales_market['Promotion'] == promo
    for market in market_sizes:
        is_market = weekly_sales_market['MarketSize'] == market
        segment = weekly_sales_market[is_promo & is_market]
        plt.plot(
            segment['week'],
            segment['SalesInThousands'],
            label=f'Promotion {promo} - {market}',
            marker=market_markers[market],
            color=promo_colors[promo],
            linestyle='-',
        )

plt.title('Average Sales Over Time by Promotion and Market Size')
plt.xlabel('Week')
plt.ylabel('Average Sales (in Thousands)')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
# Put a tick on every week that occurs in the data
plt.xticks(weekly_sales_market['week'].unique())
plt.show()

From the plot:

Promotion 1:

Sales in Medium and Large markets are relatively stable across the weeks.
Sales in Small markets show a slight increase in the third week.

Promotion 2:

Sales in Medium markets start high in the first week but see a decline in the subsequent weeks.
Sales in Large markets show a consistent decline over the weeks.
Sales in Small markets are relatively stable.

Promotion 3:

Sales in Medium markets show a consistent upward trend over the weeks.
Sales in Large markets start lower in the first week but show an increase in the subsequent weeks.
Sales in Small markets are relatively stable.

The overlay of market sizes provides insights into how each promotion performed in different market segments over time. For instance, Promotion 3 seems to be particularly effective in Medium markets, showing a consistent upward trend.

In [None]:
# Week-over-week growth rate (in %) of average sales, computed within each promotion.
# Rows are ordered by promotion and then week so that pct_change compares every week
# with the week immediately before it. The earliest week of each promotion has no
# predecessor, so its GrowthRate is NaN. A single groupby replaces the manual
# filter / assign / concat loop and never writes into a filtered slice.
growth_rates_df = weekly_sales.sort_values(by=['Promotion', 'week'])
growth_rates_df = growth_rates_df.assign(
    GrowthRate=growth_rates_df.groupby('Promotion')['SalesInThousands'].pct_change() * 100
)

# Plotting the growth rates over time for each promotion
plt.figure(figsize=(12, 7))
for promo in growth_rates_df['Promotion'].unique():
    subset = growth_rates_df[growth_rates_df['Promotion'] == promo]
    plt.plot(subset['week'], subset['GrowthRate'], label=f'Promotion {promo}', marker='o')

plt.title('Week-over-Week Sales Growth Rate by Promotion')
plt.xlabel('Week')
plt.ylabel('Growth Rate (%)')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.xticks(growth_rates_df['week'].unique())
# Reference line separating growth (above) from decline (below)
plt.axhline(y=0, color='black', linestyle='-')
plt.show()

Here's the visualization of the week-over-week sales growth rate for each promotion:

Promotion 1: The growth rate fluctuates around the zero line, indicating relatively stable sales with a slight increase in the third week.

Promotion 2: There's a sharp decline in the growth rate after the first week, indicating a drop in sales.

Promotion 3: The growth rate shows a consistent positive trend, indicating increasing sales over the weeks.

From this visualization:

Promotion 3 shows a consistent positive growth rate, suggesting that its effectiveness increased over time.

Promotion 2 had a strong start but couldn't sustain its performance.

Promotion 1 had relatively stable sales with minor fluctuations.

In [None]:
# Mean sales for every promotion / market-size pair, pivoted so that rows are
# promotions and columns are market sizes
promo_market_sales = (
    wa_marketing_data
    .groupby(['Promotion', 'MarketSize'])['SalesInThousands']
    .mean()
    .unstack()
)

# One fill colour per market-size column
colors = ['lightcoral', 'lightskyblue', 'lightgreen']

# Stacked bars: one bar per promotion, one segment per market size
ax = promo_market_sales.plot.bar(stacked=True, figsize=(10, 7), color=colors)
ax.set_title('Average Sales by Promotion and Market Size')
ax.set_xlabel('Promotion')
ax.set_ylabel('Average Sales (in Thousands)')
# Keep the promotion labels horizontal
plt.xticks(rotation=0)
ax.legend(title='Market Size')
plt.tight_layout()
plt.show()

From the chart:

Promotion 1: Has a balanced contribution from all three market sizes, with the Large market contributing the most.

Promotion 2: The Medium market size contributes the most to the sales, followed by the Large and Small market sizes.

Promotion 3: The Medium market size again contributes the most, but the Small market size has a more significant contribution compared to Promotion 2.

This visualization provides insights into how each promotion performed in different market segments.

In [None]:
# Split the rows into one group per promotion
promotion_groups = wa_marketing_data.groupby(by='Promotion')

# Mean of SalesInThousands within each group, indexed by promotion
average_sales_per_promotion = promotion_groups['SalesInThousands'].agg('mean')
average_sales_per_promotion

In [None]:
# Wedge inputs: one wedge per promotion, sized by that promotion's average sales
labels, sizes = average_sales_per_promotion.index, average_sales_per_promotion.values
colors = ['gold', 'lightskyblue', 'lightcoral']
explode = (0.1, 0, 0)  # pull only the first wedge away from the centre

# Styling shared by the single pie call below
pie_style = dict(
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)

plt.figure(figsize=(10, 6))
plt.pie(sizes, **pie_style)
plt.title('Average Sales Distribution for Each Promotion')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Promotion label with the largest average sales, and that largest average itself
best_promotion = average_sales_per_promotion.idxmax()
best_promotion_sales = average_sales_per_promotion.loc[best_promotion]
(best_promotion, best_promotion_sales)

The best promotion, based on the highest average sales, is Promotion 1 with an average sales value of approximately 58.10 thousand.

This suggests that Promotion 1 was the most effective in driving sales among the three promotions tested.

In [None]:
import scipy.stats as stats

# One Series of sales figures for each of the promotions coded 1, 2 and 3
sales_promo_1, sales_promo_2, sales_promo_3 = (
    wa_marketing_data.loc[wa_marketing_data['Promotion'] == promo_id, 'SalesInThousands']
    for promo_id in (1, 2, 3)
)

# One-way ANOVA across the three samples: tests whether all group means are equal
anova_result = stats.f_oneway(sales_promo_1, sales_promo_2, sales_promo_3)
f_statistic, p_value = anova_result
f_statistic, p_value

The p-value is extremely small, which indicates that there is a statistically significant difference in the average sales between the three promotions. In other words, the observed differences in average sales for the promotions are unlikely to have occurred by random chance.

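Given the results, we can conclude that average sales differ between at least two of the promotions. ANOVA on its own does not identify which promotions differ from each other; that requires a post-hoc pairwise comparison.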

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey HSD: every pairwise comparison of mean sales between promotions, at alpha = 0.05
sales_values = wa_marketing_data['SalesInThousands']
promotion_labels = wa_marketing_data['Promotion']
tukey_results = pairwise_tukeyhsd(endog=sales_values, groups=promotion_labels, alpha=0.05)

# Tabular summary (one row per pair of promotions) shown as the cell output
tukey_summary = tukey_results.summary()
tukey_summary

In summary:

Promotion 1 significantly outperforms Promotion 2.

There's no significant difference between Promotions 1 and 3.

Promotion 3 significantly outperforms Promotion 2.

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Fit a one-way ANOVA of sales on promotion (promotion treated as a categorical factor)
model = ols('SalesInThousands ~ C(Promotion)', data=wa_marketing_data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Rows of the ANOVA table are addressed by label rather than by integer position.
ss_between = anova_table.loc['C(Promotion)', 'sum_sq']
ss_within = anova_table.loc['Residual', 'sum_sq']

# Mean squares (sum of squares / degrees of freedom). Their ratio is the F-statistic,
# which grows with sample size and is therefore NOT an effect size.
between_group_variance = ss_between / anova_table.loc['C(Promotion)', 'df']
within_group_variance = ss_within / anova_table.loc['Residual', 'df']

# Cohen's f = sqrt(eta^2 / (1 - eta^2)) with eta^2 = SS_between / (SS_between + SS_within),
# which simplifies to sqrt(SS_between / SS_within). It must be built from the sums of
# squares; using the mean squares instead would yield sqrt(F).
cohens_f = (ss_between / ss_within) ** 0.5
cohens_f

Given the general guidelines for interpreting Cohen's f:

Small effect: f = 0.10

Medium effect: f = 0.25

Large effect: f = 0.40

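Our calculated value of 4.69 is far above the threshold for a "large" effect. Taken at face value, this would indicate that the effect of the promotions on sales is not only statistically significant (as we found with the ANOVA test) but also practically significant with a very strong magnitude of effect. However, 4.69 is the square root of the ratio of the mean squares, i.e. √F, whereas Cohen's f is defined as √(SS_between / SS_within) = √(η² / (1 − η²)). √F grows with the sample size and cannot be compared with the thresholds above, so this interpretation should be re-checked against Cohen's f computed from the sums of squares.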

In [None]:
import itertools
def cohens_d(group1, group2):
    """Calculate Cohen's d for pairwise comparison.

    Cohen's d is the difference between the two group means divided by the
    pooled standard deviation.

    Parameters
    ----------
    group1, group2 : pandas.Series or numpy.ndarray
        Numeric samples. Anything supporting ``len()``, ``.mean()`` and
        ``.var(ddof=1)`` works.

    Returns
    -------
    float
        ``(mean(group1) - mean(group2)) / pooled_std``. Positive when group1
        has the larger mean. NaN if a group has fewer than two observations.
    """
    n1, n2 = len(group1), len(group2)
    mean_diff = group1.mean() - group2.mean()
    # Pooled variance weights each unbiased sample variance by its degrees of
    # freedom (n - 1) and divides by the total degrees of freedom. Weighting by
    # n and dividing by n1 + n2 is only correct when both groups have equal size.
    # ddof=1 is passed explicitly so Series and ndarray inputs behave the same.
    pooled_var = (
        (n1 - 1) * group1.var(ddof=1) + (n2 - 1) * group2.var(ddof=1)
    ) / (n1 + n2 - 2)
    pooled_std = np.sqrt(pooled_var)
    return mean_diff / pooled_std
# Cohen's d for every unordered pair of promotions.
# The promotion codes are sorted first so that the pair labels, the bar order and
# the sign of d (mean of the first promotion minus mean of the second) do not depend
# on the order in which promotions happen to appear in the CSV rows.
promotions = sorted(wa_marketing_data['Promotion'].unique())
effect_sizes = {}
for promo1, promo2 in itertools.combinations(promotions, 2):
    group1 = wa_marketing_data.loc[wa_marketing_data['Promotion'] == promo1, 'SalesInThousands']
    group2 = wa_marketing_data.loc[wa_marketing_data['Promotion'] == promo2, 'SalesInThousands']
    effect_sizes[f'Promo {promo1} vs Promo {promo2}'] = cohens_d(group1, group2)

# Visualize the effect sizes: one bar per pair; bars below zero mean the second
# promotion of the pair has the higher mean
plt.figure(figsize=(10, 6))
plt.bar(list(effect_sizes.keys()), list(effect_sizes.values()), color=['blue', 'green', 'red'])
plt.title('Effect Sizes (Cohens d) for Pairwise Promotion Comparisons')
plt.ylabel('Effect Size (Cohens d)')
plt.axhline(y=0, color='black', linestyle='-')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Promo 1 vs Promo 2: The effect size is positive, indicating that Promo 1 has a higher average sales value than Promo 2.

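Promo 1 vs Promo 3: The effect size is negative, which would indicate that Promo 3 has a higher average sales value than Promo 1. Note that this conflicts with the earlier finding that Promotion 1 has the highest average sales, so the order of the promotions in each pair label on the chart should be verified before relying on the sign.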

Promo 2 vs Promo 3: The effect size is negative, indicating that Promo 3 has a higher average sales value than Promo 2.

The magnitude of Cohen's d provides insights into the size of the difference between the promotions. A larger absolute value indicates a larger difference in effect.

In [None]:
# Two separate models, each crossing Promotion with one candidate moderator.
# `*` in the formula expands to both main effects plus their interaction.
market_formula = 'SalesInThousands ~ C(Promotion) * C(MarketSize)'  # MarketSize as categorical
age_formula = 'SalesInThousands ~ C(Promotion) * AgeOfStore'        # AgeOfStore entered without C()

interaction_model_market = ols(formula=market_formula, data=wa_marketing_data).fit()
interaction_model_age = ols(formula=age_formula, data=wa_marketing_data).fit()

# Type-2 ANOVA tables; the cell shows both as a tuple (market-size model first)
interaction_summary_market = sm.stats.anova_lm(interaction_model_market, typ=2)
interaction_summary_age = sm.stats.anova_lm(interaction_model_age, typ=2)
(interaction_summary_market, interaction_summary_age)

Here are the results of the interaction effects:

### Interaction between Promotion and MarketSize

Promotion: F-statistic = 56.24, p-value < 0.001

MarketSize: F-statistic = 337.14, p-value < 0.001

Interaction Term (Promotion x MarketSize): F-statistic = 4.59, p-value = 0.0012

The interaction between Promotion and MarketSize is statistically significant, suggesting that the effect of promotions on sales varies depending on the market size.

### Interaction between Promotion and AgeOfStore

Promotion: F-statistic = 22.27, p-value < 0.001

AgeOfStore: F-statistic = 0.94, p-value = 0.331

Interaction Term (Promotion x AgeOfStore): F-statistic = 2.10, p-value = 0.124

The interaction between Promotion and AgeOfStore is not statistically significant at the 0.05 level, suggesting that the effect of promotions on sales does not vary significantly with the age of the store.

In [None]:
import seaborn as sns
# Overlaid kernel density estimates of sales, one filled curve per promotion
plt.figure(figsize=(12, 7))
for promo in (1, 2, 3):
    promo_sales = wa_marketing_data.loc[wa_marketing_data['Promotion'] == promo, 'SalesInThousands']
    # `fill=True` is the current spelling of the deprecated `shade=True` keyword
    sns.kdeplot(promo_sales, label=f'Promotion {promo}', fill=True)
plt.title('Density Plot of Sales Distributions for Each Promotion')
plt.xlabel('Sales (in Thousands)')
plt.ylabel('Density')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

Here's the density plot of sales distributions for each promotion:

Promotion 1: The distribution for Promotion 1 is slightly right-skewed, with a peak around 50-60 thousand in sales.

Promotion 2: The distribution for Promotion 2 is more spread out and has a broader peak around 40-60 thousand in sales.

Promotion 3: The distribution for Promotion 3 is also right-skewed, with a peak around 40-50 thousand in sales.

From the density plots, we can observe that while the distributions of sales for the three promotions have some overlap, there are distinct differences in their shapes and peaks. This visualization complements our earlier statistical analyses, providing a visual representation of the sales distributions for each promotion.