# In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
try:
	import statsmodels.api as sm
except Exception:
	sm = None
	warnings.warn("statsmodels is not installed; continuing without it")

from scipy import stats
from scipy.stats import ttest_1samp
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
from scipy.stats import boxcox
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# As a data scientist, you must conduct exploratory data analysis and hypothesis testing to enhance your comprehension of the diverse factors influencing customer acquisition.
# Data description:

# The dataset aligns with the Four Ps of Marketing, categorizing variables to analyze consumer behavior. Product-related variables track spending across categories, 
# while Price factors like income and deal-based purchases indicate affordability. Place covers shopping channels and web visits, reflecting purchase preferences. 
# Promotion measures campaign engagement, complaints, and recency. Additionally, demographics support segmentation for personalized marketing. 
# This structured approach helps businesses optimize products, pricing, distribution, and promotions for better customer engagement and market performance.

# Load the raw marketing dataset and print a first overview of it.
df = pd.read_csv('marketing_data.csv')
print(df)
print("==============================================")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("==============================================")
print(df.describe())
print("====================NULLS FOR INCOME==========================")
# Missing-value count for every column of the frame.
print(df.isnull().sum())
print("=================SOME COLUMS=============================")
# Normalise the headers (drop spaces, lower-case) so the rest of the script
# can address columns by a space-free, lower-case name such as 'income'.
df.columns = df.columns.str.replace(' ', '').str.lower()
if 'income' in df.columns:
    # Remove '$' and ',' before converting to numbers; errors='coerce' turns
    # anything that still cannot be parsed into NaN instead of raising.
    df['income'] = pd.to_numeric(
        df['income'].astype(str).str.replace(r'[$,]', '', regex=True),
        errors='coerce',
    )
print(df['income'])
print("====================MEDIAN INCOME==========================")
print(df['income'].median())
print("====================MEAN INCOME==========================")
print(df['income'].mean())
print("====================MEAN INCOME WITH EDUCATION M_STATUS==========================")
# Mean income of every (education, marital_status) group. pandas ignores NaN
# incomes when averaging, so each mean is based on observed values only.
income_group_keys = ['education', 'marital_status']
avg_data = df.groupby(income_group_keys)['income'].mean()
print(avg_data)

# Impute ONLY the missing incomes with the mean of the row's own group.
# transform('mean') returns one group mean per row, aligned with df's index,
# and fillna leaves every observed income untouched. (Assigning the group mean
# to all rows of a group would erase the real income values.)
# Rows whose whole group has no observed income stay NaN.
income_group_mean = df.groupby(income_group_keys)['income'].transform('mean')
df['income'] = df['income'].fillna(income_group_mean)

print("===================SPENDING BY YEAR OF BIRTH===========================")
# Mean income per (year_birth, education, marital_status) group. The series
# keeps the name avg_spending_by_yob because the plotting code below reads it.
yob_group_keys = ['year_birth', 'education', 'marital_status']
avg_spending_by_yob = df.groupby(yob_group_keys)['income'].mean()
print(avg_spending_by_yob)

# Fill any income that is still missing with the mean of its finer-grained
# group. This is a single vectorized pass instead of one boolean-mask scan per
# (year, education, marital status) combination, and it never overwrites an
# income that is already present.
yob_group_mean = df.groupby(yob_group_keys)['income'].transform('mean')
df['income'] = df['income'].fillna(yob_group_mean)

print("==============================================")
# Flatten the grouped means into a regular frame with one row per
# (year_birth, education, marital_status) and the mean in 'avg_income'.
df_plot = avg_spending_by_yob.reset_index(name='avg_income')
# Make sure the x axis is numeric, then order the rows by birth year so the
# frame is plotted (and reads) left to right.
df_plot['year_birth'] = pd.to_numeric(df_plot['year_birth'], errors='coerce')
df_plot = df_plot.sort_values('year_birth')
# choose an education level to plot (example)
edu = 'PhD'   # change to one of your education values
# Keep only the chosen education level and drop rows that cannot be drawn.
sel = df_plot[df_plot['education'] == edu].dropna(subset=['avg_income', 'year_birth'])

# One line per marital status: average income against year of birth.
sns.lineplot(data=sel, x='year_birth', y='avg_income', hue='marital_status', marker='o')
# \u2014 is an em dash; the escape keeps the title intact even if the file is
# re-saved with a different encoding (the literal had been garbled before).
plt.title(f'Average Income by YOB \u2014 Education: {edu}')
plt.show()