### Table of Contents
- [Import Data](#Import-Data)
- [Inspect Data](#Inspect-Data)

In [45]:
# Import pandas using alias pd, numpy as np
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from pathlib import Path

### Import Data
---------------

Load data stored in a CSV file into a DataFrame using the `pd.read_csv()` function. The dataset is from [Kaggle](https://www.kaggle.com/datasets/bhupendram/marketing-campaign).

In [41]:
# Location of the CSV version of the dataset, relative to the working directory.
# NOTE(review): 'csv_ile_path' looks like a typo for 'csv_file_path'; the name is
# kept unchanged in case other cells reference it.
csv_ile_path = Path('data/corrected_marketing_campaign.csv')

# Fail fast and name the offending location in the error. is_file() is used
# instead of exists() so that a directory with this name is rejected as well.
if not csv_ile_path.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{csv_ile_path.resolve()}'. Please check the path."
    )

# A comma is already the default delimiter of pd.read_csv; it is spelled out
# here to make the expected file format explicit.
df_marketing = pd.read_csv(csv_ile_path, sep=',')
print(f'Data loaded successfully: {df_marketing.shape[0]} rows, {df_marketing.shape[1]} columns.')

Data loaded successfully: 2240 rows, 29 columns.


Load data stored in an Excel file into a DataFrame using the `pd.read_excel()` function. To read `.xlsx` files with `pd.read_excel()`, the package `openpyxl` must be installed.

In [42]:
# Location of the Excel version of the dataset, relative to the working directory.
excel_file_path = Path('data/corrected_marketing_campaign.xlsx')

# Fail fast and name the offending location in the error. is_file() is used
# instead of exists() so that a directory with this name is rejected as well.
if not excel_file_path.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{excel_file_path.resolve()}'. Please check the path."
    )

# Read the worksheet by name; this rebinds df_marketing to the frame loaded
# from the Excel file.
df_marketing = pd.read_excel(excel_file_path, sheet_name='corrected_marketing_campaign')
print(f'Data loaded successfully: {df_marketing.shape[0]} rows, {df_marketing.shape[1]} columns.')

Data loaded successfully: 2240 rows, 29 columns.


### Inspect Data
----------------

##### Initial Overview

In [43]:
# --- 1. Basic information ---
# DataFrame.info() writes its report (index range, column names, non-null
# counts and dtypes) straight to stdout, so it is called as a statement
# instead of being wrapped in print().
# NOTE(review): the header string below says 'Basis' — presumably 'Basic' was
# meant; confirm before changing the printed text.
print('\nBasis Info:')
df_marketing.info()


Basis Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   2240 non-null   int64         
 1   Year_Birth           2240 non-null   int64         
 2   Education            2240 non-null   object        
 3   Marital_Status       2240 non-null   object        
 4   Income               2216 non-null   float64       
 5   Kidhome              2240 non-null   int64         
 6   Teenhome             2240 non-null   int64         
 7   Dt_Customer          2240 non-null   datetime64[ns]
 8   Recency              2240 non-null   int64         
 9   MntWines             2240 non-null   int64         
 10  MntFruits            2240 non-null   int64         
 11  MntMeatProducts      2240 non-null   int64         
 12  MntFishProducts      2240 non-null   int64         
 13  MntSweetProducts    

In [29]:
# --- 2. Summary statistics ---
# describe() summarises the columns (count, mean, min, quartiles, max, std);
# passing the result through print() renders it as a plain-text table.
print('\nSummary Statistics:')
print(df_marketing.describe())


Summary Statistics:
                 ID   Year_Birth         Income      Kidhome     Teenhome  \
count   2240.000000  2240.000000    2216.000000  2240.000000  2240.000000   
mean    5592.159821  1968.805804   52247.251354     0.444196     0.506250   
min        0.000000  1893.000000    1730.000000     0.000000     0.000000   
25%     2828.250000  1959.000000   35303.000000     0.000000     0.000000   
50%     5458.500000  1970.000000   51381.500000     0.000000     0.000000   
75%     8427.750000  1977.000000   68522.000000     1.000000     1.000000   
max    11191.000000  1996.000000  666666.000000     2.000000     2.000000   
std     3246.662198    11.984069   25173.076661     0.538398     0.544538   

                         Dt_Customer      Recency     MntWines    MntFruits  \
count                           2240  2240.000000  2240.000000  2240.000000   
mean   2013-07-10 10:01:42.857142784    49.109375   303.935714    26.302232   
min              2012-07-30 00:00:00     0.00000

In [30]:
# --- 3. Check the first few rows ---
# head() without an argument returns the first five rows; as the last
# expression of the cell, the result is displayed as a table.
df_marketing.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [48]:
# --- 4. Missing values & duplicates check ---

def check_data_quality(dataframe):
    """Print a short data-quality report for ``dataframe``.

    Two sections are written to stdout, each preceded by a blank line:

    * ``Missing Values:`` followed by the number of null entries per column.
    * ``Duplicates:`` followed by the number of rows that are exact
      duplicates of an earlier row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
        The report is only printed.
    """
    # isna() is the same check as isnull(); summing the boolean mask counts
    # the missing entries column by column.
    null_counts = dataframe.isna().sum()
    print('\nMissing Values:', null_counts, sep='\n')

    # duplicated() marks every repeat of an earlier row, so the sum is the
    # number of redundant rows.
    duplicate_rows = dataframe.duplicated().sum()
    print('\nDuplicates:', duplicate_rows, sep='\n')

# Print the missing-value and duplicate-row report for the loaded dataset.
check_data_quality(df_marketing)


Missing Values:
ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64

Duplicates:
0
