In [7]:
import pandas as pd

# Locations of the raw (tab-separated) export and of the cleaned result.
RAW_CSV_PATH = 'marketing_campaign (1).csv'
CLEANED_CSV_PATH = 'cleaned_marketing_campaign.csv'


def clean_marketing_data(raw: pd.DataFrame, current_year=None) -> pd.DataFrame:
    """Return a cleaned copy of the marketing-campaign table.

    The input frame is left untouched. Steps, in order:

    1. Column names are stripped and lower-cased.
    2. Missing ``income`` values are replaced by the column mean.
    3. ``dt_customer`` is parsed as a day-first date; unparseable values
       become ``NaT``.
    4. ``education`` and ``marital_status`` are stripped and lower-cased.
    5. Rows with any null and exact duplicate rows are dropped.
    6. ``age`` is derived from ``year_birth``; rows outside 18..100
       (inclusive) are dropped.
    7. Columns are renamed to Title_Case_With_Underscores.

    Normalisation (steps 3-4) deliberately runs *before* row filtering
    (step 5): otherwise ``NaT`` values produced by the date coercion would
    survive into the result, and rows differing only in case/whitespace
    would not be recognised as duplicates.

    Args:
        raw: Table with (case/whitespace-insensitive) columns ``income``,
            ``dt_customer``, ``education``, ``marital_status`` and
            ``year_birth``.
        current_year: Reference year (int) for the age calculation.
            Defaults to the current calendar year; pass an explicit value
            for reproducible results.

    Returns:
        A new, independent DataFrame with the cleaned rows and an
        additional ``Age`` column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    if current_year is None:
        current_year = pd.Timestamp.now().year

    df = raw.copy()

    # Clean column names
    df.columns = df.columns.str.strip().str.lower()

    # Impute income with the mean of the observed values
    df['income'] = df['income'].fillna(df['income'].mean())

    # Convert dt_customer to datetime (bad values -> NaT, removed below)
    df['dt_customer'] = pd.to_datetime(df['dt_customer'], dayfirst=True, errors='coerce')

    # Standardize categorical values
    for column in ('education', 'marital_status'):
        df[column] = df[column].str.strip().str.lower()

    # Drop incomplete rows and duplicates only after normalisation; the
    # trailing copy() makes the result independent of the intermediate frames.
    df = df.dropna().drop_duplicates().copy()

    # Create an age column from year_birth
    df['age'] = (current_year - df['year_birth']).astype(int)

    # Remove unrealistic ages (between() is inclusive on both ends)
    df = df[df['age'].between(18, 100)].copy()

    # Rename columns to be clean & readable (title case and underscores)
    return df.rename(columns=lambda x: x.replace('_', ' ').title().replace(' ', '_'))


# Run the pipeline when executed as a notebook cell or script (where
# __name__ is "__main__"); importing this code elsewhere touches no files.
if __name__ == "__main__":
    # Load the dataset with correct separator
    df = pd.read_csv(RAW_CSV_PATH, sep='\t')

    df = clean_marketing_data(df)

    # Save cleaned data
    df.to_csv(CLEANED_CSV_PATH, index=False)

    # Summary report
    print("\n✅ Summary of Cleaning:\n")
    print(f"Final shape: {df.shape}")
    print("Null values per column:\n", df.isnull().sum())
    print("\nColumn types:\n", df.dtypes)



✅ Summary of Cleaning:

Final shape: (2237, 30)
Null values per column:
 Id                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
Mntwines               0
Mntfruits              0
Mntmeatproducts        0
Mntfishproducts        0
Mntsweetproducts       0
Mntgoldprods           0
Numdealspurchases      0
Numwebpurchases        0
Numcatalogpurchases    0
Numstorepurchases      0
Numwebvisitsmonth      0
Acceptedcmp3           0
Acceptedcmp4           0
Acceptedcmp5           0
Acceptedcmp1           0
Acceptedcmp2           0
Complain               0
Z_Costcontact          0
Z_Revenue              0
Response               0
Age                    0
dtype: int64

Column types:
 Id                              int64
Year_Birth                      int64
Education                      object
Marital_Status                 ob