In [1]:
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    make_scorer,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor

In [2]:
# Read the campaign dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='marketing_campaign_final.csv')

In [3]:
def category_age(age):
    """Return the age-bracket label for ``age``.

    Brackets are closed on their upper bound: values up to and including 18
    map to '0-18', up to 30 to '19-30', up to 50 to '31-50' and up to 70 to
    '51-70'. Any value that satisfies none of those comparisons falls
    through to '71+'.
    """
    brackets = ((18, '0-18'), (30, '19-30'), (50, '31-50'), (70, '51-70'))
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return '71+'

In [4]:
# Clean the raw frame and derive the features used by every model below.

# NOTE(review): Age is measured against the year in which this cell is run, so
# re-running in a later year shifts every age -- confirm whether a fixed
# reference year is wanted for reproducibility.
df["Age"] = datetime.now().year - df["Year_Birth"]

# Drop incomplete rows and the identifier column.
df = df.dropna().drop(columns="ID")

# Remove the two marital-status labels excluded from the analysis. `.copy()`
# makes the filtered frame independent, so the column assignments below always
# write to this frame rather than to a slice of the previous one.
df = df[~df["Marital_Status"].isin(["YOLO", "Absurd"])].copy()

# "Alone" is folded into "Single".
df["Marital_Status"] = df["Marital_Status"].replace({"Alone": "Single"})

# Third "-"-separated field of Dt_Customer, kept as a string
# (the sample output shows dates such as 04-09-2012 -> "2012").
df["Year_Customer_Entered"] = df["Dt_Customer"].apply(lambda x: str(x).split("-")[2])

df["Age_Group"] = df["Age"].apply(category_age)
df = df.reset_index(drop=True)
print(df.shape)
df.head()

(2212, 30)


Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age,Year_Customer_Entered,Age_Group
0,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,138.35,27.049674,...,0,0,0,0,3,11,1,67,2012,51-70
1,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,147.07,42.396587,...,0,0,0,0,3,11,0,70,2014,51-70
2,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,132.56,43.350942,...,0,0,0,0,3,11,0,59,2013,51-70
3,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,93.63,44.755249,...,0,0,0,0,3,11,0,40,2014,31-50
4,1981,PhD,Married,58293.0,1,0,19-01-2014,94,141.37,48.024349,...,0,0,0,0,3,11,0,43,2014,31-50


In [5]:
# Frequency of each distinct value in the Complain column.
df.loc[:, "Complain"].value_counts()

Complain
0    2191
1      21
Name: count, dtype: int64

**MODELO PARA VINOS**

In [36]:
# Target: amount spent on wines. Predictors: the demographic / history columns below.
y = df.loc[:, "MntWines"]
x_columns = [
    "Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome",
    "Year_Customer_Entered", "Recency", "Complain",
]
X = df.loc[:, x_columns]
X

Unnamed: 0,Age,Education,Marital_Status,Income,Kidhome,Teenhome,Year_Customer_Entered,Recency,Complain
0,67,Graduation,Single,58138.0,0,0,2012,58,0
1,70,Graduation,Single,46344.0,1,1,2014,38,0
2,59,Graduation,Together,71613.0,0,0,2013,26,0
3,40,Graduation,Together,26646.0,1,0,2014,26,0
4,43,PhD,Married,58293.0,1,0,2014,94,0
...,...,...,...,...,...,...,...,...,...
2207,57,Graduation,Married,61223.0,0,1,2013,46,0
2208,78,PhD,Together,64014.0,2,1,2014,56,0
2209,43,Graduation,Divorced,56981.0,0,0,2014,91,0
2210,68,Master,Together,69245.0,0,1,2014,8,0


**MODELO FINAL PARA VINOS**

In [37]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_absolute_percentage_error, r2_score

# Tune and evaluate the XGBoost regressor for the wine-spend target
# (X and y come from the previous cell).

# One-hot encode the non-numeric predictors; numeric columns pass through unchanged.
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor()

# MAE and MAPE are errors (lower is better). greater_is_better=False makes each
# scorer return the negated error, so GridSearchCV's "higher is better" ranking
# (rank_test_mae / rank_test_mape in cv_results_) points at the best candidate.
# Model selection itself is driven by R2 via refit='r2'.
scoring = {
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

# 3^5 = 243 candidate configurations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=scoring, refit='r2', verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# refit='r2' has already retrained the winning configuration on all of X_train,
# so best_estimator_ is ready to use and does not need another fit call.
best_model = grid_search.best_estimator_

# Hold-out evaluation on the 20% test split.
y_pred = best_model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"Test MAE: {mae_test}")
print(f"Test MAPE: {mape_test}")
print(f"Test R2: {r2_test}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.7}
Test MAE: 7.157142136172472
Test MAPE: 0.061391365247864337
Test R2: 0.8670067170765869


**GUARDAMOS EL MODELO ENTRENADO CON TODA LA MUESTRA**

In [8]:
# Retrain the tuned estimator on every row (train + test) before persisting it.
best_model.fit(X=X, y=y)

# joblib.dump returns the list of files written, which becomes the cell output.
joblib.dump(value=best_model, filename='best_model_wines.sav')

['best_model_wines.sav']

**MODELO FINAL PARA FRUITS**

In [9]:
# Target: amount spent on fruits. Predictors: the demographic / history columns below.
y = df.loc[:, "MntFruits"]
x_columns = [
    "Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome",
    "Year_Customer_Entered", "Recency", "Complain",
]
X = df.loc[:, x_columns]

In [10]:
# Tune and evaluate the XGBoost regressor for the fruit-spend target
# (X and y come from the previous cell).

# One-hot encode the non-numeric predictors; numeric columns pass through unchanged.
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor()

# MAE and MAPE are errors (lower is better). greater_is_better=False makes each
# scorer return the negated error, so GridSearchCV's "higher is better" ranking
# (rank_test_mae / rank_test_mape in cv_results_) points at the best candidate.
# Model selection itself is driven by R2 via refit='r2'.
scoring = {
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

# 3^5 = 243 candidate configurations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=scoring, refit='r2', verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# refit='r2' has already retrained the winning configuration on all of X_train,
# so best_estimator_ is ready to use and does not need another fit call.
best_model = grid_search.best_estimator_

# Hold-out evaluation on the 20% test split.
y_pred = best_model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"Test MAE: {mae_test}")
print(f"Test MAPE: {mape_test}")
print(f"Test R2: {r2_test}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Test MAE: 4.178127382373141
Test MAPE: 0.11806966899770022
Test R2: 0.7450056309296911


**GUARDAMOS EL MODELO ENTRENADO CON TODA LA MUESTRA**

In [11]:
# Retrain the tuned estimator on every row (train + test) before persisting it.
best_model.fit(X=X, y=y)

# joblib.dump returns the list of files written, which becomes the cell output.
joblib.dump(value=best_model, filename='best_model_fruits.sav')

['best_model_fruits.sav']

**MODELO FINAL PARA MEAT**

In [12]:
# Target: amount spent on meat products. Predictors: the demographic / history columns below.
y = df.loc[:, "MntMeatProducts"]
x_columns = [
    "Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome",
    "Year_Customer_Entered", "Recency", "Complain",
]
X = df.loc[:, x_columns]

In [13]:
# Tune and evaluate the XGBoost regressor for the meat-spend target
# (X and y come from the previous cell).

# One-hot encode the non-numeric predictors; numeric columns pass through unchanged.
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor()

# MAE and MAPE are errors (lower is better). greater_is_better=False makes each
# scorer return the negated error, so GridSearchCV's "higher is better" ranking
# (rank_test_mae / rank_test_mape in cv_results_) points at the best candidate.
# Model selection itself is driven by R2 via refit='r2'.
scoring = {
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

# 3^5 = 243 candidate configurations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=scoring, refit='r2', verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# refit='r2' has already retrained the winning configuration on all of X_train,
# so best_estimator_ is ready to use and does not need another fit call.
best_model = grid_search.best_estimator_

# Hold-out evaluation on the 20% test split.
y_pred = best_model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"Test MAE: {mae_test}")
print(f"Test MAPE: {mape_test}")
print(f"Test R2: {r2_test}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.7}
Test MAE: 5.954523075353481
Test MAPE: 0.06228032379063866
Test R2: 0.8709383523908891


**GUARDAMOS EL MODELO ENTRENADO CON TODA LA MUESTRA**

In [14]:
# Retrain the tuned estimator on every row (train + test) before persisting it.
best_model.fit(X=X, y=y)

# joblib.dump returns the list of files written, which becomes the cell output.
joblib.dump(value=best_model, filename='best_model_meat.sav')

['best_model_meat.sav']

**MODELO FINAL PARA FISH**

In [15]:
# Target: amount spent on fish products. Predictors: the demographic / history columns below.
y = df.loc[:, "MntFishProducts"]
x_columns = [
    "Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome",
    "Year_Customer_Entered", "Recency", "Complain",
]
X = df.loc[:, x_columns]

In [16]:
# Tune and evaluate the XGBoost regressor for the fish-spend target
# (X and y come from the previous cell).

# One-hot encode the non-numeric predictors; numeric columns pass through unchanged.
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor()

# MAE and MAPE are errors (lower is better). greater_is_better=False makes each
# scorer return the negated error, so GridSearchCV's "higher is better" ranking
# (rank_test_mae / rank_test_mape in cv_results_) points at the best candidate.
# Model selection itself is driven by R2 via refit='r2'.
scoring = {
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

# 3^5 = 243 candidate configurations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=scoring, refit='r2', verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# refit='r2' has already retrained the winning configuration on all of X_train,
# so best_estimator_ is ready to use and does not need another fit call.
best_model = grid_search.best_estimator_

# Hold-out evaluation on the 20% test split.
y_pred = best_model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"Test MAE: {mae_test}")
print(f"Test MAPE: {mape_test}")
print(f"Test R2: {r2_test}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Test MAE: 6.000269154361355
Test MAPE: 0.08622476560399847
Test R2: 0.7778559735847101


**GUARDAMOS EL MODELO ENTRENADO CON TODA LA MUESTRA**

In [17]:
# Retrain the tuned estimator on every row (train + test) before persisting it.
best_model.fit(X=X, y=y)

# joblib.dump returns the list of files written, which becomes the cell output.
joblib.dump(value=best_model, filename='best_model_fish.sav')

['best_model_fish.sav']

**MODELO FINAL PARA SWEET**

In [18]:
# Target: amount spent on sweet products. Predictors: the demographic / history columns below.
y = df.loc[:, "MntSweetProducts"]
x_columns = [
    "Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome",
    "Year_Customer_Entered", "Recency", "Complain",
]
X = df.loc[:, x_columns]

In [19]:
# Tune and evaluate the XGBoost regressor for the sweets-spend target
# (X and y come from the previous cell).

# One-hot encode the non-numeric predictors; numeric columns pass through unchanged.
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor()

# MAE and MAPE are errors (lower is better). greater_is_better=False makes each
# scorer return the negated error, so GridSearchCV's "higher is better" ranking
# (rank_test_mae / rank_test_mape in cv_results_) points at the best candidate.
# Model selection itself is driven by R2 via refit='r2'.
scoring = {
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

# 3^5 = 243 candidate configurations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=scoring, refit='r2', verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# refit='r2' has already retrained the winning configuration on all of X_train,
# so best_estimator_ is ready to use and does not need another fit call.
best_model = grid_search.best_estimator_

# Hold-out evaluation on the 20% test split.
y_pred = best_model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"Test MAE: {mae_test}")
print(f"Test MAPE: {mape_test}")
print(f"Test R2: {r2_test}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Test MAE: 4.340626060791894
Test MAPE: 0.09869256111335074
Test R2: 0.8059551297887646


**GUARDAMOS EL MODELO ENTRENADO CON TODA LA MUESTRA**

In [20]:
# Retrain the tuned estimator on every row (train + test) before persisting it.
best_model.fit(X=X, y=y)

# joblib.dump returns the list of files written, which becomes the cell output.
joblib.dump(value=best_model, filename='best_model_sweet.sav')

['best_model_sweet.sav']

**PROBLEMA DE CLASIFICACIÓN**

**CREAMOS LA VARIABLE "PRODUCTO_MAS_COMPRADO"**

In [21]:
# Spend columns that compete for the "most purchased product" label.
productos = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
]

# For each customer take the column with the largest spend and strip the
# 'Mnt' / 'Products' affixes, e.g. 'MntMeatProducts' -> 'Meat'.
df['producto_mas_comprado'] = (
    df[productos]
    .idxmax(axis="columns")
    .map(lambda column: column.replace('Mnt', '').replace('Products', ''))
)

# Customers whose top category is 'Fish' are excluded from the classification
# data (the notebook does not state the reason).
df = df.loc[df['producto_mas_comprado'].ne('Fish')]

In [23]:
# Class balance of the classification target.
df.loc[:, 'producto_mas_comprado'].value_counts()

producto_mas_comprado
Wines    1914
Meat      295
Name: count, dtype: int64

**RANDOM FOREST CLASSIFIER**

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Random-forest baseline for predicting each customer's top product category.
X_columns = [
    "Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome",
    "Year_Customer_Entered", "Recency", "Complain",
]
y = df["producto_mas_comprado"]

# One-hot encode the non-numeric predictors.
X = pd.get_dummies(df[X_columns])

# Stratified split keeps the class proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.8891402714932126
Classification Report:
              precision    recall  f1-score   support

        Meat       0.61      0.46      0.52        59
       Wines       0.92      0.96      0.94       383

    accuracy                           0.89       442
   macro avg       0.77      0.71      0.73       442
weighted avg       0.88      0.89      0.88       442



**XGBOOST CLASSIFIER**

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# XGBoost classifier for predicting each customer's top product category.
X_columns = ["Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome",
             "Year_Customer_Entered", "Recency", "Complain"]
X = df[X_columns]
y = df["producto_mas_comprado"]

# One-hot encode the non-numeric predictors.
X = pd.get_dummies(X)

# The string labels are mapped to consecutive integers before being passed to XGBClassifier.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratified split keeps the class proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# Report rows are labelled with the original class names instead of the
# encoded integers. `labels` pins the row order to the encoder's class order
# so names and codes cannot get out of step.
report = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.8665158371040724
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.53      0.51        59
           1       0.93      0.92      0.92       383

    accuracy                           0.87       442
   macro avg       0.71      0.72      0.72       442
weighted avg       0.87      0.87      0.87       442



In [29]:
# Show which integer code the fitted encoder assigned to each class name.
classes = label_encoder.classes_
numeric_values = label_encoder.transform(classes)

for position in range(len(classes)):
    print(f"Clase: {classes[position]}, Valor Numérico: {numeric_values[position]}")

Clase: Meat, Valor Numérico: 0
Clase: Wines, Valor Numérico: 1


**GUARDAMOS EL MODELO ENTRENADO CON TODOS LOS DATOS**

In [33]:
# Retrain the classifier on every row (train + test) before persisting it.
model.fit(X=X, y=y_encoded)

# joblib.dump returns the list of files written, which becomes the cell output.
joblib.dump(value=model, filename='modelo_final_clasificación_alimentos.sav')

['modelo_final_clasificación_alimentos.sav']