In [55]:
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

In [131]:
# Load the campaign data; the file is semicolon-delimited, hence sep=";".
df = pd.read_csv("marketing_campaign.csv", sep=";")

In [14]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,2013-06-13,46,709,...,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,2014-06-10,56,406,...,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,...,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,2014-01-24,8,428,...,3,0,0,0,0,0,0,3,11,0


In [173]:
def prep_data(df, test_size=0.2, random_state=None):
    """Clean the campaign frame, one-hot encode it, split it and scale it.

    Steps, in order:
      1. lower-case the column names and reduce ``dt_customer`` to its year;
      2. drop ``id``, ``z_costcontact`` and ``z_revenue``;
      3. keep rows with ``year_birth > 1930``, ``income < 200000`` and a
         marital status from the accepted list;
      4. one-hot encode the object-dtype columns (first level dropped);
      5. split into train/test and standardise the numeric columns using
         statistics fitted on the training split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw campaign data. It is not modified; the function works on a copy.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` -- note the order: both
        training objects come first, then both test objects.
    """
    # Work on a copy so the caller's frame keeps its original column names and
    # raw date strings; this also makes calling the function twice safe.
    data = df.copy()
    data.columns = data.columns.str.lower()
    data["dt_customer"] = pd.to_datetime(data["dt_customer"]).dt.year

    data = data.drop(columns=["id", "z_costcontact", "z_revenue"])

    marital_ac = ["Single", "Together", "Married", "Divorced", "Widow"]
    keep_rows = (
        (data["year_birth"] > 1930)
        # Rows with a missing income fail this comparison and are dropped too.
        & (data["income"] < 200000)
        & data["marital_status"].isin(marital_ac)
    )
    data = data[keep_rows]

    y = data["response"]
    X = data.drop(columns="response")

    # Feature lists are taken before encoding, so the one-hot columns added
    # below are not part of numeric_features and are left unscaled.
    numeric_features = X.select_dtypes(include=['number']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()

    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_df = pd.DataFrame(
        encoder.fit_transform(X[categorical_features]),
        columns=encoder.get_feature_names_out(categorical_features),
        # The row filters above leave gaps in the index. Without reusing it,
        # concat(axis=1) would align on mismatched labels and create NaN rows.
        index=X.index,
    )
    X = pd.concat([X.drop(columns=categorical_features), encoded_df], axis=1)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Explicit copies so the column assignments below never write into a view.
    x_train = x_train.copy()
    x_test = x_test.copy()

    scaler = StandardScaler()
    x_train[numeric_features] = scaler.fit_transform(x_train[numeric_features])
    x_test[numeric_features] = scaler.transform(x_test[numeric_features])

    return x_train, y_train, x_test, y_test

In [170]:
# Re-read the raw semicolon-delimited CSV before running prep_data on it.
df = pd.read_csv("marketing_campaign.csv", sep=";")

In [171]:
# prep_data returns (x_train, y_train, x_test, y_test); keep the training
# pair and discard the test pair.
x,y,_,_ = prep_data(df)

In [172]:
# Show the prepared training features as this cell's output.
x

Unnamed: 0,year_birth,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,...,acceptedcmp2,complain,education_Basic,education_Graduation,education_Master,education_PhD,marital_status_Married,marital_status_Single,marital_status_Together,marital_status_Widow
1699,-1.522950,0.987871,-0.802805,2.752217,1.443392,1.278219,0.896504,1.607419,1.523575,4.086452,...,-0.117478,-0.0957,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
218,-1.267052,-0.359030,-0.802805,0.912187,-0.023294,-0.855745,-0.382358,-0.558242,-0.600688,-0.685921,...,-0.117478,-0.0957,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1282,-1.011155,-0.678638,1.071825,0.912187,-1.489980,0.004725,-0.884981,-0.558242,-0.657393,-0.685921,...,-0.117478,-0.0957,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
305,-0.669958,1.322587,-0.802805,-0.927843,-0.023294,-0.167369,0.637758,3.001408,-0.256095,3.365046,...,-0.117478,-0.0957,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
372,0.097734,-1.052161,1.071825,-0.927843,-0.023294,0.107981,-0.902825,-0.632920,-0.718460,-0.630428,...,-0.117478,-0.0957,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1933,-0.072865,-1.052392,2.946455,-0.927843,1.443392,-0.064113,-0.873084,-0.583135,-0.666117,-0.630428,...,-0.117478,-0.0957,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1975,0.012434,-2.212693,-0.802805,0.912187,-0.023294,-1.681795,-0.876059,-0.558242,-0.692289,-0.648926,...,-0.117478,-0.0957,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
310,0.609528,-1.836490,1.071825,-0.927843,1.443392,1.278219,-0.914722,-0.533350,-0.683565,-0.648926,...,-0.117478,-0.0957,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
248,1.633117,0.703328,-0.802805,-0.927843,-1.489980,1.415894,1.645977,2.677804,2.605336,2.088714,...,-0.117478,-0.0957,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## make_pipeline test

In [None]:
class ColumnRemover(BaseEstimator, TransformerMixin):
    """Transformer that drops a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns_to_remove : label or list of labels
        Handed unchanged to ``DataFrame.drop(..., axis=1)`` in ``transform``.
    """

    def __init__(self, columns_to_remove):
        self.columns_to_remove = columns_to_remove

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    @staticmethod
    def prep_data(X):
        """Unimplemented placeholder; always returns ``None``.

        Declared static because the signature takes no ``self``: without the
        decorator, ``instance.prep_data(X)`` raises ``TypeError``.
        """
        # NOTE(review): looks like a leftover stub -- confirm whether it is
        # still needed.
        pass

    def transform(self, X):
        """Return ``X`` without the columns named in ``columns_to_remove``."""
        return X.drop(self.columns_to_remove, axis=1)

In [65]:
class PrintData(BaseEstimator, TransformerMixin):
    """Pass-through transformer: ``transform`` hands its input back as-is."""

    def fit(self, X, y=None):
        """Nothing is fitted; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X`` untouched (the print of ``X`` is currently disabled)."""
        return X

In [43]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. For each categorical column the encoder uses
# the entry found in `categories`, falling back to 'auto' when there is none.
# NOTE(review): `numeric_features`, `categorical_features` and `categories`
# are not assigned at notebook scope in any visible cell, so this cell is
# expected to raise NameError on a fresh kernel -- confirm where they should
# be defined.
preprocess = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(categories=[categories.get(col, 'auto') for col in categorical_features]), categorical_features))

In [67]:
# Chain the column preprocessing with the PrintData step, which returns its
# input unchanged.
pipe = make_pipeline(
    preprocess,
    PrintData(),
    verbose=False
)

In [68]:
# Fit every pipeline step on df; no target is passed.
pipe.fit(df)

In [70]:
# Fit the pipeline on df again and keep the transformed features.
xd = pipe.fit_transform(df)

In [72]:
# Wrap the transformed output in a DataFrame for display; no column names are
# supplied, so the columns get default integer labels.
pd.DataFrame(xd)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,-0.985345,0.234063,-0.825218,-0.929894,-1.502225,0.307039,0.983781,1.551577,1.679702,2.462147,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.235733,-0.234559,1.032559,0.906934,1.420036,-0.383664,-0.870479,-0.636301,-0.713225,-0.650449,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.317643,0.769478,-0.825218,-0.929894,-0.041094,-0.798086,0.362723,0.570804,-0.177032,1.345274,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.268149,-1.017239,1.032559,-0.929894,1.420036,-0.798086,-0.870479,-0.560857,-0.651187,-0.503974,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.017761,0.240221,1.032559,-0.929894,1.420036,1.550305,-0.389085,0.419916,-0.216914,0.155164,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,-0.150717,0.356642,-0.825218,0.906934,-0.041094,-0.107383,1.203678,0.419916,0.066692,0.081926,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2236,-1.903435,0.467539,2.890335,0.906934,1.420036,0.237969,0.303291,-0.661449,-0.606873,-0.687068,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,1.017761,0.188091,-0.825218,-0.929894,1.420036,1.446700,1.795020,0.545656,0.221789,-0.101168,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2238,-1.068807,0.675388,-0.825218,0.906934,1.420036,-1.419719,0.368666,0.092992,0.208495,0.777683,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
