### Table of Contents

# 1. Import Data

## 1.1 Import the needed libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy.stats import zscore
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from kmodes.kprototypes import KPrototypes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


%matplotlib inline
pd.set_option('display.max_columns', None)

## 1.2 Import and integrate data

In [None]:
df_crm = pd.read_csv('crm.csv')
df_mkt = pd.read_csv('mkt.csv')
df_sales = pd.read_excel('sales.xlsx')

In [None]:
df = pd.merge(pd.merge(df_crm,df_sales,on='CustomerID',how="inner"),df_mkt,on="CustomerID",how="inner")

## 1.3 Set Index


In [None]:
df.set_index('CustomerID',inplace = True)

## 1.4 Check and removing duplicates

In [None]:
df[df.duplicated()] # checking duplicates

In [None]:
df = df[~df.duplicated()] # drop duplicates rows

# 2. Explore Data

## 2.0 Data profiling

Se não quiserem instalar a biblioteca não corram esta secção. Caso contrário o comando para instalar é pip install ydata-profiling. No final **apagar esta secção**

In [None]:
#from ydata_profiling import ProfileReport
#profile= ProfileReport (df, title= "DSML_Project")

In [None]:
#profile.to_file('DSML_profile.html')

## 2.1 Basic Exploration

Q: _To check the number of columns and rows_ we used `shape` _attribute_

In [None]:
df.shape

> A: _The dataset has **7000 rows** and **26 columns**_

__*Q*__: Check the name of the features of the dataset we used `columns` _attribute_

In [None]:
df.columns

> A: The dataset has the following columns/features names: <br>
        >Index. CustomerID
        >1. 'Name' <br>
        >2. 'Birthyear'<br>
        >3. 'Education'<br>
        >4. 'Marital_Status'<br>
        >5. 'Income'<br>
        >6. 'Kid_Younger6'<br>
        >7. 'Children_6to18'<br>
        >8. 'Date_Adherence'<br>
        >9. 'Recency'<br>
        >10. 'MntMeat&Fish'<br>
        >11. 'MntEntries'<br>
        >12. 'MntVegan&Vegetarian'<br>
        >13. 'MntDrinks'<br>
        >14. 'MntDesserts'<br>
        >15. 'MntAdditionalRequests'<br>
        >16. 'NumOfferPurchases'<br>
        >17. 'NumAppPurchases'<br>
        >18. 'NumTakeAwayPurchases'<br>
        >19. 'NumStorePurchases'<br>
        >20. 'NumAppVisitsMonth'<br>
        >21. 'Complain'<br>
        >22. 'Response_Cmp1'<br>
        >23. 'Response_Cmp2'<br>
        >24. 'Response_Cmp3'<br>
        >25. 'Response_Cmp4'<br>
        >26. 'Response_Cmp5'<br>

Q: First glance at the dataset using the `head` and `tail` methods to check the first and last 3 rows.

In [None]:
df.head(3)

In [None]:
df.tail(3)

Q: To check the basic information of the dataset we've used the `info` method

In [None]:
df.info()

>A: We can observe the data types in the dataset and how many features there are per data type (`dtypes: float64 - (7), int64 - (15), object - (4)`), the memory usage of `1.4+MB`, and the number of non-null values per column. <br>
> Using only `info` method we understand that `'Education', 'Recency', 'MntDrinks'` have __14, 23, 28 null values__ that require some action.

# 2.2 Statistical Exploration

## 2.2.1 Numerical Variables

In [None]:
df.describe()

> _From the `describe` output we can get a first glance and draw some conclusions:_

>__Birthyear__ - could originate an Age column for readability purposes<br>
__Income__ - Min and Max are very far from each other and far from the mean value which could indicate outliers<br>
__Recency__ - 6977 valid values, hence we should look in deep and decide on how to minimize that effect of missing values<br>
__MntMeat&Fish__ - Min and Max are distant from each other and have high standard deviation which could effect some future conclusion<br>
__MntEntries__ - Again has high standard deviation that we should analyze, Min and Max far apart, similar to MntMeat&Fish<br>
__MntVegan&Vegetarian__ - Similar to the previous two Mnt columns<br>
__MntDrinks, MntDesserts__ - Seems to be very similar between them<br>
__MntAdditionalRequests__ - The max value standard deviation seems high and also the max value very far apart from the mean<br>
__NumOfferPurchases, NumTakeAwayPurchases, NumAppVisitsMonth__  - Have a max value too distant from the mean; it could be genuine, but we need to take it into account<br>
__NumAppPurchases, NumStorePurchases__ - Do not seem to have strange summary statistics<br>
__Kid_Younger6, Children_6to18__ - 75% of clients have at least one child

**Q**: Skewness of each variable 

In [None]:
# Skewness is only defined for numeric data. The frame still holds object columns
# (Name, Education, Marital_Status, Date_Adherence) at this point, and pandas >= 2.0
# raises a TypeError instead of silently skipping them, so restrict explicitly.
df.skew(numeric_only=True)

Concerning the variables' skewness, we can conclude the following:
- `Moderate skewness (between |0.5| and |1.0|)`: Birthyear, Income, Kid_Younger6, Children_6to18, Recency, NumAppPurchases, NumStorePurchases, NumAppVisitsMonth
- `High skewness (higher than |1.0|)`: MntMeat&Fish, MntEntries, MntVegan&Vegetarian, MntDrinks, MntDesserts, MntAdditionalRequests, NumOfferPurchases, NumTakeAwayPurchases, Complain, Response_Cmp1, Response_Cmp2, Response_Cmp3, Response_Cmp4, Response_Cmp5

In [None]:
# Kurtosis of the numeric columns only; the object columns would raise in pandas >= 2.0.
df.kurt(numeric_only=True)

Features with kurtosis higher than 3 could indicate the presence of outliers, hence we should give special consideration to the following features:
>MntEntries, MntVegan&Vegetarian, MntDrinks, MntDesserts, NumOfferPurchases, NumAppVisitsMonth

Note: For the binary variables `Complain` and `Response_Cmp1`, we will not treat the high kurtosis as an indication of outliers

## 2.2.2 Categorical Values

In [None]:
df.describe(include = object)

> We can conclude that `Education` has **14 missing** values

#### Level/Possible values of Categorical Features

### `Name` prefix unique values and count

In [None]:
df['Name'].str.partition(" ")[0].value_counts()

With the prefix we can generate a `gender` feature to further explore the dataset. We will deal with that in the data transformation chapter

#### **`Gender`** feature creation

In [None]:
df["Gender"] = df['Name'].str.partition(" ")[0]
df = df.replace({"Gender":{"Mr.": 1,"Miss": 0,"Mrs.": 0}})

### `Education` unique values and count

In [None]:
df["Education"].value_counts()

We have some issues that will need transformation:<br>
- Graduation, Master, HighSchool are written in different ways<br>
- `Basic` and `HighSchool` need different levels?

#### Education standardization

In [None]:
df = df.replace({"Education":{"master":"Master", "graduation":"Graduation", "phd":"PhD","highschool":"HighSchool"}})

### `Marital_Status` unique values and count

In [None]:
df["Marital_Status"].value_counts()

Similarly to previous feature we also have some issues that need transformation:<br>
- Married, Together, Single, Divorced and Widow are written with lower and capital letters
- We could also consider that Married and Together are similar and joined them in the same level<br>

#### Marital_Status standardization

In [None]:
df = df.replace({"Marital_Status":{"married":"Married", "together":"Married", "single":"Single","widow":"Widow","divorced":"Divorced","Together":"Married"}})
df["Marital_Status"].value_counts()

`Date_Adherence` unique values and count

In [None]:
df["Date_Adherence"].value_counts()

`Date_Adherence` is a date and will need transformation to a date format for further exploration

## 2.3 Visual Exploration

### 2.3.1 Numerical Variables

## 2.4. In-Depth Exploration

# 3. Preprocess Data

### 3.1.2. Missing Values

In [None]:
# Count missing values across ALL five campaign-response columns
# (Response_Cmp5 was previously left out of the check).
response_cols = [f"Response_Cmp{i}" for i in range(1, 6)]
Response_is_null = df[response_cols].isna().sum().sum()
Response_is_null == 0

In [None]:
df.info()

In [None]:
df.isna().sum()

- **`Education`**, **`Recency`**, **`MntDrinks`** and **`MntTotal`** (due to dependancy of `MntDrinks`) have missing values

#### Filling the missing values

Fill `Education` with the mode

In [None]:
# Assign the result back instead of calling fillna(inplace=True) on a column selection:
# the in-place form operates on an intermediate object and is not guaranteed to
# update df (it warns in pandas 2.x and is a no-op under copy-on-write).
df["Education"] = df["Education"].fillna(df["Education"].mode()[0])

Fill `Recency` with the median value

In [None]:
# Fill with the median, as stated in the accompanying text (the code previously used
# the mean). Assigned back rather than using a chained inplace fillna, which is not
# guaranteed to modify df under copy-on-write.
df["Recency"] = df["Recency"].fillna(df["Recency"].median())

In [None]:
# Impute missing values in the Mnt* columns from the 3 nearest neighbours,
# measured in the space of the six spending columns.
mnt_cols = ['MntMeat&Fish', 'MntEntries', 'MntVegan&Vegetarian', 'MntDrinks',
            'MntDesserts', 'MntAdditionalRequests']
df_mnt = df[mnt_cols]

imputer = KNNImputer(n_neighbors=3)
array_impute = imputer.fit_transform(df_mnt)
# Keep the original (CustomerID) index so the imputed frame stays row-aligned with df;
# rebuilding it with a default RangeIndex would break any label-based assignment.
df_mnt = pd.DataFrame(array_impute, columns=df_mnt.columns, index=df_mnt.index)

In [None]:
df["MntDrinks"] = df_mnt["MntDrinks"].values

In [None]:
df.isna().sum()

## 3.2. Data Transformation

### 3.2.1. Create new Variables

### Utils

#### Creating Age variable from the Birthyear

In [None]:
# Age in years relative to the current calendar year (so the value depends on the run date).
df['Age'] = date.today().year - df['Birthyear']

In [None]:
df.drop('Birthyear', axis= 1, inplace = True)

#### Creating card adherence age variable from the Date adherence

In [None]:
from datetime import datetime
df = df.replace({"Date_Adherence":{"2/29/2022": datetime.strptime("2022-03-01", '%Y-%m-%d')}}) #2022 is not a leap year, therefore 29/02/2022 is not a possible day

In [None]:
# Parse to datetime first: Date_Adherence is an object column, and a plain string has
# no .date() method, so calling it per element fails unless every value is already a
# datetime. normalize() drops any time-of-day so the result is a whole-day difference
# relative to today's date, matching the previous date-minus-date computation.
# NOTE(review): assumes the string values are month-first (M/D/YYYY) — confirm against the raw file.
adherence_dates = pd.to_datetime(df['Date_Adherence']).dt.normalize()
df['daysAsCardClient'] = (pd.Timestamp(date.today()) - adherence_dates).dt.days

In [None]:
df.drop('Date_Adherence', axis= 1, inplace = True)

#### Fill Education

In [None]:
edu_encode = pd.get_dummies(df.Education, drop_first= True) #TODO ver se nao ha formas melhores para tratar desta categoria
df = pd.concat([df, edu_encode], axis = 1)

In [None]:
df.drop('Education', axis = 1, inplace = True)

#### Fill Maritial Status

In [None]:
marital_encode = pd.get_dummies(df.Marital_Status, drop_first= True)
df = pd.concat([df, marital_encode], axis = 1)

In [None]:
df.drop('Marital_Status', axis= 1, inplace = True)

#### Create MntTotal

In [None]:
# Total amount spent per customer: sum of the six Mnt* category columns.
df["MntTotal"] = df['MntMeat&Fish'] + df['MntEntries'] + df['MntVegan&Vegetarian'] + df['MntDrinks'] + df['MntDesserts'] + df['MntAdditionalRequests']
df["MntTotal"]
# NOTE(review): an earlier note here said MntAdditionalRequests was missing; it is included in the sum above.

#### Create Mnt Pday Card

In [None]:
df['Mnt_pday_card']= df.MntTotal/df.daysAsCardClient

#### Create Response Campaigns

In [None]:
df['Response_Campaigns'] = df['Response_Cmp1'] + df['Response_Cmp2'] + df['Response_Cmp3'] + df['Response_Cmp4'] + df[
       'Response_Cmp5']

#### Create Total Kids

In [None]:
df["Total_Kids"] = df["Kid_Younger6"] + df["Children_6to18"]

#### Create Has Kids

In [None]:
# Binary flag: 0 when the customer has no kids at home, 1 otherwise.
df["has_Kids"] = (df["Total_Kids"] != 0).astype("int64")
df["has_Kids"]

#### Create age_bins

In [None]:
df["age_bins"] = pd.cut(df["Age"], bins = 5)
age_bin = pd.get_dummies(df['age_bins'],prefix='age')
df = pd.concat([df,age_bin], axis=1)
df.drop(['age_bins'],axis=1, inplace=True)

In [None]:
df

## Incoherencies


Verificar que todos os clientes que têm valores gastos têm compras registadas

In [None]:
df[(df[['MntMeat&Fish', 'MntEntries',
        'MntVegan&Vegetarian', 'MntDrinks', 'MntDesserts',
        'MntAdditionalRequests']].sum(axis = 1) > 0) & (df[['NumAppPurchases', 'NumTakeAwayPurchases', 'NumStorePurchases']].sum(axis = 1) <= 0)]

Remover as linhas que têm valor gasto mas não têm compras registadas

In [None]:

df.drop(df.loc[(df[['MntMeat&Fish', 'MntEntries',
                    'MntVegan&Vegetarian', 'MntDrinks', 'MntDesserts',
                    'MntAdditionalRequests']].sum(axis = 1) > 0) & (df[['NumAppPurchases', 'NumTakeAwayPurchases', 'NumStorePurchases']].sum(axis = 1) <= 0)].index,inplace=True)

In [None]:
df[(df[['MntMeat&Fish', 'MntEntries',
        'MntVegan&Vegetarian', 'MntDrinks', 'MntDesserts',
        'MntAdditionalRequests']].sum(axis = 1) > 0) & (df[['NumAppPurchases', 'NumTakeAwayPurchases', 'NumStorePurchases']].sum(axis = 1) <= 0)] # confirmação do ajuste

Ver se não há mais compras com ofertas do que compras totais

In [None]:
df[(df['NumOfferPurchases'] > df[['NumAppPurchases','NumTakeAwayPurchases','NumStorePurchases']].sum(axis=1))]

In [None]:
# será esta a abordagem mais acertada, isto é, assumir que todas as compras deste cliente foram "OfferPurchases"?

df.loc[(df['NumOfferPurchases'] > df[['NumAppPurchases','NumTakeAwayPurchases','NumStorePurchases']].sum(axis=1)),'NumOfferPurchases'] = df['NumAppPurchases'] + df['NumTakeAwayPurchases'] + df['NumStorePurchases']

In [None]:
df[(df['NumOfferPurchases'] > df[['NumAppPurchases','NumTakeAwayPurchases','NumStorePurchases']].sum(axis=1))] # confirmação do ajuste

### 3.1.2 Outliers

In [None]:
#fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 5))
#ax1.boxplot(df['MntVegan&Vegetarian'])
#ax2.boxplot(df['Income'])

In [None]:
df.drop(df[abs(zscore(df['MntVegan&Vegetarian'])) > 3].index,
        inplace=True)
df.drop(df[abs(zscore(df['Income'])) > 3].index, inplace=True)


In [None]:
#fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 5))
#ax1.boxplot(df['MntVegan&Vegetarian'])
#ax2.boxplot(df['Income'])

# 3.1.1 Skewness Correction

In [None]:
#TODO Ver se o sklearn nao tem funçao para fazer isto. ver sklearn.preprocessing, meter depois de outliers/incoherencies

In [None]:
# Columns flagged as highly skewed during exploration; each one is replaced
# in place by log10(x + 1) (the +1 keeps zero values defined).
log_transform_cols = [
    'MntMeat&Fish', 'MntVegan&Vegetarian', 'MntEntries', 'MntDrinks',
    'MntDesserts', 'MntAdditionalRequests',
    'NumOfferPurchases', 'NumTakeAwayPurchases',
    'Complain',
    'Response_Cmp1', 'Response_Cmp2', 'Response_Cmp3', 'Response_Cmp4', 'Response_Cmp5',
]
for col in log_transform_cols:
    df[col] = np.log10(df[col] + 1)

## Data Review

Ver a dataframe no seu estado final
Drop: Id, name, birthyear, date_adherence, total_kids, mntTotal

In [None]:
df.head()

In [None]:
df.describe()

Utilizar df como base para treinos/clustering

In [None]:
df.drop(['Name'], axis = 1, inplace = True)

In [None]:
df.columns

# Feature Selection

As this is not a supervised learning problem, we need to adapt our data so it can be used with a supervised model. We will use a Random Forest Regressor as a tool for feature selection, using MntTotal as the target variable, as our goal is to devise a marketing campaign that aims to increase sales

## Redução de Dimensões e Scaling
Pipeline para passar da df para uma df que se possa utilizar nos modelos de clustering.
1: Selecionar as colunas a partir da dataframe df. Criar uma copia com as colunas desejadas utilizando a função df_select
2: Aplicar a função scaling_dfs que aplica scaling à dataframe obtida no passo anterior e retorna uma df scaled
3: Escolher o método de redução de dimensoes a aplicar: PCA ou Feature selection com RandomForest
3.1.1 PCA: Utilizar a função PCA_graph_df para ver quantos principal components serão necessários para a variancia desejada
3.1.2 PCA: Utilizar a função create_PCA_df para aplicar PCA com o numero de componentes desejado à dataframe obtida em 2, obtendo uma df com o numero de colunas igual ao numero de principal components
3.2.1 Feature Selection com RandomForest: Utilizar a função RFR_feature_select para obter os nomes das colunas consideradas importantes e o MSE (para ver se a abordagem é viável). Atenção que a df utilizada para fazer feature selection deverá ser uma com as colunas desejadas mas **SEM SCALING**, para que não haja bleeding de informação.
3.2.2 Feature Selection com RandomForest: Utilizar a função selected_features_df para obter uma df sem scaling com apenas as colunas obtidas no passo anterior mais a coluna da target variable. **Aplicar o passo 2 apenas agora**



In [None]:
def df_select(column_names: list, df: pd.DataFrame):
    """Return an independent copy of ``df`` restricted to ``column_names``.

    The columns come back in the order given; changes to the returned
    frame do not affect ``df``.
    """
    subset = df[column_names]
    return subset.copy()

In [None]:
def scaling_dfs(df):
    """Standardize every column of ``df`` with ``StandardScaler``.

    Returns a new DataFrame of the scaled values. When ``df`` is a DataFrame,
    its column names and index are carried over so the scaled rows can still be
    matched to the original customers and features (previously both were
    replaced by default integer labels). Array-like input still yields the
    default labels.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    return pd.DataFrame(
        scaled_values,
        columns=getattr(df, "columns", None),
        index=getattr(df, "index", None),
    )

In [None]:
def PCA_graph_df(df_scaled: pd.DataFrame, exp_variance: int):
    """Plot the cumulative explained variance (%) against the number of principal components.

    Parameters
    ----------
    df_scaled : scaled data that a full PCA (all components) is fitted on.
    exp_variance : target cumulative variance in percent; drawn as a red horizontal line.

    Draws on the current matplotlib figure and returns nothing.
    """
    pca = PCA()
    pca.fit(df_scaled)
    cumulative_variance = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
    # Start the x axis at 1 so a point (k, v) reads "the first k components explain v %";
    # plotting against the default 0-based index shifted every reading by one component.
    n_components = np.arange(1, len(cumulative_variance) + 1)

    plt.title("PCA Variance against num of Componmnets")
    plt.ylabel("Variance %")
    plt.xlabel("Number of componments")
    plt.axhline(exp_variance, color="red")

    plt.plot(n_components, cumulative_variance)
    plt.grid()

In [None]:
def create_PCA_df(n_components: int, df_scaled: pd.DataFrame):
    """Fit a PCA with ``n_components`` on ``df_scaled`` and return the projected data.

    The return value is whatever ``PCA.fit_transform`` yields (one column per
    retained component), not a DataFrame.
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(df_scaled)

In [None]:
def RFR_feature_select(df: pd.DataFrame, target_var: str):
    """Select features by random-forest importance and report a hold-out MSE.

    Steps:
      1. Split ``df`` 80/20 (fixed seed) into train/test.
      2. Min-max scale features and target, fitting the scalers on the training split only.
      3. Fit a ``RandomForestRegressor`` and keep the features whose importance is
         at least the median importance.
      4. Refit on the kept features and compute the MSE on the test split.

    Parameters
    ----------
    df : unscaled frame containing the candidate features and ``target_var``.
    target_var : name of the target column.

    Returns
    -------
    (list of selected column names, mse) — the MSE is measured on the
    min-max-scaled target, not in the target's original units.
    """
    x = df.drop(columns=[target_var])
    y = df[target_var]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

    # MinMaxScaler needs 2-D input, so the target is reshaped to a single column for scaling.
    y_train_col = np.asarray(y_train).reshape(-1, 1)
    y_test_col = np.asarray(y_test).reshape(-1, 1)

    scalerx = MinMaxScaler().fit(x_train)
    scalery = MinMaxScaler().fit(y_train_col)

    x_train = pd.DataFrame(scalerx.transform(x_train), columns=x.columns)
    x_test = pd.DataFrame(scalerx.transform(x_test), columns=x.columns)

    # Flatten back to 1-D: the regressor expects targets of shape (n_samples,), and
    # passing an (n_samples, 1) column triggers a DataConversionWarning on every fit.
    y_train = scalery.transform(y_train_col).ravel()
    y_test = scalery.transform(y_test_col).ravel()

    RFR = RandomForestRegressor(random_state=1)
    RFR.fit(x_train, y_train)

    importances = pd.Series(RFR.feature_importances_, index=x_train.columns)
    threshold = importances.median()
    selected_columns = list(importances.index[importances >= threshold])

    new_RFR = RandomForestRegressor(random_state=1)
    new_RFR.fit(x_train[selected_columns], y_train)

    y_pred = new_RFR.predict(x_test[selected_columns])
    mse = mean_squared_error(y_test, y_pred)

    return selected_columns, mse

In [None]:
def selected_features_df(selected_columns: list, target_variable: str, df: pd.DataFrame):
    """Return a copy of ``df`` with the selected feature columns plus the target column.

    Parameters
    ----------
    selected_columns : feature column names to keep (not modified).
    target_variable : name of the target column, appended after the features
        unless it is already among them.
    df : source frame.
    """
    # list.append() returns None and mutates the caller's list, so the previous
    # `selected_columns = selected_columns.append(...)` always indexed df with None.
    # Build a fresh list instead.
    columns = list(selected_columns)
    if target_variable not in columns:
        columns.append(target_variable)
    return df[columns].copy()

## Exemplo

Neste exemplo serão utilizadas as colunas que vimos na quarta com o método PCA e o modelo kmeans

In [None]:
kmeans_columns = ['Income', 'Recency', 'NumOfferPurchases', 'NumAppPurchases',
       'NumTakeAwayPurchases', 'NumStorePurchases', 'NumAppVisitsMonth',
       'Complain', 'Gender', 'daysAsCardClient',
       'Graduation', 'HighSchool', 'Master', 'PhD', 'Married', 'Single',
       'Widow', 'MntTotal', 'Response_Campaigns',
       'Total_Kids', 'Age']
kmeans_df = df_select(column_names= kmeans_columns, df= df) #Step 1
kmeans_df_scaled = scaling_dfs(kmeans_df) #Step 2
PCA_graph_df(df_scaled= kmeans_df_scaled, exp_variance= 80) #Step 3.1.1

In [None]:
kmeans_pca_df= create_PCA_df(n_components= 8, df_scaled= kmeans_df_scaled) #Step 3.1.2

## Treino de modelos e avaliação **Não funcional para modelos que nao o kmeans**
Pipeline para passar de uma dataframe com dimensoes reduzidas para modelos de clustering e as suas métricas **Só funciona para modelos semelhantes a kmeans** #TODO fazer para modelos com outra estrutura

1: Criação do modelo. Utilizar os modelos do sklearn.cluster para obter um objeto **model** que se possa utilizar nas funções
2: Escolha do número de clusters utilizando uma das funções ..._nclusters
2.1 Elbow Method: Queremos selecionar o número de clusters que corresponde ao ponto da curva em que a derivada começa a ter declives menores
2.2 Total Sum of Squares Method: Queremos selecionar o número de clusters que maximiza o between_ss e minimza o within_ss

In [None]:
def elbow_nclusters(model_type, dimensioned_df: pd.DataFrame, cluster_range: int):
    """Draw an elbow plot of inertia against the number of clusters.

    A fresh ``model_type(n_clusters=k)`` is fitted on ``dimensioned_df`` for every
    k in ``1 .. cluster_range - 1``; the fitted model must expose ``inertia_``.
    The plot is shown immediately and nothing is returned.
    """
    ks = range(1, cluster_range)
    inertias = [model_type(n_clusters=k).fit(dimensioned_df).inertia_ for k in ks]

    plt.plot(ks, inertias)
    plt.xlabel('number of clusters')
    plt.ylabel('inertia')
    # One tick per candidate k.
    plt.xticks(ks)
    plt.show()

In [None]:
def sumsquares_nclusters(model_type,  dimensioned_df: pd.DataFrame, cluster_range: int):
    """Bar-plot two per-k cluster measures (on a log10 scale) for k in 1 .. cluster_range - 1.

    For each k a fresh ``model_type(n_clusters=k)`` is fitted on ``dimensioned_df``
    and two values are recorded:

    * ``within_ss``  - the model's ``inertia_``.
    * ``between_ss`` - the mean Euclidean distance from each sample to its
      nearest cluster centre.

    NOTE(review): despite its name, ``between_ss`` as computed here is an average
    distance to the nearest centroid, not a between-cluster sum of squares -
    confirm this is the intended measure.

    The plot is shown immediately and nothing is returned.
    """
    n_samples = dimensioned_df.shape[0]
    records = []

    for k in range(1, cluster_range):
        model = model_type(n_clusters=k)
        model.fit(dimensioned_df)
        nearest_centre_dist = np.min(
            cdist(dimensioned_df, model.cluster_centers_, 'euclidean'), axis=1)
        records.append({
            "model": k,
            "within_ss": model.inertia_,
            "between_ss": sum(nearest_centre_dist) / n_samples,
        })

    ssc = pd.DataFrame(records, columns=["model", "within_ss", "between_ss"])
    # Long format: one row per (k, measurement) pair, as expected by seaborn's hue.
    ssc_melted = pd.melt(ssc, id_vars=["model"], var_name="measurement", value_name="value")

    plt.figure(figsize=(8, 6))
    sns.barplot(x="model", y=np.log10(ssc_melted["value"]),
                hue="measurement", data=ssc_melted)
    plt.title("Cluster Model Comparison")
    plt.xlabel("Number of Clusters")
    plt.ylabel("Log10 Total Sum of Squares")
    plt.show()

## 4. Models

## Model Train

Modelos utilizados: kmeans
Modelos propostos: DBScan, Kmodes

In [None]:
kmeans= KMeans(n_clusters = 4, n_init= 100, max_iter =10000, random_state= 1)
kmeans.fit(kmeans_pca_df)
pca_train_label = kmeans.labels_
pca_train_label = pd.DataFrame(pca_train_label)

In [None]:
df['cluster'] = kmeans.predict(kmeans_pca_df)

# variavel modelo visao monetary:
 Mnt_pday_card, has_Kids, Income, age_bins, 'Graduation', 'HighSchool', 'Master', 'PhD', 'Gender'

### Measuring distances between clusters

#TODO Experimentar com 3 clusters para ver se as distancias aumentam/diminuem para fundamentar a escolha do numero de clusters

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

dists = euclidean_distances(kmeans.cluster_centers_)
dists

### Visualization

In [None]:
cluster0 = df[df.cluster == 0]
cluster1 = df[df.cluster == 1]
cluster2 = df[df.cluster == 2]
cluster3 = df[df.cluster == 3]
# cluster4 = df[df.cluster == 4]

In [None]:
# Overlay the Income distribution of every cluster found by the model. Looping over
# the labels actually present keeps the plot in sync with n_clusters (cluster 3 was
# previously left out even though KMeans is fitted with 4 clusters).
cluster_palette = ['red', 'yellow', 'green', 'blue', 'pink']
for cluster_label, cluster_color in zip(sorted(df['cluster'].unique()), cluster_palette):
    sns.histplot(df.loc[df['cluster'] == cluster_label, 'Income'],
                 color=cluster_color, label=f'Cluster {cluster_label}', bins=20)
plt.legend()

# variavel modelo visao customer behaviour: NumOfferPurchases', 'NumAppPurchases',
       'NumTakeAwayPurchases', 'NumStorePurchases', 'NumAppVisitsMonth',
       'Complain', 'Gender', 'Income', 'Age', 'Graduation', 'HighSchool', 'Master', 'PhD', 'Married', 'Single',
       'Widow'

## Murilo - Análise

In [None]:
#num_cols = ['Income', 'MntMeat&Fish', 'MntEntries', 'MntVegan&Vegetarian', 'MntDrinks', 'MntDesserts', 'MntAdditionalRequests']
#cat_cols = ['age_(17.943, 29.4]', 'age_(29.4, 40.8]', 'age_(40.8, 52.2]', 'age_(52.2, 63.6]', 'age_(63.6, 75.0]', 'Graduation', 'HighSchool', 'Master', 'PhD', 'Married', 'Single', 'Widow']

In [None]:
num_cols = ['Income', 'MntMeat&Fish', 'MntEntries', 'MntVegan&Vegetarian', 'MntDrinks', 'MntDesserts', 'MntAdditionalRequests', 'NumOfferPurchases', 'NumAppPurchases', 'NumTakeAwayPurchases', 'NumStorePurchases', 'NumAppVisitsMonth', 'daysAsCardClient', 'Response_Campaigns', 'Total_Kids']
# The age-bin dummy columns are named after the pd.cut intervals, whose edges depend
# on the Age range - and Age is computed from today's date. Hard-coded labels such as
# 'age_(17.943, 29.4]' therefore stop matching on a later run, so look them up instead.
age_bin_cols = [col for col in df.columns if str(col).startswith('age_')]
cat_cols = age_bin_cols + ['Graduation', 'HighSchool', 'Master', 'PhD', 'Married', 'Single', 'Widow']

In [None]:
kprototype_columns = []
kprototype_columns.extend(num_cols)
kprototype_columns.extend(cat_cols)

kprototype_df = df_select(column_names= kprototype_columns, df= df)

df_num = kprototype_df[num_cols]
df_cat = kprototype_df[cat_cols]

df_num = df_num.reset_index(drop=True)
df_cat = df_cat.reset_index(drop=True)

scaler = StandardScaler()
df_num_scaled = scaler.fit_transform(df_num)

In [None]:
PCA_graph_df(df_scaled= df_num_scaled, exp_variance= 80) #Step 3.1.1

In [None]:
# Aplicando o PCA às colunas numéricas
pca = PCA(n_components=4)
df_num_pca = pca.fit_transform(df_num_scaled)

In [None]:
# Sem o PCA às colunas numéricas
#df_num_pca = df_num_scaled

In [None]:
# Concatenando as colunas numéricas do PCA com as colunas categóricas
df_combined = pd.concat([pd.DataFrame(df_num_pca), df_cat], axis=1)

In [None]:
def plot_elbow(X, k_range, categorical=None):
    """Plot the K-Prototypes cost for each candidate number of clusters.

    Parameters
    ----------
    X : DataFrame with the numeric columns first, followed by the categorical ones.
    k_range : iterable of cluster counts to try.
    categorical : optional list of positional indices of the categorical columns.
        When omitted, every column whose dtype is not floating point is treated
        as categorical.

    The plot is shown immediately and nothing is returned.
    """
    if categorical is None:
        # The start index used to be hard-coded to 3, which silently treats the 4th
        # numeric column as categorical when 4 PCA components are kept. Inferring
        # from dtypes follows the actual number of numeric columns.
        # NOTE(review): assumes numeric columns are float and dummy columns are
        # bool/int - confirm for other inputs.
        categorical = [
            position for position, dtype in enumerate(X.dtypes)
            if not pd.api.types.is_float_dtype(dtype)
        ]

    costs = []
    for k in k_range:
        kp = KPrototypes(n_clusters=k, init='Huang', n_init=5, verbose=0)
        kp.fit(X, categorical=categorical)
        costs.append(kp.cost_)
    plt.plot(k_range, costs, marker='o')
    plt.xticks(k_range)
    plt.xlabel('Número de clusters (k)')
    plt.ylabel('Cost')
    plt.show()

In [None]:
#plot_elbow(df_combined, range(3, 7))

In [None]:
df_combined.head()

In [None]:
# Fit K-Prototypes.
# The categorical columns start right after the numeric block of df_combined. Derive
# that offset from df_num_pca rather than hard-coding 3: with 4 PCA components the
# old range(3, ...) marked the 4th numeric component as categorical.
n_numeric_cols = df_num_pca.shape[1]
categorical_idx = list(range(n_numeric_cols, df_combined.shape[1]))

kp = KPrototypes(n_clusters=3, init='Huang', n_init=20, verbose=0, n_jobs=4, random_state=1)
clusters_kp = kp.fit_predict(df_combined, categorical=categorical_idx)

# Attach the cluster labels to the (unscaled) selection frame.
kprototype_df['Cluster_kp'] = clusters_kp

In [None]:
kprototype_df.head()

In [None]:
cluster0 = kprototype_df[kprototype_df.Cluster_kp == 0]
cluster1 = kprototype_df[kprototype_df.Cluster_kp == 1]
cluster2 = kprototype_df[kprototype_df.Cluster_kp == 2]


sns.histplot(cluster0['Income'], color='red', label='Cluster 0', bins=20)
sns.histplot(cluster1['Income'], color='yellow', label='Cluster 1', bins=20)
sns.histplot(cluster2['Income'], color='green', label='Cluster 2', bins=20)

plt.legend()

In [None]:
dists = euclidean_distances(kp.cluster_centroids_)
dists

In [None]:
kprototype_df.groupby(['Cluster_kp']).describe().transpose().to_csv('output2.csv')

In [None]:
kprototype_df.head()