# Epic 4: Data analysis

`Als een key user kan ik voor een contact met weinig transacties een lookalike met veel transacties identificeren. Ik kan ook een clustering maken van contactpersonen die qua functie, bedrijfseigenschappen, gedrag en (verwacht) gedrag op elkaar lijken.`

- Clustering (classification)
- Bij het geven van een contact, enkele lookalikes teruggeven gebaseerd op deze clustering

IN:
- Target contact
- Alle andere contacten

OUT: 
- X aantal dichtstbijzijnde contacten uit de lijst van alle andere contacten

In [19]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import dotenv
from sqlalchemy import create_engine, text
import pyodbc

dotenv.load_dotenv()

True

In [20]:
# Start db
def create_conn():
    """Build a SQLAlchemy engine from the database settings in the environment.

    Reads ``DB_DRIVER``, ``DB_SERVER``, ``DB_NAME`` and ``DB_TRUSTED_CONNECTION``
    with ``os.getenv`` and interpolates them into an ``mssql+pyodbc`` URL.

    Returns:
        Whatever ``create_engine`` returns for that URL.

    Raises:
        RuntimeError: If one or more of the four variables is not set. Without
            this check the literal text "None" would be interpolated into the
            URL and the mistake would only show up later, when connecting.
    """
    required = ("DB_DRIVER", "DB_SERVER", "DB_NAME", "DB_TRUSTED_CONNECTION")
    settings = {name: os.getenv(name) for name in required}

    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            "Missing database environment variable(s): " + ", ".join(missing)
        )

    driver = settings["DB_DRIVER"]
    server = settings["DB_SERVER"]
    database = settings["DB_NAME"]
    trusted_connection = settings["DB_TRUSTED_CONNECTION"]

    # The values are inserted verbatim (no URL-escaping), exactly as before, so
    # existing .env files keep producing the same connection URL.
    return create_engine(
        f"mssql+pyodbc://{server}/{database}?trusted_connection={trusted_connection}&driver={driver}"
    )


# Module-level engine; the data-loading cell below passes it to pd.read_sql.
engine = create_conn()
# Optional manual connectivity check, left disabled. Uncomment to run it:
# it opens a connection, fetches the server version string and closes again.
#connection = engine.connect()
#res = connection.execute(text("SELECT @@version;")).fetchone()
#connection.close()
#res[0]

In [21]:
# Path of the SQL file holding the query that selects the starting data set.
SQL_LOCATION = "data/start_data.sql"
# Directory for artefacts written by this notebook; None when
# EPIC_4_SAVE_LOCATION is not set in the environment.
SAVE_LOCATION = os.getenv("EPIC_4_SAVE_LOCATION")

# Read the query inside a context manager so the file handle is closed
# immediately instead of lingering until garbage collection.
with open(SQL_LOCATION, "r") as sql_file:
    query = sql_file.read()

# Run the query against the database and summarise the resulting frame.
df = pd.read_sql(query, engine)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80270 entries, 0 to 80269
Data columns (total 30 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   crm_contact_functietitel                                   77794 non-null  object 
 1   crm_contact_voka_medewerker                                80270 non-null  object 
 2   crm_contact_id                                             80270 non-null  object 
 3   crm_functie_naam                                           80270 non-null  object 
 4   crm_persoon_mail_thema_duurzaamheid                        80270 non-null  object 
 5   crm_persoon_mail_thema_financieel_fiscaal                  80270 non-null  object 
 6   crm_persoon_mail_thema_innovatie                           80270 non-null  object 
 7   crm_persoon_mail_thema_internationaal_ondernemen           80270 non-null  object 
 8   crm_pe

# 2. Data cleaning

In [22]:
# Print, for every column, its name and how many distinct values it holds.
# dropna=False keeps a missing value in the tally, as len(unique()) would.
for column_name in df.columns:
    distinct_count = int(df[column_name].nunique(dropna=False))
    print(column_name, distinct_count)

crm_contact_functietitel 15409
crm_contact_voka_medewerker 2
crm_contact_id 55483
crm_functie_naam 67
crm_persoon_mail_thema_duurzaamheid 2
crm_persoon_mail_thema_financieel_fiscaal 2
crm_persoon_mail_thema_innovatie 2
crm_persoon_mail_thema_internationaal_ondernemen 2
crm_persoon_mail_thema_mobiliteit 2
crm_persoon_mail_thema_omgeving 2
crm_persoon_mail_thema_sales_marketing_communicatie 2
crm_persoon_mail_thema_strategie_en_algemeen_management 2
crm_Persoon_Mail_thema_talent 2
crm_persoon_mail_thema_welzijn 2
crm_persoon_mail_type_bevraging 2
crm_persoon_mail_type_communities_en_projecten 2
crm_persoon_mail_type_netwerkevenementen 2
crm_persoon_mail_type_nieuwsbrieven 2
crm_persoon_mail_type_opleidingen 2
crm_persoon_mail_type_persberichten_belangrijke_meldingen 2
crm_persoon_marketingcommunicatie 4
crm_account_id 8723
crm_account_is_voka_entiteit 2
crm_account_ondernemingsaard 4
crm_account_ondernemingstype 13
crm_account_primaire_activiteit 37
crm_activiteitscode_naam 37
crm_financ

## Columns based on other columns, and edits

In [23]:
# Adjust the financial data.
# Rescale the added value to an annual (12-month) figure using the number of
# months the figure covers.
# NOTE: this cell is not re-runnable on its own; reload df first (the rescaling
# would be applied twice and the source columns are dropped further down).
# A period of 0 months is treated as missing so the division yields NaN rather
# than +/-inf.
months = df["crm_financieledata_aantal_maanden"].replace(0, np.nan)
df["crm_financieledata_toegevoegde_waarde"] = (
    df["crm_financieledata_toegevoegde_waarde"] / months * 12
)


def _value_count_columns(frame, key, value):
    """Spread the per-``key`` occurrence counts of ``value`` over columns.

    Args:
        frame: DataFrame containing both the ``key`` and the ``value`` column.
        key: Name of the column to group by.
        value: Name of the column whose distinct values become new columns.

    Returns:
        DataFrame indexed by ``key`` with one column per distinct value, named
        ``"<value>_<distinct value>"``; combinations that never occur are 0.
    """
    counts = frame.groupby(key)[value].value_counts().unstack().fillna(0)
    counts.columns = [value + "_" + str(col) for col in counts.columns]
    return counts


# Edit crm_functie_naam and crm_activiteitscode_naam: turn their values into
# count columns.
# NOTE(review): each count is the number of rows in df for that combination, so
# it presumably scales with how many rows the SQL query returns per contact /
# account. Confirm whether plain 0/1 indicators were intended.

# crm_functie_naam: one count column per function name, per contact.
df_functie = _value_count_columns(df, "crm_contact_id", "crm_functie_naam")
df = df.join(df_functie, on="crm_contact_id")

# crm_activiteitscode_naam: one count column per activity code, per account.
df_activiteit = _value_count_columns(df, "crm_account_id", "crm_activiteitscode_naam")
df = df.join(df_activiteit, on="crm_account_id")

# Rows whose key has no entry in the count tables (e.g. a missing account id)
# come out of the left join as NaN; make them explicit zeros as well.
count_columns = [*df_functie.columns, *df_activiteit.columns]
df[count_columns] = df[count_columns].fillna(0)

# Drop the original columns, then the rows that became identical as a result.
df = df.drop(columns=["crm_functie_naam", "crm_activiteitscode_naam"]).drop_duplicates()

df

Unnamed: 0,crm_contact_functietitel,crm_contact_voka_medewerker,crm_contact_id,crm_persoon_mail_thema_duurzaamheid,crm_persoon_mail_thema_financieel_fiscaal,crm_persoon_mail_thema_innovatie,crm_persoon_mail_thema_internationaal_ondernemen,crm_persoon_mail_thema_mobiliteit,crm_persoon_mail_thema_omgeving,crm_persoon_mail_thema_sales_marketing_communicatie,...,crm_activiteitscode_naam_Papier & karton,crm_activiteitscode_naam_Technologische industrie & diensten,crm_activiteitscode_naam_Telecom & IT,"crm_activiteitscode_naam_Textiel, kleding en confectie",crm_activiteitscode_naam_Vastgoed,crm_activiteitscode_naam_Verenigingen en maatschappelijke organisaties,crm_activiteitscode_naam_Verzekering,crm_activiteitscode_naam_Voeding,crm_activiteitscode_naam_Vrije beroepen,crm_activiteitscode_naam_Zorg
0,Zaakvoerder,0,B05C5B73-4F93-E511-B7D1-005056B06EC4,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Business manager,0,CB54A5ED-0D3F-ED11-9DB0-6045BD895BFB,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Zaakvoerder,0,4B95BAFD-ABFA-E611-80E4-001DD8B72B62,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Co-owner,0,80E226C4-AFC5-E911-8104-001DD8B72B61,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zaakvoerder,0,9DC0BE41-64B2-E711-80EC-001DD8B72B62,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80263,Connect & Develop P&G,0,DD2FAD8A-8CEF-E611-80E4-001DD8B72B62,Ja,Nee,Ja,Nee,Nee,Ja,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80264,"HR Manager, BENELUX",0,8D1F28C1-0673-E111-B43A-00505680000A,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80266,Shopmanager,0,F3025D6B-C95E-EA11-810F-001DD8B72B62,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80267,Algemeen Directeur,0,CE6EF7AA-AD72-E111-B43A-00505680000A,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
"""
crm_contact_functietitel 15409
crm_contact_voka_medewerker 2
crm_contact_id 55483
crm_functie_naam 67
crm_persoon_mail_thema_duurzaamheid 2
crm_persoon_mail_thema_financieel_fiscaal 2
crm_persoon_mail_thema_innovatie 2
crm_persoon_mail_thema_internationaal_ondernemen 2
crm_persoon_mail_thema_mobiliteit 2
crm_persoon_mail_thema_omgeving 2
crm_persoon_mail_thema_sales_marketing_communicatie 2
crm_persoon_mail_thema_strategie_en_algemeen_management 2
crm_Persoon_Mail_thema_talent 2
crm_persoon_mail_thema_welzijn 2
crm_persoon_mail_type_bevraging 2
crm_persoon_mail_type_communities_en_projecten 2
crm_persoon_mail_type_netwerkevenementen 2
crm_persoon_mail_type_nieuwsbrieven 2
crm_persoon_mail_type_opleidingen 2
crm_persoon_mail_type_persberichten_belangrijke_meldingen 2
crm_persoon_marketingcommunicatie 4
crm_account_is_voka_entiteit 2
crm_account_ondernemingsaard 4
crm_account_ondernemingstype 13
crm_account_primaire_activiteit 37
crm_activiteitscode_naam 37
crm_financieledata_toegevoegde_waarde 8462
crm_financieledata_fte 535
crm_financieledata_aantal_maanden 25
"""

# Column names bucketed by priority. Only the "Least important" bucket is
# acted on in this cell (its columns are removed); the other two buckets are
# not read here.
most_important = [
    "crm_functie_naam",
    "crm_contact_id",
    "crm_persoon_mail_thema_duurzaamheid",
    "crm_persoon_mail_thema_financieel_fiscaal",
    "crm_persoon_mail_thema_innovatie",
    "crm_persoon_mail_thema_internationaal_ondernemen",
    "crm_persoon_mail_thema_mobiliteit",
    "crm_persoon_mail_thema_omgeving",
    "crm_persoon_mail_thema_sales_marketing_communicatie",
    "crm_persoon_mail_thema_strategie_en_algemeen_management",
    "crm_Persoon_Mail_thema_talent",
    "crm_persoon_mail_thema_welzijn",
    "crm_persoon_mail_type_bevraging",
    "crm_persoon_mail_type_communities_en_projecten",
    "crm_persoon_mail_type_netwerkevenementen",
    "crm_persoon_mail_type_nieuwsbrieven",
    "crm_persoon_mail_type_opleidingen",
    "crm_persoon_mail_type_persberichten_belangrijke_meldingen",
    "crm_persoon_marketingcommunicatie",
    "crm_account_ondernemingsaard",
    "crm_account_ondernemingstype",
    "crm_account_primaire_activiteit",
    "crm_activiteitscode_naam",
    "crm_financieledata_toegevoegde_waarde",
    "crm_financieledata_fte",
]
medium_importance = [
    "crm_contact_functietitel",
]
least_important = [
    "crm_contact_voka_medewerker",
    # "crm_contact_id",
    "crm_account_is_voka_entiteit",
    "crm_account_id",
]

columns_priorities = {
    "Most important": most_important,
    "Medium importance": medium_importance,
    "Least important": least_important,
}

# Remove every least-important column and renumber the rows from 0.
df = df.drop(columns=columns_priorities["Least important"]).reset_index(drop=True)

df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55486 entries, 0 to 55485
Columns: 129 entries, crm_contact_functietitel to crm_activiteitscode_naam_Zorg
dtypes: float64(107), object(22)
memory usage: 54.6+ MB


Unnamed: 0,crm_contact_functietitel,crm_contact_id,crm_persoon_mail_thema_duurzaamheid,crm_persoon_mail_thema_financieel_fiscaal,crm_persoon_mail_thema_innovatie,crm_persoon_mail_thema_internationaal_ondernemen,crm_persoon_mail_thema_mobiliteit,crm_persoon_mail_thema_omgeving,crm_persoon_mail_thema_sales_marketing_communicatie,crm_persoon_mail_thema_strategie_en_algemeen_management,...,crm_activiteitscode_naam_Papier & karton,crm_activiteitscode_naam_Technologische industrie & diensten,crm_activiteitscode_naam_Telecom & IT,"crm_activiteitscode_naam_Textiel, kleding en confectie",crm_activiteitscode_naam_Vastgoed,crm_activiteitscode_naam_Verenigingen en maatschappelijke organisaties,crm_activiteitscode_naam_Verzekering,crm_activiteitscode_naam_Voeding,crm_activiteitscode_naam_Vrije beroepen,crm_activiteitscode_naam_Zorg
0,Zaakvoerder,B05C5B73-4F93-E511-B7D1-005056B06EC4,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Business manager,CB54A5ED-0D3F-ED11-9DB0-6045BD895BFB,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Zaakvoerder,4B95BAFD-ABFA-E611-80E4-001DD8B72B62,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Co-owner,80E226C4-AFC5-E911-8104-001DD8B72B61,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zaakvoerder,9DC0BE41-64B2-E711-80EC-001DD8B72B62,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55481,Connect & Develop P&G,DD2FAD8A-8CEF-E611-80E4-001DD8B72B62,Ja,Nee,Ja,Nee,Nee,Ja,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55482,"HR Manager, BENELUX",8D1F28C1-0673-E111-B43A-00505680000A,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55483,Shopmanager,F3025D6B-C95E-EA11-810F-001DD8B72B62,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55484,Algemeen Directeur,CE6EF7AA-AD72-E111-B43A-00505680000A,Nee,Nee,Nee,Nee,Nee,Nee,Nee,Nee,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Bring all of the columns to numeric values
import pickle

from sklearn.preprocessing import LabelEncoder

# Get all of the columns that are not numeric
non_numeric_columns = df.select_dtypes(['object']).columns.tolist()

# Keep a reference to the id column; it is the one object column that is
# left unencoded.
id_column = df["crm_contact_id"]

# Fit ONE encoder PER column and remember each of them. Re-fitting a single
# shared encoder overwrites its classes on every iteration, so only the last
# column's mapping would survive and the other columns could not be decoded.
label_encoders = {}
le = LabelEncoder()  # stays unfitted only when there is nothing to encode
for col in non_numeric_columns:
    if col == "crm_contact_id":
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Save the per-column encoders ({column name: fitted LabelEncoder}).
with open(os.path.join(SAVE_LOCATION, "label_encoders.pickle"), "wb") as f:
    pickle.dump(label_encoders, f)

# Legacy file, kept so existing readers do not break: it holds the encoder of
# the LAST encoded column only, exactly as before.
with open(os.path.join(SAVE_LOCATION, "label_encoder.pickle"), "wb") as f:
    pickle.dump(le, f)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55486 entries, 0 to 55485
Columns: 129 entries, crm_contact_functietitel to crm_activiteitscode_naam_Zorg
dtypes: float64(107), int32(21), object(1)
memory usage: 50.2+ MB


In [18]:
# Write the prepared frame to CSV, leaving the row index out of the file.
output_path = os.path.join("./data/", "data_new.csv")
df.to_csv(output_path, index=False)