<a href="https://colab.research.google.com/github/Ilvecho/Project_7/blob/main/Project_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import os
import pickle
import re
import random

import torch
from torch import nn
from torch import flatten
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam

from google.colab import files,drive
drive.mount('/content/gdrive')

# Pick the compute device: the first CUDA GPU when torch reports one, the CPU otherwise
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

Mounted at /content/gdrive


In [112]:
# Load the CRM CSV file from the mounted Google Drive folder into a DataFrame
data = pd.read_csv('/content/gdrive/MyDrive/P7_files/SalesCRM - CRM.csv')

In [113]:
# Print every column name of the raw table, one per line
for col in data.columns:
  print(col)

ID
Country
Education
First Contact
Last Contact
Status
Stage
First Call
Signed up for a demo
Filled in customer survey
Did sign up to the platform
Account Manager assigned
Subscribed


In [114]:
# Many cells of the categorical columns are NaN. Give them an explicit
# 'Missing' label so that they are counted as a category of their own.
# Work on a copy so that the raw `data` frame stays untouched.
tmp = data.copy()
for categorical_column in ('Country', 'Education', 'Status', 'Stage'):
  tmp[categorical_column] = tmp[categorical_column].fillna('Missing')

# Use the 'ID' column as the row index
tmp = tmp.set_index('ID')

Let's first analyze the **categorical** features.

# Exploratory Data Analysis - Country

In [115]:
# Bar plot of the number of rows per country
country_counts = tmp.groupby('Country').size()

fig = px.bar(tmp, x=country_counts.values, y=country_counts.index)

# Label the axes and hide axis lines / grid lines
fig.update_layout(
    xaxis_title='Count',
    yaxis_title='Country',
    xaxis={'showline': False, 'showgrid': False},
    yaxis={'showline': False, 'showgrid': False},
)

fig.show()

I did not expect this many distinct values for the country. Let's analyze this in more detail:

In [116]:
# Number of rows for each distinct Country value (NaN was relabelled 'Missing' earlier)
tmp['Country'].value_counts()

USA                         6604
Missing                      891
Canada                       817
France                       336
UK                           329
                            ... 
uSA                            1
Hong Kong                      1
Singapore                      1
Cameroon                       1
Central African Republic       1
Name: Country, Length: 104, dtype: int64

There are **104** different Country values.
However, some of them are just typos or case variants (e.g. uSA), so let's normalize the strings a bit to get a more informative overview.

In [117]:
# Normalise the country strings: drop trailing whitespace first, then lower-case,
# so that variants such as 'uSA' and 'USA ' collapse into a single value
country_no_trailing_ws = tmp['Country'].str.rstrip()
countries = country_no_trailing_ws.str.lower()

# Show the frequencies of the normalised values
countries.value_counts()

usa                         6641
missing                      891
canada                       818
france                       336
uk                           331
                            ... 
full time                      1
bulgaria                       1
senegal                        1
jordan                         1
central african republic       1
Name: Country, Length: 94, dtype: int64

The normalization did indeed reveal some typos. \

We also added a **rstrip** function to remove trailing spaces.

The new count is **94**

In [118]:
# Look at the last 30 entries of the frequency table (the rarest country values)
countries.value_counts().tail(30)

bolivia                     2
kenya                       2
malaysia                    2
sri lanka                   2
philadelphia                1
congo                       1
california                  1
czechia                     1
russia&ukraine              1
nottingham                  1
korea                       1
bulgaria & uk               1
vietnamese                  1
greek                       1
romania                     1
turkey                      1
latvia                      1
hong kong                   1
seoul                       1
venezuela                   1
chuang                      1
benin                       1
guinea                      1
cameroon                    1
serbia                      1
full time                   1
bulgaria                    1
senegal                     1
jordan                      1
central african republic    1
Name: Country, dtype: int64

Looking at the tail, we still notice some duplicates and invalid entries (non-comprehensive list):
- Czechia is mentioned twice
- Bulgaria is mentioned twice
- Nottingham is a city, not a country. The same is true for Seoul, Philadelphia and California
- Vietnamese is not a country
- Full time is not a country

Hence, let's create a lookup table to pre-process the country column

In [119]:
# Manual corrections for the normalised (lower-case) country strings.
# Each key is a value found in the data; each value is the country string
# it is folded into.
lookup_table = {
    # folded into 'uk'
    'nottingham': 'uk',
    'england': 'uk',
    # folded into 'usa'
    'california': 'usa',
    'philadelphia': 'usa',
    'ca': 'usa',
    # folded into 'south korea'
    'seoul': 'south korea',
    'korea': 'south korea',
    # adjective forms -> country name
    'greek': 'greece',
    'vietnamese': 'vietnam',
    # entries naming two countries keep only the first one
    'bulgaria & uk': 'bulgaria',
    'russia&ukraine': 'russia',
    # remaining one-off corrections
    'dubai': 'united arab emirates',
    'czechia (czech republic)': 'czechia',
    'chuang': 'china',
    # placeholder value -> same label used for NaN
    '-': 'missing',
}

In [120]:
# Substitute every value that has an entry in lookup_table; values without an
# entry are returned unchanged
countries = countries.apply(lambda name: lookup_table.get(name, name))

# Show the frequencies after the corrections
countries.value_counts()

usa                         6645
missing                      901
canada                       818
france                       336
uk                           334
                            ... 
benin                          1
cameroon                       1
full time                      1
turkey                         1
central african republic       1
Name: Country, Length: 79, dtype: int64

Now we have **79** unique values

In [121]:
# Write the cleaned country values back to the working frame
tmp['Country'] = countries.copy()

# Bar plot of the number of rows per cleaned country value
cleaned_country_counts = countries.value_counts()

fig = px.bar(tmp, x=cleaned_country_counts, y=cleaned_country_counts.index)

# Label the axes and hide axis lines / grid lines
fig.update_layout(
    xaxis_title='Count',
    yaxis_title='Country',
    xaxis={'showline': False, 'showgrid': False},
    yaxis={'showline': False, 'showgrid': False},
)

fig.show()

# Exploratory Data Analysis - Education

In [122]:
# Bar plot of the number of rows per education level
education_counts = tmp.groupby('Education').size()

fig = px.bar(tmp, x=education_counts.values, y=education_counts.index)

# Label the axes and hide axis lines / grid lines
fig.update_layout(
    xaxis_title='Count',
    yaxis_title='Education',
    xaxis={'showline': False, 'showgrid': False},
    yaxis={'showline': False, 'showgrid': False},
)

fig.show()

The education category has a limited number of options - good.

Still, it is important to notice that a lot of samples have the value **missing**.

# Exploratory Data Analysis - Status

The **Status** column, formerly "Message State", indicates how many times a potential customer was contacted. If the customer at some point is interested, they will proceed with the demo call

In [123]:
# Bar plot of the number of rows per Status value
status_counts = tmp.groupby('Status').size()

fig = px.bar(tmp, x=status_counts.values, y=status_counts.index)

# Label the axes and hide axis lines / grid lines
fig.update_layout(
    xaxis_title='Count',
    yaxis_title='Status',
    xaxis={'showline': False, 'showgrid': False},
    yaxis={'showline': False, 'showgrid': False},
)

fig.show()

# Exploratory Data Analysis - Stage

The **Stage** column reflects the potential customer's reaction before and/or after the demo call

In [124]:
# Bar plot of the number of rows per Stage value
stage_counts = tmp.groupby('Stage').size()

fig = px.bar(tmp, x=stage_counts.values, y=stage_counts.index)

# Label the axes and hide axis lines / grid lines
fig.update_layout(
    xaxis_title='Count',
    yaxis_title='Stage',
    xaxis={'showline': False, 'showgrid': False},
    yaxis={'showline': False, 'showgrid': False},
)

fig.show()

# Date features

All the other features represent a date in time.

As a first thing, we convert the columns to **datetime format**

In [125]:
# There is one row that has format Y-d-m rather than Y-m-d: overwrite both of
# its contact dates with the corrected Y-m-d string.
# The assignment goes through a single .iloc call on the frame itself. Chained
# indexing (tmp[col].iloc[row] = ...) may write to a temporary object instead
# of `tmp`, which would leave the bad value in place.
malformed_row = 9737  # positional row number, not an ID label
tmp.iloc[malformed_row, tmp.columns.get_loc('First Contact')] = "2021-12-13"
tmp.iloc[malformed_row, tmp.columns.get_loc('Last Contact')] = "2021-12-13"

In [126]:
# Change the First contact to datetime
# The exceptions are handled above
# A fixed Y-m-d format is passed, so every value of the column is parsed with that layout
tmp['First Contact'] = pd.to_datetime(tmp['First Contact'], format='%Y-%m-%d')

The last contact date can have different formats, so we need to be careful in the parsing

In [137]:
# Parse the 'Last Contact' column value by value, because it mixes several
# date layouts. Work on a copy and write the result back at the end.
last_contact = tmp['Last Contact'].copy()

# Recognised layouts. Compiled once, outside the loop, since they never change.
# pattern_1 must end with a 2-digit day: with 4 digits there, no YYYY-MM-DD
# string would ever match and those values would silently stay strings.
pattern_1 = re.compile(r'^[0-9]{4}\-[0-9]{2}\-[0-9]{2}$')  # YYYY-MM-DD
pattern_2 = re.compile(r'^[0-9]{2}\.[0-9]{2}\.[0-9]{4}$')  # DD.MM.YYYY
pattern_3 = re.compile(r'^[0-9]{2}\-[0-9]{2}\-[0-9]{4}$')  # DD-MM-YYYY or MM-DD-YYYY

for i in list(last_contact.index):
  raw_value = last_contact.loc[i]

  if isinstance(raw_value, pd.Timestamp):
    # Already parsed: nothing to do
    pass
  elif pd.isna(raw_value):
    # Use a single missing-value marker for dates
    last_contact.loc[i] = pd.NaT
  elif pattern_1.match(raw_value):
    last_contact.loc[i] = pd.to_datetime(raw_value, format='%Y-%m-%d')
  elif pattern_2.match(raw_value):
    last_contact.loc[i] = pd.to_datetime(raw_value, format='%d.%m.%Y')
  elif pattern_3.match(raw_value):
    # Ambiguous layout: day-first is tried first, month-first is used only
    # when the day-first reading is not a valid date
    try:
      last_contact.loc[i] = pd.to_datetime(raw_value, format='%d-%m-%Y')
    except ValueError:
      last_contact.loc[i] = pd.to_datetime(raw_value, format='%m-%d-%Y')
  # Strings matching none of the layouts are left unchanged

tmp['Last Contact'] = last_contact

In [139]:
# Parse the 'First Call' column value by value, because it mixes several
# date layouts. Work on a copy and write the result back at the end.
first_call = tmp['First Call'].copy()

# Recognised layouts. Compiled once, outside the loop, since they never change.
# pattern_1 must end with a 2-digit day: with 4 digits there, no YYYY-MM-DD
# string would ever match and those values would silently stay strings.
pattern_1 = re.compile(r'^[0-9]{4}\-[0-9]{2}\-[0-9]{2}$')  # YYYY-MM-DD
pattern_2 = re.compile(r'^[0-9]{2}\.[0-9]{2}\.[0-9]{4}$')  # DD.MM.YYYY
pattern_3 = re.compile(r'^[0-9]{2}\-[0-9]{2}\-[0-9]{4}$')  # DD-MM-YYYY or MM-DD-YYYY

for i in list(first_call.index):
  raw_value = first_call.loc[i]

  if isinstance(raw_value, pd.Timestamp):
    # Already parsed: nothing to do
    pass
  elif pd.isna(raw_value):
    # Use a single missing-value marker for dates
    first_call.loc[i] = pd.NaT
  elif pattern_1.match(raw_value):
    first_call.loc[i] = pd.to_datetime(raw_value, format='%Y-%m-%d')
  elif pattern_2.match(raw_value):
    first_call.loc[i] = pd.to_datetime(raw_value, format='%d.%m.%Y')
  elif pattern_3.match(raw_value):
    # Ambiguous layout: day-first is tried first, month-first is used only
    # when the day-first reading is not a valid date
    try:
      first_call.loc[i] = pd.to_datetime(raw_value, format='%d-%m-%Y')
    except ValueError:
      first_call.loc[i] = pd.to_datetime(raw_value, format='%m-%d-%Y')
  # Strings matching none of the layouts are left unchanged

tmp['First Call'] = first_call