In [58]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Now we load in the data file.

In [59]:
original_data = pd.read_csv("../Data/Historical Lead Records.csv",
                        encoding="ISO-8859-1")
# Need encoding change for weird characters to come through
lead_data = original_data.copy()
lead_data.head(20)

Unnamed: 0,Title,Job Role,Job Function,Job Level
0,Chief Information Security Officer,Information Security,IT,C-Level
1,Group IT Director,IT General,IT,Director
2,"Regional Health Administrator, Region 7",None Technical,Medical,Contributor
3,Cyber Analyst,Information Security,IT,Contributor
4,Director Transformación Digital,Networking,IT,Director
5,Head of Change Management,Program Management,IT,Executive
6,"Head, Technology - Digital Bank",IT General,IT,Executive
7,Director de Riesgos,IT General,IT,Director
8,IT Director - Information Security & Risk Mana...,Networking,IT,Director
9,Lead Architect,Networking,IT,Manager


Dimensions of data

In [60]:
lead_data.shape

(1647488, 4)

Unique values count

In [61]:
lead_data.nunique()

Title           335895
Job Role            42
Job Function        90
Job Level           56
dtype: int64

Detailed summary

In [62]:
pd.set_option('display.max_rows',100)
for col in lead_data.drop('Title',axis = 1):
    lead_data[col].value_counts()

Job Role
Information Security                              459381
None Technical                                    410687
Networking                                        299567
IT General                                        168195
Development                                       137220
Governance Risk Compliance                         30575
Non-IT                                             25702
Systems                                            22855
Program Management                                 17716
Business Systems                                   14688
Help Desk                                          14100
Operations                                          8331
Data                                                5962
Communications                                      1985
Integration                                         1368
IT Facilities                                        728
Vendor Management                                    445
Training              

Job Function
IT                                                                                                                     1036097
Engineering                                                                                                             143806
Unknown                                                                                                                 120845
Sales                                                                                                                    93194
Management                                                                                                               63094
Support                                                                                                                  33666
Operations                                                                                                               30653
Marketing                                                                                         

Job Level
Contributor                                                     605724
Manager                                                         336357
Executive                                                       212423
Director                                                        209707
C-Level                                                         172919
Unknown                                                          13594
Non-Manager                                                       4221
C-level                                                            600
VP-Level                                                           336
Team Lead                                                          165
Individual Contributor                                             118
VP/Director                                                        113
Engineer/Admin                                                      75
Other                                                              

In [63]:
pd.set_option('display.max_rows',100)
for col in lead_data.drop('Title',axis = 1):
    temp = lead_data[col].value_counts(normalize=True)
    temp
    temp.iloc[0:5].sum()

Job Role
Information Security                              2.835496e-01
None Technical                                    2.534936e-01
Networking                                        1.849056e-01
IT General                                        1.038172e-01
Development                                       8.469806e-02
Governance Risk Compliance                        1.887220e-02
Non-IT                                            1.586437e-02
Systems                                           1.410708e-02
Program Management                                1.093507e-02
Business Systems                                  9.066062e-03
Help Desk                                         8.703123e-03
Operations                                        5.142250e-03
Data                                              3.680002e-03
Communications                                    1.225227e-03
Integration                                       8.443882e-04
IT Facilities                                 

0.9104639937584407

Job Function
IT                                                                                                                     6.401738e-01
Engineering                                                                                                            8.885349e-02
Unknown                                                                                                                7.466657e-02
Sales                                                                                                                  5.758183e-02
Management                                                                                                             3.898392e-02
Support                                                                                                                2.080123e-02
Operations                                                                                                             1.893959e-02
Marketing                                                      

0.9002596292035278

Job Level
Contributor                                                     3.891182e-01
Manager                                                         2.160764e-01
Executive                                                       1.364609e-01
Director                                                        1.347162e-01
C-Level                                                         1.110835e-01
Unknown                                                         8.732811e-03
Non-Manager                                                     2.711578e-03
C-level                                                         3.854411e-04
VP-Level                                                        2.158470e-04
Team Lead                                                       1.059963e-04
Individual Contributor                                          7.580342e-05
VP/Director                                                     7.259141e-05
Engineer/Admin                                                  4.

0.9874551764099758

A few directives to clean this data:

1. Job roles to remain - Information Security (group in 'INformation Security', 'information security'), Networking (group in 'Netoworking'), IT General (group in 'IT Facilities', 'IT', 'Senior Manager, Information Technology'), Development, Systems (Group in 'Business Systems'), Governance Risk Compliance (not indicated but there are a significant number of them, group in 'Senior Manager, Security, Risk, and Compliance', 'IT/IS Compliance/Risk/Control Staff'); everything else will flow through as Non-ICP
2. Job functions to remain - IT (group in 'Information Technology','IT - Security','IT - Network','Information Security, Information Technology','IT Operations','IT-Sec Admin','Director Global IT','Information Security, Information Technology, Enterprise Architecture','It','Information Technology, Information Technology Executive'), Engineering (include 'Engineering & Technical','Engineer SASE'), Procurement (group in 'Purchasing','Sourcing / Procurement'), Risk/Legal/Compliance (include 'Legal','Risk, Legal Operations','Lawyer / Attorney','Governmental Affairs & Regulatory Law')
3. Job levels to remain - Contributor (include 'Individual Contributor','contributor','contribtuor'), Manager (include 'Management','Manager Level','manager','Threat Hunting Manager','IT Security Manager'), Executive (include 'Senior Executive','Exec.'), Director (include 'Director Level','IT Infrastructure Director','Director of Enterprise Cloud Business','IT Security Director'), C-Level (group in 'C-level','CxO','C level','C-suite','Director (It & Project) & Chief Information Security Officer','C Level')

In [64]:
lead_data = lead_data.replace({'Job Role':['INformation Security', 'information security']}, 'Information Security')
lead_data = lead_data.replace({'Job Role':['Netoworking']}, 'Networking')
lead_data = lead_data.replace({'Job Role':['IT Facilities', 'IT', 'Senior Manager, Information Technology']}, 'IT General')
lead_data = lead_data.replace({'Job Role':['Business Systems']}, 'Systems')
lead_data = lead_data.replace({'Job Role':['Senior Manager, Security, Risk, and Compliance', 'IT/IS Compliance/Risk/Control Staff']}, 'Governance Risk Compliance')
lead_data.loc[~lead_data['Job Role'].isin(['Information Security','Networking','IT General','Systems','Governance Risk Compliance']) &
              ~lead_data['Job Role'].isna(),
              lead_data.columns == 'Job Role'] = 'Non-ICP'

In [65]:
lead_data = lead_data.replace({'Job Function':['Information Technology','IT - Security','IT - Network','Information Security, Information Technology','IT Operations','IT-Sec Admin','Director Global IT','Information Security, Information Technology, Enterprise Architecture','It','Information Technology, Information Technology Executive']},
                              'IT')
lead_data = lead_data.replace({'Job Function':['Engineering & Technical','Engineer SASE']},'Engineering')
lead_data = lead_data.replace({'Job Function':['Purchasing','Sourcing / Procurement']},'Procurement')
lead_data = lead_data.replace({'Job Function':['Legal','Risk, Legal Operations','Lawyer / Attorney','Governmental Affairs & Regulatory Law']},
                              'Risk/Legal/Compliance')
lead_data.loc[~lead_data['Job Function'].isin(['IT','Engineering','Procurement','Risk/Legal/Compliance']) &
              ~lead_data['Job Function'].isna(),
              lead_data.columns == 'Job Function'] = 'Non-ICP'

In [66]:
lead_data = lead_data.replace({'Job Level':['Individual Contributor','contributor','contribtuor']},'Contributor')
lead_data = lead_data.replace({'Job Level':['Management','Manager Level','manager','Threat Hunting Manager','IT Security Manager']},'Manager')
lead_data = lead_data.replace({'Job Level':['Senior Executive','Exec.']},'Executive')
lead_data = lead_data.replace({'Job Level':['Director Level','IT Infrastructure Director','Director of Enterprise Cloud Business','IT Security Director']},'Director')
lead_data = lead_data.replace({'Job Level':['C-level','CxO','C level','C-suite','Director (It & Project) & Chief Information Security Officer','C Level']},'C-Level')
lead_data.loc[~lead_data['Job Level'].isin(['Contributor','Manager','Executive','Director','C-Level']) &
              ~lead_data['Job Level'].isna(),
              lead_data.columns == 'Job Level'] = 'Unknown'

Now let's look at the counts of the modified dataframe

In [67]:
for col in lead_data.drop('Title',axis = 1):
    lead_data[col].value_counts()

Job Role
Non-ICP                       624092
Information Security          459389
Networking                    299568
IT General                    168933
Systems                        37543
Governance Risk Compliance     30583
Name: count, dtype: int64

Job Function
IT                       1036136
Non-ICP                   429404
Engineering               143817
Procurement                 5872
Risk/Legal/Compliance       3233
Name: count, dtype: int64

Job Level
Contributor    605854
Manager        336396
Executive      212425
Director       209726
C-Level        173566
Unknown         18691
Name: count, dtype: int64

In [68]:
for col in lead_data.drop('Title',axis = 1):
    lead_data[col].value_counts(normalize=True)

Job Role
Non-ICP                       0.385216
Information Security          0.283555
Networking                    0.184906
IT General                    0.104273
Systems                       0.023173
Governance Risk Compliance    0.018877
Name: proportion, dtype: float64

Job Function
IT                       0.640198
Non-ICP                  0.265316
Engineering              0.088860
Procurement              0.003628
Risk/Legal/Compliance    0.001998
Name: proportion, dtype: float64

Job Level
Contributor    0.389202
Manager        0.216101
Executive      0.136462
Director       0.134728
C-Level        0.111499
Unknown        0.012007
Name: proportion, dtype: float64

Number of rows where any one of the outputs are NA

In [69]:
(lead_data['Job Role'].isna()|lead_data['Job Function'].isna()|lead_data['Job Level'].isna()).sum()
(lead_data['Job Role'].isna()|lead_data['Job Function'].isna()|lead_data['Job Level'].isna()).sum()/lead_data.shape[0]

112178

0.06809032903426307

To make modeling easier, it might be acceptable to drop this 7% of the data. Now let's see the number of distinct words in the job title field. Need to filter out NaNs.

In [70]:
lead_data['Title'].isna().sum()
lead_data['Title'].isna().sum()/lead_data.shape[0]

47285

0.028701271268743687

In [71]:
(lead_data['Job Role'].isna()|lead_data['Job Function'].isna()|lead_data['Job Level'].isna()|lead_data['Title'].isna()).sum()
(lead_data['Job Role'].isna()|lead_data['Job Function'].isna()|lead_data['Job Level'].isna()|lead_data['Title'].isna()).sum()/lead_data.shape[0]

119472

0.07251767539429725

Nearly 50K NAs, which is about 3% of the data.

In [72]:
words_series = lead_data['Title'].loc[~lead_data['Title'].isna()].str.lower().str.split().tolist()
words_list = [word for title in words_series for word in title]

Unique number of words

In [73]:
words_set = set(words_list)
len(words_set)

63781

Remove stopwords

In [74]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\csarc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
len(words_set - stop_words)

63669

Not that many overlapping, so there's not really that much of an impact of removing stopwords. Thus, for the model, I won't even bother to do this.