In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one.
# Later cells rely on this to show several results per cell (including
# expressions inside for-loops) without explicit print()/display() calls.
InteractiveShell.ast_node_interactivity = "all"

Now we load in the data file.

In [2]:
# Read the raw lead export. The encoding is passed explicitly so that
# unusual (non-ASCII) characters in the titles come through intact.
original_data = pd.read_csv(
    "../Data/Historical Lead Records - Condensed.csv",
    encoding="utf-8",
)
# Work on a copy so the untouched load stays available as `original_data`.
lead_data = original_data.copy()
# Preview the first rows of the working copy.
lead_data.head(20)

Unnamed: 0,Title,Job Role,Job Function,Job Level
0,Manager-Cybersecurity,Information Security,IT,Manager
1,"Manager, Information Security",Information Security,IT,Manager
2,User Experience Analyst,Development,Engineering,Contributor
3,Network Specialist,Networking,IT,Contributor
4,Director of Privacy and Compliance,Information Security,IT,Director
5,"Assistant Vp, Network Architecture Ccie 1676",Networking,IT,Contributor
6,"Director, Information Security",Networking,IT,Director
7,CIO,Information Security,IT,C-Level
8,Director Cloud Operations,IT General,IT,Director
9,"Director, Enterprise Applications",Networking,IT,Director


Dimensions of data

In [3]:
# (number of rows, number of columns) of the working copy.
lead_data.shape

(865671, 4)

Unique values count

In [4]:
# Count of distinct values per column (nunique ignores NaN by default).
lead_data.nunique()

Title           186749
Job Role            35
Job Function        52
Job Level           28
dtype: int64

Detailed summary

In [5]:
# Raise the row display limit so the long frequency tables are not elided.
pd.set_option('display.max_rows', 100)
# Frequency table for every column except the free-text Title.
# Each table is echoed because the notebook displays all expression values.
for col in lead_data.columns.drop('Title'):
    lead_data[col].value_counts()

Job Role
Information Security                      266133
Networking                                207794
None Technical                            160988
Development                                82228
IT General                                 69190
Governance Risk Compliance                 20438
Program Management                         10057
Help Desk                                   8709
Non-IT                                      8280
Business Systems                            8225
Systems                                     7882
Operations                                  4851
Data                                        3531
Communications                              1353
Integration                                  563
IT Facilities                                493
Vendor Management                            187
Training                                     130
Business Continuity                           80
SVP                                           12
Security   

Job Function
IT                                                                                                                     602920
Engineering                                                                                                             80429
Sales                                                                                                                   35541
Management                                                                                                              32222
Unknown                                                                                                                 31296
Support                                                                                                                 16457
Operations                                                                                                              12870
Finance                                                                                                  

Job Level
Contributor                              306530
Manager                                  175443
Director                                 143140
Executive                                109596
C-Level                                   97370
Unknown                                    5495
Non-Manager                                2768
VP-Level                                    298
Team Lead                                   154
VP/Director                                 107
Engineer/Admin                               68
C-level                                      54
CxO                                          38
Decision maker                               16
contributor                                  10
Director Level                                9
VP                                            8
Director / C-Level                            7
Member of Technical Staff 2                   2
Individual Contributor                        2
C-suite                       

In [6]:
# Raise the row display limit so the long frequency tables are not elided.
pd.set_option('display.max_rows', 100)
# For every column except Title: relative frequency of each category.
# value_counts drops NaN by default, so shares are of the non-null rows.
for col in lead_data.columns.drop('Title'):
    temp = lead_data[col].value_counts(normalize=True)
    temp
    # Combined share covered by the five most frequent categories.
    temp.head(5).sum()

Job Role
Information Security                      0.309037
Networking                                0.241293
None Technical                            0.186941
Development                               0.095484
IT General                                0.080344
Governance Risk Compliance                0.023733
Program Management                        0.011678
Help Desk                                 0.010113
Non-IT                                    0.009615
Business Systems                          0.009551
Systems                                   0.009153
Operations                                0.005633
Data                                      0.004100
Communications                            0.001571
Integration                               0.000654
IT Facilities                             0.000572
Vendor Management                         0.000217
Training                                  0.000151
Business Continuity                       0.000093
SVP                   

0.9130995193742459

Job Function
IT                                                                                                                     0.700591
Engineering                                                                                                            0.093458
Sales                                                                                                                  0.041299
Management                                                                                                             0.037442
Unknown                                                                                                                0.036366
Support                                                                                                                0.019123
Operations                                                                                                             0.014955
Finance                                                                                    

0.9091551357908779

Job Level
Contributor                              0.364429
Manager                                  0.208581
Director                                 0.170177
Executive                                0.130297
C-Level                                  0.115762
Unknown                                  0.006533
Non-Manager                              0.003291
VP-Level                                 0.000354
Team Lead                                0.000183
VP/Director                              0.000127
Engineer/Admin                           0.000081
C-level                                  0.000064
CxO                                      0.000045
Decision maker                           0.000019
contributor                              0.000012
Director Level                           0.000011
VP                                       0.000010
Director / C-Level                       0.000008
Member of Technical Staff 2              0.000002
Individual Contributor                  

0.989245355922128

A few directives to clean this data:

1. Job roles to remain - Information Security (group in 'INformation Security', 'information security'), Networking (group in 'Netoworking'), IT General (group in 'IT Facilities', 'IT', 'Senior Manager, Information Technology'), Development, Systems (Group in 'Business Systems'), Governance Risk Compliance (not indicated but there are a significant number of them, group in 'Senior Manager, Security, Risk, and Compliance', 'IT/IS Compliance/Risk/Control Staff'); everything else will flow through as Non-ICP
2. Job functions to remain - IT (group in 'Information Technology','IT - Security','IT - Network','Information Security, Information Technology','IT Operations','IT-Sec Admin','Director Global IT','Information Security, Information Technology, Enterprise Architecture','It','Information Technology, Information Technology Executive'), Engineering (include 'Engineering & Technical','Engineer SASE'), Procurement (group in 'Purchasing','Sourcing / Procurement'), Risk/Legal/Compliance (include 'Legal','Risk, Legal Operations','Lawyer / Attorney','Governmental Affairs & Regulatory Law')
3. Job levels to remain - Contributor (include 'Individual Contributor','contributor','contribtuor'), Manager (include 'Management','Manager Level','manager','Threat Hunting Manager','IT Security Manager'), Executive (include 'Senior Executive','Exec.'), Director (include 'Director Level','IT Infrastructure Director','Director of Enterprise Cloud Business','IT Security Director'), C-Level (group in 'C-level','CxO','C level','C-suite','Director (It & Project) & Chief Information Security Officer','C Level')

Number of rows where at least one of the output columns (Job Role, Job Function, Job Level) is NA

In [7]:
# Flag rows that are missing at least one of the three target labels.
# The mask is built once and reused for both the count and the share,
# instead of re-evaluating the three-way OR for each displayed value.
any_label_missing = lead_data[['Job Role', 'Job Function', 'Job Level']].isna().any(axis=1)
# Number of affected rows.
any_label_missing.sum()
# Share of all rows.
any_label_missing.sum() / lead_data.shape[0]

29420

0.03398519760971547

To make modeling easier, it might be acceptable to drop this 3% of the data. Next we want the number of distinct words in the job title field; before tokenizing, check how many titles are NaN, since those have to be filtered out.

In [8]:
# Rows with no Title. The mask is computed once and reused for both outputs.
title_missing = lead_data['Title'].isna()
# Number of rows with a missing title.
title_missing.sum()
# Share of all rows.
title_missing.sum() / lead_data.shape[0]

12690

0.01465914879902411

In [9]:
# Flag rows missing any of the three labels or the Title itself.
# The mask is built once and reused for both the count and the share,
# instead of re-evaluating the four-way OR for each displayed value.
any_field_missing = lead_data[['Job Role', 'Job Function', 'Job Level', 'Title']].isna().any(axis=1)
# Number of affected rows.
any_field_missing.sum()
# Share of all rows.
any_field_missing.sum() / lead_data.shape[0]

31727

0.0366501823441007

Nearly 32K rows have an NA in at least one of the four columns, which is about 3.7% of the data.

In [10]:
# Tokenize every non-null title: lower-case it, then split on whitespace.
# Splitting is on whitespace only, so punctuation stays attached to tokens
# (e.g. "manager," is a different token from "manager").
# Note: despite its name, `words_series` is a plain list of token lists.
words_series = (
    lead_data['Title']
    .dropna()
    .str.lower()
    .str.split()
    .tolist()
)
# Flatten the per-title token lists into one list of all tokens.
words_list = [token for tokens in words_series for token in tokens]

Number of unique words

In [11]:
# Collapse the token list to its distinct values (the title vocabulary).
words_set = set(words_list)
# Vocabulary size.
len(words_set)

31772

Remove stopwords

In [12]:
# Make sure the NLTK stopword corpus is available locally; the call's return
# value is echoed as cell output.
nltk.download('stopwords')
# English stopwords as a set, for set arithmetic against the title vocabulary.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\csarc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Vocabulary size once English stopwords are excluded.
len(words_set.difference(stop_words))

31669

Only 103 of the 31,772 distinct tokens are stopwords, so removing them barely changes the vocabulary size. (This compares distinct tokens only, not how often each one occurs.) Thus, for the model, I won't bother removing stopwords.