# Filmijob Capstone Project: Interview Question Scoring

In [27]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

## EDA (Exploratory Data Analysis)

### Importing the Dataset and checking the data

In [28]:
df=pd.read_csv("Dataset.csv")
df

Unnamed: 0,title,job description,questions
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,['A debtor has missed six months of payments. ...
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,['How would you create a report on debts from ...
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,['What would you do if you received a B-Notice...
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,['What would be the first thing you’d check if...
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"['If you could use only Excel, how would you o..."
...,...,...,...
278,Sales Advisor,"<h4>Job brief</h4><p><span style=""font-weight:...",['Imagine a customer asks you a question about...
279,Retail Buyer,"<h4>Job brief</h4><p><span style=""font-weight:...",['If I asked you to eliminate one line of our ...
280,Category Manager,"<h4>Job brief</h4><p><span style=""font-weight:...",['Here’s a spreadsheet with data. Can you spot...
281,Merchandiser,<h4>Job brief</h4><p>We are looking for an exp...,"['As a merchandiser, what do you think is the ..."


###  Counting the number of keywords and checking for repeated keywords

In [29]:
opened_file= open("keywords.txt")
from csv import reader
read_file = reader(opened_file)
keywords_data = list(read_file)

In [30]:
keywords_data[802]

['strong analytical skills']

In [31]:
len(keywords_data)

1410

In [32]:
keyword_list=[]
for keyword in keywords_data:
    keyword_list.append(keyword[0])
keyword_list
len(keyword_list)

1410

In [33]:
all_keywords_set=set(keyword_list)
all_keywords_list=list(all_keywords_set)
len(all_keywords_list)

1054

In [34]:
len(keyword_list)-len(all_keywords_list)

356

There are **356** keyword repetitions

In [35]:
all_keywords_list

['Evaluated',
 'Methodical',
 'Zealous',
 'public health',
 'Quantified',
 'daily operations',
 'Assembled',
 'Piloted',
 'Solicited',
 'Rousing',
 'licensing',
 'lighting',
 'financing',
 'audit',
 'finance',
 'Merged',
 'Visionary',
 'windows',
 'accounts receivable',
 'BI',
 'invoicing',
 'Fielded',
 'alliances',
 'recruitment',
 'status reports',
 'Photoshop',
 'aviation',
 'Microsoft Word',
 'value proposition',
 'AutoCAD',
 'proposal',
 'financial performance',
 'Consolidated',
 'acquisitions',
 'litigation',
 'Administered',
 'teaching',
 'key performance indicators',
 'Crafted',
 'financial reporting',
 'Amiable',
 'Precise',
 'Step-by-step',
 'Gathered',
 'hardware',
 'Calculating',
 'sales operations',
 'statistics',
 'support services',
 'dynamic environment',
 'Augmented',
 'Authorized',
 'Interacted',
 'management experience',
 'Interviewed',
 'office software',
 'Ensured',
 'Shattered',
 'Arbitrated',
 'Condensed',
 'Recruited',
 'supply chain',
 'Captured',
 'Amicable',


### Normalization for keywords (CV Parser)

In [74]:
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [75]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Carlos
[nltk_data]     Isidor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [76]:
# Download stopwords list
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package punkt to C:\Users\Carlos
[nltk_data]     Isidor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [77]:
lemmatizer = WordNetLemmatizer()

In [78]:
lemmatized_keywords_list=[]

for keyword in all_keywords_list:
    lemmatized_keywords_list.append(lemmatizer.lemmatize(keyword))

In [124]:
lemmatized_keywords_list

['Evaluated',
 'Methodical',
 'Zealous',
 'public health',
 'Quantified',
 'daily operations',
 'Assembled',
 'Piloted',
 'Solicited',
 'Rousing',
 'licensing',
 'lighting',
 'financing',
 'audit',
 'finance',
 'Merged',
 'Visionary',
 'window',
 'accounts receivable',
 'BI',
 'invoicing',
 'Fielded',
 'alliance',
 'recruitment',
 'status reports',
 'Photoshop',
 'aviation',
 'Microsoft Word',
 'value proposition',
 'AutoCAD',
 'proposal',
 'financial performance',
 'Consolidated',
 'acquisition',
 'litigation',
 'Administered',
 'teaching',
 'key performance indicators',
 'Crafted',
 'financial reporting',
 'Amiable',
 'Precise',
 'Step-by-step',
 'Gathered',
 'hardware',
 'Calculating',
 'sales operations',
 'statistic',
 'support services',
 'dynamic environment',
 'Augmented',
 'Authorized',
 'Interacted',
 'management experience',
 'Interviewed',
 'office software',
 'Ensured',
 'Shattered',
 'Arbitrated',
 'Condensed',
 'Recruited',
 'supply chain',
 'Captured',
 'Amicable',
 'iO

### Creating a new column with the number of question per role

In [36]:
df["number_of_questions"]=df["questions"].str.split("[^a-e]', ?'[^a-e]").str.len()
df

Unnamed: 0,title,job description,questions,number_of_questions
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,['A debtor has missed six months of payments. ...,16
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,['How would you create a report on debts from ...,14
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,['What would you do if you received a B-Notice...,14
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,['What would be the first thing you’d check if...,17
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"['If you could use only Excel, how would you o...",15
...,...,...,...,...
278,Sales Advisor,"<h4>Job brief</h4><p><span style=""font-weight:...",['Imagine a customer asks you a question about...,16
279,Retail Buyer,"<h4>Job brief</h4><p><span style=""font-weight:...",['If I asked you to eliminate one line of our ...,19
280,Category Manager,"<h4>Job brief</h4><p><span style=""font-weight:...",['Here’s a spreadsheet with data. Can you spot...,15
281,Merchandiser,<h4>Job brief</h4><p>We are looking for an exp...,"['As a merchandiser, what do you think is the ...",15


In [37]:
df.describe()

Unnamed: 0,number_of_questions
count,283.0
mean,15.342756
std,3.106703
min,7.0
25%,13.0
50%,15.0
75%,17.0
max,29.0


On **average**, we have **15** questions per role. Also, the role with **less** questions has **7** and the role with the max questions has **29** questions.

### Cheking if there are any duplicated roles

In [38]:
all_jobs=df["title"].tolist()
all_jobs_set=set(all_jobs)
all_jobs_list=list(all_jobs_set)
len(all_jobs_list)

283

All the rows have a different role and no role is duplicated

### Checking if any of the columns has null values

In [39]:
df.isnull().any()

title                  False
job description        False
questions              False
number_of_questions    False
dtype: bool

No null values on the dataset

### Inspecting the total number of questions and questions repetitions

First, we are goint to split the questions in every row of the questions column into a list of lists:

In [40]:
df["questions"]=df["questions"].str.split("[^a-e]', ?'[^a-e]")  
df

Unnamed: 0,title,job description,questions,number_of_questions
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,[['A debtor has missed six months of payments....,16
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,[['How would you create a report on debts from...,14
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,[['What would you do if you received a B-Notic...,14
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,[['What would be the first thing you’d check i...,17
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"[['If you could use only Excel, how would you ...",15
...,...,...,...,...
278,Sales Advisor,"<h4>Job brief</h4><p><span style=""font-weight:...",[['Imagine a customer asks you a question abou...,16
279,Retail Buyer,"<h4>Job brief</h4><p><span style=""font-weight:...",[['If I asked you to eliminate one line of our...,19
280,Category Manager,"<h4>Job brief</h4><p><span style=""font-weight:...",[['Here’s a spreadsheet with data. Can you spo...,15
281,Merchandiser,<h4>Job brief</h4><p>We are looking for an exp...,"[['As a merchandiser, what do you think is the...",15


After having a list of questions for every role, we are going to create a list of all the questions in the dataset:

In [41]:
all_questions=[]
for questions in df["questions"]:
    for question in questions:
        all_questions.append(question)
all_questions

["['A debtor has missed six months of payments. When you call the individual, they bring up financial difficulties and get emotional. How do you handle this",
 'hat payback plan would you recommend for a long-term client whose payments are due to be collected in X months',
 'f you called a client to update them on their payment status and they were aggressive or rude to you, how would you handle it',
 'an you think of a few common excuses that debtors make? How would you respond to each of them',
 'hat collection methods do you know? Which technique do you think would be more effective for our company/clients',
 'ow do you keep historical data for each account',
 'ow often do you follow up with delinquent account holders and what’s your approach when you get in touch',
 'ention three things a Collection Specialist should avoid saying when speaking to a debtor',
 'ow do you keep track of all payments',
 'hat software do you use to increase your productivity',
 'hat do you find interesti

In [42]:
len(all_questions)

4342

There are **4342** total questions

In [43]:
set_allquestions=set(all_questions)
unique_questions=list(set_allquestions)
len(unique_questions)

4141

There are **4141** different questions

In [44]:
num_repeated_questions=len(all_questions)-len(unique_questions)
num_repeated_questions

201

There are **201** question repetitions

Now, we are going to create a frequency table of questions to double check total number and repetitions:

In [45]:
questions_freq={}
for questions in df["questions"]:
    for question in questions:
        if question in questions_freq.keys():
            questions_freq[question]+=1
        else:
            questions_freq[question]=1
            
questions_freq    

{"['A debtor has missed six months of payments. When you call the individual, they bring up financial difficulties and get emotional. How do you handle this": 1,
 'hat payback plan would you recommend for a long-term client whose payments are due to be collected in X months': 1,
 'f you called a client to update them on their payment status and they were aggressive or rude to you, how would you handle it': 1,
 'an you think of a few common excuses that debtors make? How would you respond to each of them': 1,
 'hat collection methods do you know? Which technique do you think would be more effective for our company/clients': 1,
 'ow do you keep historical data for each account': 1,
 'ow often do you follow up with delinquent account holders and what’s your approach when you get in touch': 1,
 'ention three things a Collection Specialist should avoid saying when speaking to a debtor': 1,
 'ow do you keep track of all payments': 1,
 'hat software do you use to increase your productivity': 

In this step we order the dictionary of questions in descending order to check the "most repeated" questions:

In [46]:
{k: v for k, v in sorted(questions_freq.items(), key=lambda item: item[1],reverse=True)}

{'ow would you rate your computer skills': 5,
 'ow do you ensure accuracy in routine tasks such as processing expenses and preparing reports': 4,
 'hat constitutes a good unit test and what a functional one': 4,
 'hat accounting software have you used': 3,
 'ow familiar are you with SFAS (Statement of Financial Accounting Standards)': 3,
 'ow fast can you type': 3,
 'escribe your experience with calendar management': 3,
 'hat tools do you use for linting, debugging and profiling': 3,
 'hat do we mean when we say that a certain Lambda expression forms a closure': 3,
 'hat tools & practices would you consider necessary for a Continuous Delivery solution': 3,
 'o arguments in Java get passed by reference or by value': 3,
 'ow are Runtime exceptions different from Checked exceptions': 3,
 'ervlets 3.0 introduced async support. Describe a use case for it': 3,
 'ow do you troubleshoot a crashing application': 3,
 'hat’s your current occupation/What are you currently working on': 3,
 'ake us 

In [47]:
count=0
repetitions=0
uniques=0
rep=[]
for key,value in questions_freq.items():
    if value>1:
        count+=1
        repetitions+=value
        rep.append(key)
    else:
        uniques+=1
print("There are",count,"questions with at least 2 appearances")
print("There are",uniques,"unique questions")
print("There are a total of ",(uniques+repetitions),"questions appearances")


There are 169 questions with at least 2 appearances
There are 3972 unique questions
There are a total of  4342 questions appearances


In [48]:
count=0
rep=[]
for key,value in questions_freq.items():
        count+=1
        rep.append(key)
print(count)
len(rep)

4141


4141

## Feature Engineering

### Grouping Keywords with job descriptions

In [49]:
df_keywords = pd.read_csv("keywords.txt")

Creating random list to check if the code works 

In [53]:
keywords = ["accounting","and","payments","job","financial","Collection"] 
print(len(keywords)) 

6


In [54]:
key_final = df_keywords.stack().tolist()
#key_final

In [55]:
df["keyword"] = df["job description"].str.findall('|'.join(keywords)).apply(set).str.join(', ')
df.head(20)

Unnamed: 0,title,job description,questions,number_of_questions,keyword
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,[['A debtor has missed six months of payments....,16,"and, Collection, payments"
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,[['How would you create a report on debts from...,14,"and, job, accounting"
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,[['What would you do if you received a B-Notic...,14,"accounting, job, and, payments, financial"
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,[['What would be the first thing you’d check i...,17,"accounting, job, and, payments, financial"
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"[['If you could use only Excel, how would you ...",15,"and, accounting"
5,Certified Public Accountant (CPA),<h4>Job brief</h4><p>We are looking for an exp...,[['If you missed the deadline for paying our c...,15,"accounting, job, and, payments, financial"
6,Accounting Supervisor,<h4>Job brief</h4><p>We are looking for an Acc...,[['What measures would you take if you discove...,13,"and, job, accounting, financial"
7,Senior Auditor,<h4>Job brief</h4><p>We are looking for a Seni...,[['What would you do if a client/manager asked...,14,"and, accounting, financial, job"
8,Billing Specialist,"<h4>Job brief</h4><p><span style=""font-weight:...",[['Describe your typical day at your current/p...,14,"and, payments, financial"
9,Billing Clerk,"<h4>Job brief</h4><p><span style=""font-weight:...",[['What data would you present to inform your ...,16,"accounting, job, and, payments, financial"


Once we have the keywords that match the job description, we create a list out of them for further processing:

In [56]:
df["keyword"]=df["keyword"].str.split(", ")
df.head(20)

Unnamed: 0,title,job description,questions,number_of_questions,keyword
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,[['A debtor has missed six months of payments....,16,"[and, Collection, payments]"
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,[['How would you create a report on debts from...,14,"[and, job, accounting]"
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,[['What would you do if you received a B-Notic...,14,"[accounting, job, and, payments, financial]"
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,[['What would be the first thing you’d check i...,17,"[accounting, job, and, payments, financial]"
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"[['If you could use only Excel, how would you ...",15,"[and, accounting]"
5,Certified Public Accountant (CPA),<h4>Job brief</h4><p>We are looking for an exp...,[['If you missed the deadline for paying our c...,15,"[accounting, job, and, payments, financial]"
6,Accounting Supervisor,<h4>Job brief</h4><p>We are looking for an Acc...,[['What measures would you take if you discove...,13,"[and, job, accounting, financial]"
7,Senior Auditor,<h4>Job brief</h4><p>We are looking for a Seni...,[['What would you do if a client/manager asked...,14,"[and, accounting, financial, job]"
8,Billing Specialist,"<h4>Job brief</h4><p><span style=""font-weight:...",[['Describe your typical day at your current/p...,14,"[and, payments, financial]"
9,Billing Clerk,"<h4>Job brief</h4><p><span style=""font-weight:...",[['What data would you present to inform your ...,16,"[accounting, job, and, payments, financial]"


### Create a counter of keywords per question

In [95]:
column_list=[]
question_key_count={}
for role in df["questions"]:
    new_dict={}
    for question in role:
        new_dict[question]=0
    column_list.append(new_dict)
#column_list.append(question_key_count)
#print(column_list)
#print(len(column_list))

In [96]:
column_list=[]
question_key_count={}
for role in df["questions"]:
    new_dict={}
    for question in role:
        new_dict[question]=0
    column_list.append(new_dict)

i=0
keyword_count=0
for role_dict in column_list:  
    for question in role_dict.keys():
        for keyword in df["keyword"][i]: 
            if keyword in question:  
                keyword_count+=1
        role_dict[question]=keyword_count
        keyword_count=0
    i+=1
            
#print(column_list)
#print(len(column_list))

In [59]:
df["keywords_per_question"]=column_list
df.head()

Unnamed: 0,title,job description,questions,number_of_questions,keyword,keywords_per_question
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,[['A debtor has missed six months of payments....,16,"[and, Collection, payments]",{'['A debtor has missed six months of payments...
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,[['How would you create a report on debts from...,14,"[and, job, accounting]",{'['How would you create a report on debts fro...
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,[['What would you do if you received a B-Notic...,14,"[accounting, job, and, payments, financial]",{'['What would you do if you received a B-Noti...
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,[['What would be the first thing you’d check i...,17,"[accounting, job, and, payments, financial]",{'['What would be the first thing you’d check ...
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"[['If you could use only Excel, how would you ...",15,"[and, accounting]","{'['If you could use only Excel, how would you..."


In [60]:
df["keywords_per_question"][0]

{"['A debtor has missed six months of payments. When you call the individual, they bring up financial difficulties and get emotional. How do you handle this": 2,
 'hat payback plan would you recommend for a long-term client whose payments are due to be collected in X months': 1,
 'f you called a client to update them on their payment status and they were aggressive or rude to you, how would you handle it': 1,
 'an you think of a few common excuses that debtors make? How would you respond to each of them': 0,
 'hat collection methods do you know? Which technique do you think would be more effective for our company/clients': 0,
 'ow do you keep historical data for each account': 0,
 'ow often do you follow up with delinquent account holders and what’s your approach when you get in touch': 1,
 'ention three things a Collection Specialist should avoid saying when speaking to a debtor': 1,
 'ow do you keep track of all payments': 1,
 'hat software do you use to increase your productivity': 

In [61]:
df_test=pd.read_csv("Dataset.csv")
df_test

Unnamed: 0,title,job description,questions
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,['A debtor has missed six months of payments. ...
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,['How would you create a report on debts from ...
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,['What would you do if you received a B-Notice...
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,['What would be the first thing you’d check if...
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"['If you could use only Excel, how would you o..."
...,...,...,...
278,Sales Advisor,"<h4>Job brief</h4><p><span style=""font-weight:...",['Imagine a customer asks you a question about...
279,Retail Buyer,"<h4>Job brief</h4><p><span style=""font-weight:...",['If I asked you to eliminate one line of our ...
280,Category Manager,"<h4>Job brief</h4><p><span style=""font-weight:...",['Here’s a spreadsheet with data. Can you spot...
281,Merchandiser,<h4>Job brief</h4><p>We are looking for an exp...,"['As a merchandiser, what do you think is the ..."


In [97]:
# import SpaceTokenizer() method from nltk
from nltk.tokenize import SpaceTokenizer
	
# Create a reference variable for Class SpaceTokenizer
tk = SpaceTokenizer()
	
# Create a string input
#gfg = "Geeksfor Geeks.. .$$&* \nis\t for geeks"
	
# Use tokenize method
#geek = tk.tokenize(gfg)
	
#print(geek)

In [225]:
test3=[]

for role in df['questions']:
    test4=[]
    for question in role:
        test4.append(tk.tokenize(question))
    test3.append(test4)

In [231]:
df["prueba"] = test3

In [238]:
len(df["prueba"][0])

16

In [175]:
test4=[]
for role in test3:
    for token in role:
        test5=[]
        for word in token:
            test5.append(lemmatizer.lemmatize(word))
            test4.append(test5)

In [189]:
test3[0]

[['ow',
  'does',
  'empathy',
  'help',
  'you',
  'build',
  'trusting,',
  'long-term',
  'relationships?',
  'Give',
  'an',
  'example',
  'of',
  'a',
  'time',
  'when',
  'you',
  'were',
  'able',
  'to',
  'better',
  'understand',
  'and',
  'address',
  'a',
  'client’s',
  "needs.']"]]

In [137]:
for tokens in test3:
    for word in tokens:
        df_test["test_1"] += lemmatizer.lemmatize(word)

KeyboardInterrupt: 

In [None]:
df_test.head()

Unnamed: 0,title,job description,questions,test_1
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,['A debtor has missed six months of payments. ...,employees?']
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,['How would you create a report on debts from ...,employees?']
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,['What would you do if you received a B-Notice...,employees?']
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,['What would be the first thing you’d check if...,employees?']
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"['If you could use only Excel, how would you o...",employees?']


In [129]:
tk.tokenize(df_test['questions'][1])

["['How",
 'would',
 'you',
 'create',
 'a',
 'report',
 'on',
 'debts',
 'from',
 'the',
 'previous',
 "quarter?',",
 "'How",
 'would',
 'you',
 'contact',
 'clients',
 'who',
 'haven’t',
 'paid',
 'their',
 "bills?',",
 "'If",
 'a',
 'long-term',
 'client',
 'had',
 'a',
 'debt',
 'at',
 'the',
 'end',
 'of',
 'the',
 'fiscal',
 'year,',
 'how',
 'would',
 'you',
 'address',
 'it',
 'with',
 "him/her?',",
 "'How",
 'would',
 'you',
 'make',
 'sure',
 'we',
 'collect',
 'invoices',
 'in',
 'a',
 'timely',
 "manner?',",
 "'How",
 'would',
 'you',
 'handle',
 'an',
 'invoice',
 'discrepancy',
 'with',
 'a',
 "client?',",
 "'Briefly",
 'explain',
 'the',
 'financial',
 'transactions',
 'involved',
 'in',
 'the',
 'billing',
 'process',
 'and',
 'your',
 'experience',
 'with',
 "each.',",
 "'What",
 'accounting',
 'software',
 'have',
 'you',
 "used?',",
 "'How",
 'do',
 'you',
 'keep',
 'updated',
 'records',
 'of',
 'accounts',
 'receivable',
 'from',
 "clients?',",
 "'How",
 'do',
 'yo

[["['A",
  'debtor',
  'has',
  'missed',
  'six',
  'months',
  'of',
  'payments.',
  'When',
  'you',
  'call',
  'the',
  'individual,',
  'they',
  'bring',
  'up',
  'financial',
  'difficulties',
  'and',
  'get',
  'emotional.',
  'How',
  'do',
  'you',
  'handle',
  "this?',",
  "'What",
  'payback',
  'plan',
  'would',
  'you',
  'recommend',
  'for',
  'a',
  'long-term',
  'client',
  'whose',
  'payments',
  'are',
  'due',
  'to',
  'be',
  'collected',
  'in',
  'X',
  "months?',",
  "'If",
  'you',
  'called',
  'a',
  'client',
  'to',
  'update',
  'them',
  'on',
  'their',
  'payment',
  'status',
  'and',
  'they',
  'were',
  'aggressive',
  'or',
  'rude',
  'to',
  'you,',
  'how',
  'would',
  'you',
  'handle',
  "it?',",
  "'Can",
  'you',
  'think',
  'of',
  'a',
  'few',
  'common',
  'excuses',
  'that',
  'debtors',
  'make?',
  'How',
  'would',
  'you',
  'respond',
  'to',
  'each',
  'of',
  "them?',",
  "'What",
  'collection',
  'methods',
  'do'

In [62]:
my_test=df_test["questions"]
print(my_test)

0      ['A debtor has missed six months of payments. ...
1      ['How would you create a report on debts from ...
2      ['What would you do if you received a B-Notice...
3      ['What would be the first thing you’d check if...
4      ['If you could use only Excel, how would you o...
                             ...                        
278    ['Imagine a customer asks you a question about...
279    ['If I asked you to eliminate one line of our ...
280    ['Here’s a spreadsheet with data. Can you spot...
281    ['As a merchandiser, what do you think is the ...
282    ['Who are our main competitors?', 'What tasks,...
Name: questions, Length: 283, dtype: object


In [63]:
tf_idf_weighting = TfidfVectorizer()
tf_idf_test = tf_idf_weighting.fit_transform(my_test)
tf_idf_dt_matrix = pd.DataFrame(tf_idf_test.A, columns=tf_idf_weighting.get_feature_names())
tf_idf_dt_matrix



Unnamed: 0,00,000,10,100,1096,1099,12,14,14001,15,...,yes,yet,yii,you,young,younger,your,yourself,zoho,zone
0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.399921,0.0,0.0,0.050953,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.319325,0.0,0.0,0.048434,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.122246,0.122246,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.369459,0.0,0.0,0.108962,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.419516,0.0,0.0,0.121476,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.086231,0.000000,0.0,0.372815,0.0,0.0,0.018849,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.096141,0.0,0.529787,0.0,0.0,0.000000,0.0,0.0,0.0
279,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.327401,0.0,0.0,0.109772,0.0,0.0,0.0
280,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.249732,0.0,0.0,0.033143,0.0,0.0,0.0
281,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.269223,0.0,0.0,0.153914,0.0,0.0,0.0


In [64]:
document = 2
print("25 most important terms for role", df_test.iloc[document]['title'])
print(tf_idf_dt_matrix.iloc[:, np.argsort(tf_idf_dt_matrix.loc[document])[::-1]].iloc[document][:25])

print("25 least important terms for role", df_test.iloc[document]['title'])
print(tf_idf_dt_matrix.iloc[:, np.argsort(tf_idf_dt_matrix.loc[document])[::-1]].iloc[document][-25:])

25 most important terms for role Tax Accountant
you            0.369459
tax            0.281376
notice         0.206878
regulatory     0.193066
what           0.164204
accounting     0.146057
do             0.124023
create         0.123621
how            0.123588
1099           0.122246
1096           0.122246
withholding    0.122246
actually       0.122246
submission     0.122246
attracted      0.122246
forensic       0.122246
reports        0.120501
versus         0.113924
fields         0.113924
irs            0.113924
aware          0.113924
backup         0.113924
returns        0.113924
your           0.108962
fine           0.108019
Name: 2, dtype: float64
25 least important terms for role Tax Accountant
portfolio       0.0
portability     0.0
populations     0.0
popular         0.0
poorly          0.0
poor            0.0
polymorphism    0.0
polling         0.0
policy          0.0
policies        0.0
police          0.0
points          0.0
pointless       0.0
pointers        0.0

In [65]:
tf_idf_dt_matrix[["accounting","and","payments","job"]]

Unnamed: 0,accounting,and,payments,job
0,0.000000,0.087664,0.233100,0.035107
1,0.081152,0.049998,0.000000,0.000000
2,0.146057,0.022496,0.000000,0.000000
3,0.135692,0.062699,0.185244,0.000000
4,0.189493,0.116746,0.000000,0.000000
...,...,...,...,...
278,0.000000,0.060068,0.000000,0.000000
279,0.000000,0.018886,0.000000,0.000000
280,0.000000,0.068427,0.000000,0.000000
281,0.000000,0.045396,0.000000,0.000000


In [66]:
word_value_dict={}
test=[]
n=0
for keyword in df["keyword"][0]:
    if keyword in tf_idf_dt_matrix.columns:
        test.append(tf_idf_dt_matrix[keyword][n])
        #test.append(keyword)
print(test)

[0.08766390783250323, 0.23310040239034044]


In [67]:
word_score_list=[]

for title in df["questions"]:
    role_list=[]
    for question in title:
        new_dict={}
        role_list.append(new_dict)
        for role in df["keyword"]:
            for keyword in role:
                if keyword in tf_idf_dt_matrix.columns and (keyword in question):
                    new_dict[keyword]=0
    word_score_list.append(role_list)


i=0
for role_list in word_score_list:  
    for quest_dict in role_list:
        for word in quest_dict.keys():
            quest_dict[word]=tf_idf_dt_matrix[word][i]
    i+=1  
    
print(word_score_list[0])
print(len(word_score_list)) 

[{'and': 0.08766390783250323, 'payments': 0.23310040239034044, 'financial': 0.05487076842823954}, {'payments': 0.23310040239034044}, {'and': 0.08766390783250323}, {}, {}, {}, {'and': 0.08766390783250323}, {}, {'payments': 0.23310040239034044}, {}, {'job': 0.03510711711626515}, {'and': 0.08766390783250323}, {'and': 0.08766390783250323}, {'and': 0.08766390783250323}, {}, {'and': 0.08766390783250323}]
283


In [68]:
list_of_points=[]
for role in word_score_list:
    role_list=[]
    for word_dict in role:
        sum=0
        for word in word_dict.keys():
            sum+=word_dict[word]
        role_list.append(sum)
    
    list_of_points.append(role_list)


list_of_points[0]


[0.37563507865108325,
 0.23310040239034044,
 0.08766390783250323,
 0,
 0,
 0,
 0.08766390783250323,
 0,
 0.23310040239034044,
 0,
 0.03510711711626515,
 0.08766390783250323,
 0.08766390783250323,
 0.08766390783250323,
 0,
 0.08766390783250323]

In [69]:
question_score_list=[]
for title in df["questions"]:
    quest_dict={}
    for question in title:
        quest_dict[question]=0

    question_score_list.append(quest_dict)


print(question_score_list[0])
print(len(question_score_list))
        

{"['A debtor has missed six months of payments. When you call the individual, they bring up financial difficulties and get emotional. How do you handle this": 0, 'hat payback plan would you recommend for a long-term client whose payments are due to be collected in X months': 0, 'f you called a client to update them on their payment status and they were aggressive or rude to you, how would you handle it': 0, 'an you think of a few common excuses that debtors make? How would you respond to each of them': 0, 'hat collection methods do you know? Which technique do you think would be more effective for our company/clients': 0, 'ow do you keep historical data for each account': 0, 'ow often do you follow up with delinquent account holders and what’s your approach when you get in touch': 0, 'ention three things a Collection Specialist should avoid saying when speaking to a debtor': 0, 'ow do you keep track of all payments': 0, 'hat software do you use to increase your productivity': 0, 'hat d

In [70]:
question_score_list=[]
t=0
for title in df["questions"]:
    quest_dict={}
    q=0
    for question in title:
        score=list_of_points[t][q]
        quest_dict[question]=score
        q+=1
    t+=1

    question_score_list.append(quest_dict)


print(question_score_list[0])
print(len(question_score_list))

{"['A debtor has missed six months of payments. When you call the individual, they bring up financial difficulties and get emotional. How do you handle this": 0.37563507865108325, 'hat payback plan would you recommend for a long-term client whose payments are due to be collected in X months': 0.23310040239034044, 'f you called a client to update them on their payment status and they were aggressive or rude to you, how would you handle it': 0.08766390783250323, 'an you think of a few common excuses that debtors make? How would you respond to each of them': 0, 'hat collection methods do you know? Which technique do you think would be more effective for our company/clients': 0, 'ow do you keep historical data for each account': 0, 'ow often do you follow up with delinquent account holders and what’s your approach when you get in touch': 0.08766390783250323, 'ention three things a Collection Specialist should avoid saying when speaking to a debtor': 0, 'ow do you keep track of all payments

In [71]:
 df["scored_questions"]=question_score_list
df.head()

Unnamed: 0,title,job description,questions,number_of_questions,keyword,keywords_per_question,scored_questions
0,Collection Specialist,<h4>Job brief</h4><p>We are looking for a comp...,[['A debtor has missed six months of payments....,16,"[and, Collection, payments]",{'['A debtor has missed six months of payments...,{'['A debtor has missed six months of payments...
1,Billing Analyst,<h4>Job brief</h4><p>We are looking for a Bill...,[['How would you create a report on debts from...,14,"[and, job, accounting]",{'['How would you create a report on debts fro...,{'['How would you create a report on debts fro...
2,Tax Accountant,<h4>Job brief</h4><p>We are looking for a Tax ...,[['What would you do if you received a B-Notic...,14,"[accounting, job, and, payments, financial]",{'['What would you do if you received a B-Noti...,{'['What would you do if you received a B-Noti...
3,Accounts Receivable Manager,<h4>Job brief</h4><p>We are looking for an Acc...,[['What would be the first thing you’d check i...,17,"[accounting, job, and, payments, financial]",{'['What would be the first thing you’d check ...,{'['What would be the first thing you’d check ...
4,Cost Accountant,<h4>Job brief</h4><p>We are looking for a Cost...,"[['If you could use only Excel, how would you ...",15,"[and, accounting]","{'['If you could use only Excel, how would you...","{'['If you could use only Excel, how would you..."


In [72]:
 first_question=df["scored_questions"][0]

In [73]:
{k: v for k, v in sorted( first_question.items(), key=lambda item: item[1],reverse=True)}

{"['A debtor has missed six months of payments. When you call the individual, they bring up financial difficulties and get emotional. How do you handle this": 0.37563507865108325,
 'hat payback plan would you recommend for a long-term client whose payments are due to be collected in X months': 0.23310040239034044,
 'ow do you keep track of all payments': 0.23310040239034044,
 'f you called a client to update them on their payment status and they were aggressive or rude to you, how would you handle it': 0.08766390783250323,
 'ow often do you follow up with delinquent account holders and what’s your approach when you get in touch': 0.08766390783250323,
 'escribe a time you had to deal with a difficult client. How did you handle the situation': 0.08766390783250323,
 'escribe a time you persuaded a client to pay off a large debt. How did you handle it, from beginning to end': 0.08766390783250323,
 'ave you ever faced an ethical dilemma at work? If so, what was the situation and what did yo