In [1]:
import numpy
import pandas
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
import random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas


In [2]:
random.seed(42)

In [3]:
from preprocessing_NLP import pipeline

resumes_df = pipeline('UpdatedResumeDataSet_T1_7.csv', feature_name='Resume')
resumes_df

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Category,Resume
0,Data Science,qwtnrvduof education detail may 2013 may 2017 ...
1,Data Science,qwtnrvduof area interest deep learn control sy...
2,Data Science,skill r python sap hana tableau sap hana sql s...
3,Data Science,education detail mca ymcaust faridabad haryana...
4,Data Science,skill c basic iot python matlab data science m...
...,...,...
873,Testing,skill set o window xp 7 8 8bntgbqlmkk1 10 data...
874,Testing,good logical analytical skill positive attitud...
878,Testing,personal skill quick learner eagerness learn n...
1540,DevOps Engineer,core skill project program management agile sc...


In [4]:
resumes_df = resumes_df.reset_index(drop=True)
test_df = resumes_df.sample(32, random_state=42)
train_df = resumes_df.drop(test_df.index)

test_df = test_df.reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

Fit Vectorizer using train_df

In [5]:
resumes = train_df.Resume.to_numpy()
resumes

array(['qwtnrvduof education detail may 2013 may 2017 bbntgbqlmkke uit rgpv data scientist data scientist matelabs skill detail python exprience le 1 year month statsmodels exprience 12 month aws exprience less 1 year month machine learn exprience le 1 year month sklearn exprience le 1 year month scipy exprience le 1 year month keras exprience less 1 year monthscompany detail company matelabs description ml platform business professional dummy enthusiastsckekjofvwq 60 koramangala 5th block achievement task behind sukh sagar bengaluru india develop deployed auto preprocessing step machine learn mainly miss value treatment outlier detection encode scaling feature selection dimensionality reductionqunsobcudt deploy automated classification regression modelrynoolxhuv b4600b146 reasearch deploy time series forecast model arima sarimax holt winter prophetiqmadshiyn work meta feature extract problemiszogerzlf implement state art research paper outlier detection mixed attribute company matelab

In [6]:
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

In [7]:
count_vectorizer.fit(resumes)
tfidf_vectorizer.fit(resumes)

## Resume Search

In [8]:
def retrieval(resume: str, database: pandas.DataFrame, vectorizer, verbose=False, word2vec=False) -> str:
    if not word2vec:
        test_vector = vectorizer.transform([resume])
    else:
        import gensim.downloader
        word2vec_model = gensim.downloader.load('word2vec-google-news-300')

        def vectorize_documents(documents, word2vec_model):
            document_vectors = []
        
            for document in documents:
                tokens = document.lower().split()
                vectors = []
        
                for token in tokens:
                    if token in word2vec_model:
                        vectors.append(word2vec_model[token])
        
                if vectors:
                    document_vectors.append(sum(vectors) / len(vectors))
                else:
                    # Handle the case where all words are out-of-vocabulary
                    document_vectors.append([0] * 300)
            return document_vectors
        
        test_vector = vectorize_documents([resume], word2vec_model)[0]

    highest_similarity = 0
    most_similar = None
    for to_test in database.Resume.values:
        
        if not word2vec:
            similarity = cosine_similarity(test_vector, vectorizer.transform([to_test]))
        else:
            similarity = cosine_similarity([test_vector], 
                                           [vectorize_documents([to_test], word2vec_model)[0]])
            
        if verbose:
            print(f'Similarity between input query to document "{to_test[0:100]}..." = {similarity[0][0]}')
        if similarity[0][0] > highest_similarity:
            highest_similarity = similarity
            most_similar = to_test
    
    if verbose:
        print("***END***\n\n")
    return most_similar

### Test within internal database

In [9]:
resumes[12] #Quick Test Example

'education detail june 2012 may 2015 bbntgbqlmkka economics chennai tamil nadu sdnbvc hr skill detail company detail company anything solution description hr'

In [50]:
query = '''
june 2012 may 2015
'''

In [52]:
result = retrieval(query, resumes_df, tfidf_vectorizer, verbose=True)
print(result)

Similarity between input query to document "qwtnrvduof education detail may 2013 may 2017 bbntgbqlmkke uit rgpv data scientist data scientist ma..." = 0.057711712630515125
Similarity between input query to document "qwtnrvduof area interest deep learn control system design program python electric machinery web deve..." = 0.021168073062769178
Similarity between input query to document "skill r python sap hana tableau sap hana sql sap hana pal m sql sap lumira c linear program data mod..." = 0.0
Similarity between input query to document "education detail mca ymcaust faridabad haryana data science internship skill detail data structure e..." = 0.0
Similarity between input query to document "skill c basic iot python matlab data science machine learn html microsoft word microsoft excel micro..." = 0.16300143713293566
Similarity between input query to document "skill python tableau data visualization r studio machine learn statistic iabac certify data scientis..." = 0.0
Similarity between i

In [12]:
%timeit retrieval(query, resumes_df, count_vectorizer)

78.4 ms ± 351 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
%timeit retrieval(query, resumes_df, tfidf_vectorizer)

107 ms ± 554 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
%timeit retrieval(query, resumes_df, tfidf_vectorizer, word2vec=True)

21.2 s ± 49.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Test within external database

In [15]:
test_resumes = test_df.Resume
test_resumes

0     skill set o window xp 7 8 8bntgbqlmkk1 10 data...
1     skill set talend big data informatica power ce...
2     train special education certificate course edu...
3     education detail bba lovely professional unive...
4     computer knowledge drafting tool autocadbntgbq...
5     education detail july 2016 may 2019 electrical...
6     education detail bbntgbqlmkkcckekjofvwqa bache...
7     technicalskills springmvc hibernate jdbc java ...
8     education detail mba acn college engineering m...
9     software skill rdbms m sql server 2000 2005 20...
10    education detail january 2009 pbntgbqlmkkgckek...
11    skill visa b1 visa usa onsite visit sweden u s...
12    technical skill web technology angular j html5...
13    technical skill program language c c java bntg...
14    education detail bachelor bachelor commerce in...
15    skill 1bntgbqlmkkautocad 2ckekjofvwqpro v 3qun...
16    education detail bca vinayaka mission universi...
17    skill set hadoop map reduce hdfs hive sqoo

In [16]:
test_labels = test_df.Category
test_labels

0                    Testing
1              ETL Developer
2                         HR
3                         HR
4             Civil Engineer
5     Electrical Engineering
6              Web Designing
7             Java Developer
8                         HR
9                   Database
10        Health and fitness
11           DevOps Engineer
12             Web Designing
13                    Hadoop
14                     Sales
15            Civil Engineer
16        Operations Manager
17                    Hadoop
18                  Advocate
19          Python Developer
20            Civil Engineer
21                Blockchain
22                       PMO
23                        HR
24                     Sales
25                   Testing
26              Data Science
27          DotNet Developer
28                  Advocate
29             SAP Developer
30          DotNet Developer
31              Data Science
Name: Category, dtype: object

In [128]:
def string_subset(string_value: str):
    max_len = len(string_value)
    
    subset_length = random.randint(1, max_len // 3)
    
    start = random.randint(1, max_len - subset_length)
    end = start + subset_length
    
    return string_value[start:end]

In [134]:
test_queries = numpy.array([string_subset(query) for query in test_resumes])
test_queries = pandas.DataFrame(test_queries, columns=['Resume'])
test_queries

Unnamed: 0,Resume
0,month database test exprience 6 monthscompany...
1,stqunsobcudt francis talend etl developer tale...
2,lappuram kerala calicut university july 2013 m...
3,exprience
4,d muweilah commercial near alfalah round sharj...
5,ail july 2016 may 2019 elec
6,erapy project iii vitsanindia system technolog...
7,uter engineering nashik maharashtra late giszo...
8,mgt hr skill detail company
9,name barclays technology centre indiackekjofv...


In [135]:
num = 2
test_resumes[2]

'train special education certificate course education detail july 2016 october 2018 mbntgbqlmkksc psychology specialization organizational behaviour malappuram kerala calicut university july 2013 march 2016 bsc psychology thrissur prajyoti niketan college hr skill detail company detail company description do 30 day internship hr department foster hot bread kinfra malappuram kerala also do 60 day internship santhwana institute counsel psychotherapy cochin kerala counsellor'

In [136]:
queryTest = '''
department foster hot bread kinfra mal
'''

In [137]:
result = retrieval(queryTest, test_df, tfidf_vectorizer)
print(result)

train special education certificate course education detail july 2016 october 2018 mbntgbqlmkksc psychology specialization organizational behaviour malappuram kerala calicut university july 2013 march 2016 bsc psychology thrissur prajyoti niketan college hr skill detail company detail company description do 30 day internship hr department foster hot bread kinfra malappuram kerala also do 60 day internship santhwana institute counsel psychotherapy cochin kerala counsellor


#### Evaluation

Count Vectorizer

In [145]:
y_pred = test_queries['Resume'].apply(lambda x: retrieval(x, test_df, count_vectorizer))
y_pred = y_pred.fillna('wrong')
y_pred

0     skill set o window xp 7 8 8bntgbqlmkk1 10 data...
1     skill set talend big data informatica power ce...
2     train special education certificate course edu...
3     technical skill web technology asp bntgbqlmkkn...
4     computer knowledge drafting tool autocadbntgbq...
5     education detail july 2016 may 2019 electrical...
6     education detail bbntgbqlmkkcckekjofvwqa bache...
7     technicalskills springmvc hibernate jdbc java ...
8     education detail mba acn college engineering m...
9     software skill rdbms m sql server 2000 2005 20...
10    education detail january 2009 pbntgbqlmkkgckek...
11                                                wrong
12    technical skill web technology angular j html5...
13    technical skill program language c c java bntg...
14    education detail bachelor bachelor commerce in...
15    skill 1bntgbqlmkkautocad 2ckekjofvwqpro v 3qun...
16    education detail bca vinayaka mission universi...
17    skill set hadoop map reduce hdfs hive sqoo

In [146]:
accuracy_score(test_resumes, y_pred)

0.90625

Tf-Idf Vectorizer

In [147]:
y_pred = test_queries['Resume'].apply(lambda x: retrieval(x, test_df, tfidf_vectorizer))
y_pred = y_pred.fillna('wrong')
y_pred

0     skill set o window xp 7 8 8bntgbqlmkk1 10 data...
1     skill set talend big data informatica power ce...
2     train special education certificate course edu...
3     technical skill web technology asp bntgbqlmkkn...
4     computer knowledge drafting tool autocadbntgbq...
5     education detail july 2016 may 2019 electrical...
6     education detail bbntgbqlmkkcckekjofvwqa bache...
7     technicalskills springmvc hibernate jdbc java ...
8     education detail mba acn college engineering m...
9     software skill rdbms m sql server 2000 2005 20...
10    education detail january 2009 pbntgbqlmkkgckek...
11                                                wrong
12    technical skill web technology angular j html5...
13    technical skill program language c c java bntg...
14    education detail bachelor bachelor commerce in...
15    skill 1bntgbqlmkkautocad 2ckekjofvwqpro v 3qun...
16    education detail bca vinayaka mission universi...
17    skill set hadoop map reduce hdfs hive sqoo

In [148]:
accuracy_score(test_resumes, y_pred)

0.90625

Word2Vec Vectorizer

In [149]:
y_pred = test_queries['Resume'].apply(lambda x: retrieval(x, test_df, count_vectorizer, word2vec=True))
y_pred = y_pred.fillna('wrong')
y_pred

0     skill python tableau data visualization r stud...
1     education detail bbntgbqlmkkcckekjofvwqa bache...
2     train special education certificate course edu...
3     education detail sap technical architect sap t...
4     computer knowledge drafting tool autocadbntgbq...
5     technicalskills springmvc hibernate jdbc java ...
6     education detail bbntgbqlmkkcckekjofvwqa bache...
7     technicalskills springmvc hibernate jdbc java ...
8     education detail mba acn college engineering m...
9     skill strong c fundamental problem solve ether...
10    education detail january 2009 pbntgbqlmkkgckek...
11                                                wrong
12    education detail bca vinayaka mission universi...
13    technical skill program language c c java bntg...
14    education detail bachelor bachelor commerce in...
15    skill 1bntgbqlmkkautocad 2ckekjofvwqpro v 3qun...
16    education detail bca vinayaka mission universi...
17    education detail bbntgbqlmkkcckekjofvwqa b

In [150]:
accuracy_score(test_resumes, y_pred)

0.65625

#### Test on own resume

In [28]:
with open('personalResume.txt', 'r', encoding='utf-8') as f:
    own_resume = f.read()
    
own_resume

"Name: Chong Jia Shuo\nEmail: 222978d@mymail.nyp.edu.sg\nHandphone: 9777 1234\nLinkedIn: https://www.linkedin.com/in/js-chong-30766b273\nNationality: Singaporean\nLanguages Known: English and Chinese\nAbout Me\nAs an individual who has developed a strong appreciation and passion for Artificial Intelligence (AI) during my study at NYP, I have actively engaged in independent, self-directed learning. I have taught myself various programming languages and fundamental theories of AI (Probability & Calculus, etc.). Currently, I am looking to add value to an organisation by using my knowledge and expertise to develop various AI models and algorithms, as well as share these outcomes with multiple stakeholders.\nEducation\n•\t‘O’ Level Graduate, Chung Cheng High School (Yishun)\no\tELR2B2 Nett 7\n•\tDiploma in AI and Data Engineering, Nanyang Polytechnic, Singapore, Year 2\no\tGPA 4.0\no\tRelevant Distinctions in:\n\uf0a7\tProgramming\n\uf0a7\tData Preparation & Visualization\n\uf0a7\tData Mana

In [29]:
own_resume_subset = own_resume[10: 200]
own_resume_subset

'g Jia Shuo\nEmail: 222978d@mymail.nyp.edu.sg\nHandphone: 9777 1234\nLinkedIn: https://www.linkedin.com/in/js-chong-30766b273\nNationality: Singaporean\nLanguages Known: English and Chinese\nAbout '

In [30]:
new_database = pandas.concat([test_df, pandas.Series([own_resume], name='Resume')])
print(new_database.tail(1).values)

[[nan
  "Name: Chong Jia Shuo\nEmail: 222978d@mymail.nyp.edu.sg\nHandphone: 9777 1234\nLinkedIn: https://www.linkedin.com/in/js-chong-30766b273\nNationality: Singaporean\nLanguages Known: English and Chinese\nAbout Me\nAs an individual who has developed a strong appreciation and passion for Artificial Intelligence (AI) during my study at NYP, I have actively engaged in independent, self-directed learning. I have taught myself various programming languages and fundamental theories of AI (Probability & Calculus, etc.). Currently, I am looking to add value to an organisation by using my knowledge and expertise to develop various AI models and algorithms, as well as share these outcomes with multiple stakeholders.\nEducation\n•\t‘O’ Level Graduate, Chung Cheng High School (Yishun)\no\tELR2B2 Nett 7\n•\tDiploma in AI and Data Engineering, Nanyang Polytechnic, Singapore, Year 2\no\tGPA 4.0\no\tRelevant Distinctions in:\n\uf0a7\tProgramming\n\uf0a7\tData Preparation & Visualization\n\uf0a7\tD

In [31]:
new_database.tail(3)

Unnamed: 0,Category,Resume
30,DotNet Developer,participate intra college cricket competition ...
31,Data Science,skill program language python panda numpy scip...
0,,Name: Chong Jia Shuo\nEmail: 222978d@mymail.ny...


In [32]:
result = retrieval(own_resume_subset, new_database, tfidf_vectorizer)
print(result)

Name: Chong Jia Shuo
Email: 222978d@mymail.nyp.edu.sg
Handphone: 9777 1234
LinkedIn: https://www.linkedin.com/in/js-chong-30766b273
Nationality: Singaporean
Languages Known: English and Chinese
About Me
As an individual who has developed a strong appreciation and passion for Artificial Intelligence (AI) during my study at NYP, I have actively engaged in independent, self-directed learning. I have taught myself various programming languages and fundamental theories of AI (Probability & Calculus, etc.). Currently, I am looking to add value to an organisation by using my knowledge and expertise to develop various AI models and algorithms, as well as share these outcomes with multiple stakeholders.
Education
•	‘O’ Level Graduate, Chung Cheng High School (Yishun)
o	ELR2B2 Nett 7
•	Diploma in AI and Data Engineering, Nanyang Polytechnic, Singapore, Year 2
o	GPA 4.0
o	Relevant Distinctions in:
	Programming
	Data Preparation & Visualization
	Data Management
	Machine Learning
	Cloud Comput

In [33]:
query = '''
statistical analysis
'''

In [34]:
result = retrieval(query, new_database, tfidf_vectorizer)
print(result)

Name: Chong Jia Shuo
Email: 222978d@mymail.nyp.edu.sg
Handphone: 9777 1234
LinkedIn: https://www.linkedin.com/in/js-chong-30766b273
Nationality: Singaporean
Languages Known: English and Chinese
About Me
As an individual who has developed a strong appreciation and passion for Artificial Intelligence (AI) during my study at NYP, I have actively engaged in independent, self-directed learning. I have taught myself various programming languages and fundamental theories of AI (Probability & Calculus, etc.). Currently, I am looking to add value to an organisation by using my knowledge and expertise to develop various AI models and algorithms, as well as share these outcomes with multiple stakeholders.
Education
•	‘O’ Level Graduate, Chung Cheng High School (Yishun)
o	ELR2B2 Nett 7
•	Diploma in AI and Data Engineering, Nanyang Polytechnic, Singapore, Year 2
o	GPA 4.0
o	Relevant Distinctions in:
	Programming
	Data Preparation & Visualization
	Data Management
	Machine Learning
	Cloud Comput

## Resume Classification


In [35]:
def classification(resume: str, database: pandas.DataFrame, vectorizer):
    test_vector = vectorizer.transform([resume])
    
    highest_similarity = 0
    highest_label = None
    for to_test, label in zip(database.Resume.values, database.Category.values):
        similarity = cosine_similarity(test_vector, vectorizer.transform([to_test]))
        if similarity > highest_similarity:
            highest_similarity = similarity
            highest_label = label
    
    return highest_label

In [36]:
num = 2
classification(test_resumes[num], train_df, tfidf_vectorizer)
print(test_labels[num])

HR


#### Evaluation

In [37]:
y_pred = test_resumes.apply(lambda x: classification(x, train_df, tfidf_vectorizer))
y_pred

#Compare new values with initial database train_df to find most similar label

0         Automation Testing
1              ETL Developer
2                         HR
3                         HR
4             Civil Engineer
5     Electrical Engineering
6             Java Developer
7             Java Developer
8                         HR
9                   Database
10        Health and fitness
11           DevOps Engineer
12             Web Designing
13                    Hadoop
14              Data Science
15            Civil Engineer
16        Operations Manager
17                    Hadoop
18                  Advocate
19          Python Developer
20            Civil Engineer
21                Blockchain
22                       PMO
23                        HR
24                     Sales
25                   Testing
26              Data Science
27          DotNet Developer
28              Data Science
29              Data Science
30          DotNet Developer
31              Data Science
Name: Resume, dtype: object

In [38]:
accuracy_score(test_labels, y_pred)

0.84375

## Resume Summarization

In [39]:
import cohere
co_client = cohere.Client('41ApJD1HzYGiYv7R3uKNNp8W4yK8UFlcyBcwRD68')

In [40]:
initial_context = 'This is a resume provided in a normalized format:\n'
resume_to_summarize = own_resume
task_to_complete = '\nIn summary:'

message = initial_context + resume_to_summarize + task_to_complete

In [41]:
response = co_client.chat(
    message,
    model="command",
    temperature=0
)

In [42]:
print(response.text)

Here is a resume summary:

During his studies at Nanyang Polytechnic, Chong Jia Shuo developed a passion for Artificial Intelligence (AI). He is proficient in Python and well-versed in programming, data management, and machine learning. He also has experience with cloud computing and platform topics. With a Distinction in these subjects under his belt, he is equipped with the necessary skills to develop innovative AI models and algorithms. 

He is an independent learner who is also pursuing hobbies like programming and classical piano to augment his skills. He has also received multiple awards and certificates for excellence in his field, including third place in the Huawei ICT Competition Cloud Track. With his expertise and achievements, he is eager to contribute value to a company and facilitate the sharing of these outcomes with stakeholders. 

Chong is a motivated individual who is dedicated to solving problems and learning new skills. He is confident that his skills and experience

In [43]:
len(response.text)

1046

In [44]:
len(own_resume)

3986