In [16]:
import spacy
from spacy.vectors import Vectors
import os
import pandas as pd
import numpy as np
nlp = spacy.load('en_core_web_sm')
vectors = Vectors(shape=(10000, 300))
nlp.vocab.vectors = vectors
print(nlp.vocab.vectors.shape)
import re
re_c = re.compile(r'\w+')
import warnings
warnings.filterwarnings('ignore')

(10000, 300)


In [2]:
# switch for debug
flag_print = True

# switch to clear existing data
flag_clear = True

#threshold value for determining section
threshold = 0.5

In [3]:
similar_to = {
    'edu' : ['education', 'study', 'academics', 'institute', 'school', 'college'],
    'exp' : ['job', 'internship', 'training', 'research', 'career', 'profession', 'role'
             'project', 'responsibility', 'description', 'work experience', 'workshop', 'conference'],
    'skill' : ['skill', 'languages', 'technology', 'framework', 'tools', 'database'],
    'extra' : ['introduction', 'intro', 'achievement', 'hobby', 'links', 'additional', 
               'personal', 'award', 'objective', 'miscellaneous', 'interest']
}

list_of_sections = similar_to.keys()

# to bring similar_words to their normal forms
for section in list_of_sections:
    new_list = []
    
    for word in similar_to[section]:
        docx = nlp(word)
        new_list.append(docx[0].lemma_)
        
    if flag_print:
        print(section, new_list)
        
    similar_to[section] = new_list

edu ['education', 'study', 'academics', 'institute', 'school', 'college']
exp ['job', 'internship', 'training', 'research', 'career', 'profession', 'roleproject', 'responsibility', 'description', 'work', 'workshop', 'conference']
skill ['skill', 'language', 'technology', 'framework', 'tool', 'database']
extra ['introduction', 'intro', 'achievement', 'hobby', 'link', 'additional', 'personal', 'award', 'objective', 'miscellaneous', 'interest']


In [4]:
def modify(word):
    try:
        symbols = '''~'`!@#$%^&*)(_+-=}{][|\:;",./<>?'''
        mod_word = ''
        
        for char in word:
            if (char not in symbols):
                mod_word += char.lower()

        docx = nlp(mod_word)

        if (len(mod_word) == 0 or docx[0].is_stop):
            return None
        else:
            return docx[0].lemma_
    except:
        return None # to handle the odd case of characters like 'x02', etc.
    
if flag_print:
    test_words = ['Hello!!', '.,<>', 'India', 'of', '..freedoM..', 'e-mail']
    
    for word in test_words:
        print(word, '--returned-->', modify(word))

Hello!! --returned--> hello
.,<> --returned--> None
India --returned--> india
of --returned--> None
..freedoM.. --returned--> freedom
e-mail --returned--> email


In [7]:
def is_empty(line):
    for c in line:
        if (c.isalpha()):
            return False
    return True
      
if flag_print:
    test_words = ['.', '<.>', 'Speak', 'out', '"Eric"', 'freemail...']
    


In [None]:
dict_of_data_series = {}
flag_print = False
path_cv = "/home/ubuntu/Documents/python works/unicept/b/Resume_Filtering-develop/Data/CVs/"
for file_name in os.listdir(path_cv):
    if flag_print:
        print('\n')
        print('*'*25) 
        print(file_name) 
        print('*'*25) 
        
    main_file_handler = open(path_cv+file_name, 'r', encoding='latin-1')  
    previous_section  = 'extra'
    
    curr_data_series = pd.Series([""]*len(list_of_sections), index=list_of_sections)
                   
    for line in main_file_handler:
        # skip line if empty
        if (len(line.strip()) == 0 or is_empty(line)):
            continue
                
        # processing next line
        list_of_words_in_line = re_c.findall(line)
        list_of_imp_words_in_line  = []
        
        for i in range(len(list_of_words_in_line)):
            modified_word = modify(list_of_words_in_line[i])
            
            if (modified_word):
                list_of_imp_words_in_line.append(modified_word)

        curr_line = ' '.join(list_of_imp_words_in_line)
        doc = nlp(curr_line)
        section_value = {}
            
       
        for section in list_of_sections:
            section_value[section] = 0.0
        section_value[None] = 0.0
            
         
        for token in doc:
            for section in list_of_sections:
                for word in similar_to[section]:
                    word_token = doc.vocab[word]
                    section_value[section] = max(section_value[section], float(word_token.similarity(token)))

        
        most_likely_section = None
        for section in list_of_sections:
            
            if (section_value[most_likely_section] < section_value[section] and section_value[section] > threshold):
                most_likely_section = section
            
 
        if (previous_section != most_likely_section and most_likely_section is not None):
            previous_section = most_likely_section
                

       
        try:
            docx = nlp(line)
        except:
            continue  # to handle the odd case of characters like 'x02', etc.
        mod_line = ''
        for token in docx:
            if (not token.is_stop):
                mod_line += token.lemma_ + ' '
        
        curr_data_series[previous_section] += mod_line
            
    dict_of_data_series[file_name] = curr_data_series
    if flag_print:
        print(curr_data_series)
    main_file_handler.close()
     
data_frame = pd.DataFrame(dict_of_data_series)
data_frame.to_csv('prc_data.csv', sep='\t')


In [None]:
data_frame.head()