In [11]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import nltk
import spacy
import os
from docx2pdf import convert
#import string
from nltk.stem.snowball import SnowballStemmer
nlp = spacy.load("en_core_web_sm")



def doc2pdf(path):
    
    new_path = re.sub('docx','pdf',path)
    
    convert(path, new_path)
    return new_path
    

def convert_pdf_to_txt(path):
    
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 10
    caching = True
    pagenos=set()
    
    base = os.path.basename(path)
    ext = os.path.splitext(base)[1]
    if ext == '.pdf':
        fp = open(path, 'rb')
        
    elif ext == '.docx':
        path = doc2pdf(path)
        fp = open(path, 'rb')
        
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
    text = re.sub('\n', ' ', text)
        
    fp.close()
    device.close()
    retstr.close()
    
    return text

def remove_StopWords(text):
    
    sentence = nlp(text)
    filtered_sentence=''
    token_list = []
    for token in sentence : 
        token_list.append(token.text)
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False :
            filtered_sentence += ' '+word
    filtered_sentence = re.sub('é','e',filtered_sentence)
    filtered_sentence = re.sub('è','e',filtered_sentence)
    filtered_sentence = re.sub('à','a',filtered_sentence)
    filtered_sentence = re.sub('â','a',filtered_sentence)
    return filtered_sentence

def get_lem(text):
    
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

#def get_stem(text):
    filtered_sentence = ''    
    stemmer = SnowballStemmer(language='french')
    for token in text.split(" "):
        filtered_sentence += ' '+stemmer.stem(token)
    return filtered_sentence

        
def remove_Ent(text):
    l = ['janvier','fevrier','mars','avril','mai','juin','juillet','aout','septembre','octobre','novombre','decembre','mois','an',
         'année','jan','feb','mar','april','june','july','jul','aug','august','september','sep','oct','nov','dec']
    for i in l :
        text = re.sub(i, '', text)
    doc = nlp(text)
    text =''
    ent_type = ['PER','DATE','GPE','LOC']
    for ent in doc:
        if ent.ent_type_ in ent_type:
            text += ''
        else:
            text += ' ' + ent.text
    return text


def get_low(text):
    
    return text.lower()

def remove_SpeChar(text):
    text = re.sub('\W[a-z]\W',' ',text)
    text = re.sub('\W[a-z]\W',' ',text)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text, flags=re.MULTILINE)
    text = re.sub('\S*@\S*\s?', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub(r'[^\w]', ' ', text)
    text = re.sub('[!"#$%&()*+,./:;<=>?@[\]^_`{|}~‘’•]', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = " ".join(text.split())
    return text

def main_PreProc(text):
    
    text = remove_StopWords(text)
    text = remove_Ent(text)
    #text = get_lem(text)
    text = remove_Ent(text)
    #text = get_stem(text)
    text = get_low(text)
    text = remove_SpeChar(text)
    
    return text
    

In [12]:
def lang_selec(lang):
    if lang == 'fr':
        nlp = spacy.load("fr_core_news_sm")
    elif lang == 'en':
        nlp = spacy.load("en_core_web_sm")
    return nlp

        

In [14]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures


def tokenizer(text):
    doc = nlp(text)
    L=[]
    for token in doc : 
        if re.search(' +', token.text)==None:
            L.append(token.text)
    return L


def bigrams_(text, tokens):
    l=[]

    finder = BigramCollocationFinder.from_words(tokens)
    
    a = finder.nbest(BigramAssocMeasures.likelihood_ratio,1000)    
    for i in range(len(a)):
        l.append(a[i][0]+' '+a[i][1])
    return l
    


def trigrams_(text, tokens):
    l=[]
     
    finder = TrigramCollocationFinder.from_words(tokens)

    a = finder.nbest(TrigramAssocMeasures.likelihood_ratio,1000)    
    for i in range(len(a)):
        l.append(a[i][0]+' '+a[i][1]+' '+a[i][2])
    
    return l
    

In [15]:
def path_selector(path):
    """ this function will return a list of all files paths with pdf format"""
    L=[]
    
    if type(path) == str and path.endswith('.pdf') == True:
        base = os.path.basename(path)
        return {'name': get_low(os.path.splitext(base)[0]), 'path': path}
    
    
    elif type(path) == str and path.endswith('.pdf') == False:
        for filename in os.listdir(path):
            if filename.endswith('.pdf'):
                base = os.path.basename(filename)
                L.append({'name': get_low(os.path.splitext(base)[0]), 'path': os.path.join(path, filename)})
    return L

In [16]:
from collections import Counter
def insert_text_into(path):
    List = []
    path_ = path_selector(path)
    if type(path_) == list :  
        
        for i in path_:
            text = convert_pdf_to_txt(i.get('path'))
            preproceced_text = main_PreProc(text)
            tokens = tokenizer(preproceced_text)
            bigrams = bigrams_(preproceced_text, tokens)
            trigrams = trigrams_(preproceced_text, tokens)
            counter = Counter(tokens)
            most_freq = counter.most_common(1) 
            List.append(
                         {
                          'name' : re.sub('[\d+ .pdf !"#$%&()*+,./:;<=>?@[\]^_`{|}~‘’•]','',i.get('name')),
                          'raw_text' : text,
                          'tokens' : tokens,
                          'bigrams': bigrams,
                          'trigrams': trigrams,
                          'most_common_word' : most_freq[0][0],
                          'word_freq' : most_freq[0][1]/len(tokens),
                          }
                        )
                         #'tokenized' : tokenized
                         #'bigram_text': bigrams_(main_(i.get('path'))), 
                         #'tokenized_text': tokenizer(main_(i.get('path')))
        return List
    elif type(path_) == dict : 
        text = convert_pdf_to_txt(path_.get('path'))
        preproceced_text = main_PreProc(text)
        tokens = tokenizer(preproceced_text)
        bigrams = bigrams_(preproceced_text, tokens)
        trigrams = trigrams_(preproceced_text, tokens)
        counter = Counter(tokens)
        most_freq = counter.most_common(1) 
        element = {
                          'name' : re.sub('[\d+ .pdf !"#$%&()*+,./:;<=>?@[\]^_`{|}~‘’• CV cv ]','',path_.get('name')),
                          'raw_text' : text,
                          'tokens' : tokens,
                          'bigrams': bigrams,
                          'trigrams': trigrams,
                          'most_common_word' : most_freq[0][0],
                          'word_freq' : most_freq[0][1]/len(tokens),
                  } 
        
        List.append(element)
        return element
        
   

In [270]:
d = insert_text_into(r'..\stage\Resume')

In [271]:
d

[{'name': 'jonas',
  'raw_text': "   JONAS  Experience    Apr 2017 - Present  (1 year)   Sep 2015 - Mar 2017  (1 year 6 months)   Jul 2014 - Sep 2015  (1 year 2 months)   Jul 2013 - Jul 2014  (1 year)   Asc Degree   Asc Degree   Fund Accountant  Citco Fund Services (Singapore) Pte Ltd. |     Position level    Hedge fund related accounting      Financial Consultant  AXA Life Insurance Singapore Pte Ltd |     Position level    - Assisted clients to plan ahead through various financial planning models  - Provided consistent support for their insurance coverage and investment  portfolios  - Specialized in Life/General Insurance product knowledge  - Focused on needs-based selling  - Always going the extra mile to achieve high customer satisfaction level      Co-Partner & Finance Consultant  E-commerce |     Position level    - Maintained the company’s bookkeeping  - Ensured adherence to Government CPF policies and IRAS guidelines  - Adapted to different business platforms with openness to i

In [76]:
def get_keywords(field,keyword_list):
    
    l=[]
    
    if type(keyword_list) == str and type(field) == str :
        l.append(main_PreProc(keyword_list))
        element = {'field':field,'keyword_list':l}
    
    elif type(keyword_list) == list and type(field) == str:
        for i in keyword_list:
            l.append(main_PreProc(i))
        element =  {'field':field,'keyword_list':l}
    
    elif type(field) != str:
        raise Error('field must be string')
    return element

In [54]:
from fuzzywuzzy import fuzz

def word_sim(a,b):
    l =[fuzz.ratio(a,b),fuzz.partial_ratio(a,b),fuzz.token_sort_ratio(a,b),fuzz.token_set_ratio(a,b)]
    if len(a)==len(b):
        l.remove(fuzz.partial_ratio(a,b)) 
        return max(l)
    else:
        return max(l)

In [155]:
word_sim('data-sciene, machine-learning and deep learning','data')

('data sciene machine learn deep learning', 100)

In [77]:
def check_sim_keywords(keywords_dic, word):
    
    for j in keywords_dic.get('keyword_list'):
        
        if word_sim(main_PreProc(j),main_PreProc(word)) >=80 :
            c = 1
            break
        else:
            c = 0
    if c != 0:        
        return keywords_dic.get('keyword_list')
    else:
        raise('no match found') 

                   

In [71]:
from math import log
def tf_idf_(word,d):
    n=0
    counter=[]
    tot_docs = d.count_documents({})
    tf_idf_list=[]
    freq_list=[]
    for i in range(tot_docs):
        tokens = d.find({})[i].get('tokens')
        name=d.find({})[i].get('name')
        c=0
        n=0
        for j in tokens:
            score = word_sim(word,j)
            if score > 80 :
                c=1
                n+=1
        counter.append(c)
        freq_list.append((n/len(tokens),name))
    app_word_tot = sum(counter)    
    if app_word_tot!=0:
        idf = log(tot_docs/app_word_tot)
        for i in freq_list:
            tf_idf_list.append((i[0]*idf,i[1]))
        return tf_idf_list
    else:
        return 'no match'

In [72]:
import sys
import pymongo
from pymongo import MongoClient
Client = MongoClient("mongodb+srv://misooo123:miso1212@nlpintern.9ghch.mongodb.net/Resume?retryWrites=true&w=majority")

db = Client.NLPIntern

tf_idf_('data',db.Resume)

[(0.0009375779466983603, 'amy'),
 (0.0023004489826207192, 'cv-labia-oumaima'),
 (0.0, 'cv'),
 (0.012272895322281537, 'cvmohameel-eliem'),
 (0.009270058511569545, 'resumehajjajiima')]

In [74]:
tf_idf_(main_PreProc('finance'),db.Resume)

[(0.007699922116589538, 'amy'),
 (0.0, 'cv-labia-oumaima'),
 (0.0, 'cv'),
 (0.0, 'cvmohameel-eliem'),
 (0.008156890788197226, 'resumehajjajiima')]

In [75]:
tf_idf_(main_PreProc('data science and machine learning'),db.Resume)

[(0.0, 'amy'),
 (0.0, 'cv-labia-oumaima'),
 (0.0, 'cv'),
 (0.0, 'cvmohameel-eliem'),
 (0.0, 'resumehajjajiima')]

In [82]:
tf_idf_(main_PreProc('mathematics, probabilty and statistics'),db.Resume)

[(0.0, 'amy'),
 (0.028964339491886072, 'cv-labia-oumaima'),
 (0.0, 'cv'),
 (0.01787889683180968, 'cvmohameel-eliem'),
 (0.010610621265168946, 'resumehajjajiima')]

In [78]:
data = get_keywords('data',['machine-learning','data-science','data-engineering','mathematics','data','SQL',
                         'NOSQL','deep-learning','statistic descriptive','github','kaggle','linear regression',
                         'artifical intellgience','monte-carlo','AI','data-wrangling','data-vizualisation'])
data

{'field': 'data',
 'keyword_list': ['machine learning',
  'data science',
  'data engineering',
  'mathematics',
  'data',
  'sql',
  'nosql',
  'deep learning',
  'statistic descriptive',
  'github',
  'kaggle',
  'linear regression',
  'artifical intellgience',
  'monte carlo',
  'ai',
  'data wrgling',
  'data vizualisation']}

In [79]:
word_to_check = check_sim_keywords(data,'machine learning engineer')

In [81]:
for i in word_to_check:
    print(tf_idf_(i,db.Resume),i)

[(0.0, 'amy'), (0.0, 'cv-labia-oumaima'), (0.0, 'cv'), (0.0, 'cvmohameel-eliem'), (0.0, 'resumehajjajiima')] machine learning
[(0.0, 'amy'), (0.0023004489826207192, 'cv-labia-oumaima'), (0.010012851661535054, 'cv'), (0.015620048591994685, 'cvmohameel-eliem'), (0.011918646657732271, 'resumehajjajiima')] data science
[(0.0, 'amy'), (0.0046008979652414385, 'cv-labia-oumaima'), (0.011443259041754347, 'cv'), (0.013388613078852585, 'cvmohameel-eliem'), (0.011918646657732271, 'resumehajjajiima')] data engineering
[(0.0, 'amy'), (0.0052662435439792855, 'cv-labia-oumaima'), (0.0, 'cv'), (0.007662384356489861, 'cvmohameel-eliem'), (0.0030316060757625564, 'resumehajjajiima')] mathematics
[(0.0009375779466983603, 'amy'), (0.0023004489826207192, 'cv-labia-oumaima'), (0.0, 'cv'), (0.012272895322281537, 'cvmohameel-eliem'), (0.009270058511569545, 'resumehajjajiima')] data
[(0.0, 'amy'), (0.004723148102444098, 'cv-labia-oumaima'), (0.0, 'cv'), (0.009162907318741552, 'cvmohameel-eliem'), (0.0, 'resumeh

In [64]:
import sys
import pymongo
from pymongo import MongoClient
Client = MongoClient("mongodb+srv://misooo123:miso1212@nlpintern.9ghch.mongodb.net/Resume?retryWrites=true&w=majority")

db = Client.NLPIntern
C = db.text_cat
cur = db.Resume.find({}, {'trigrams': '','id':'0'})
db.Resume.count_documents({})

5

In [None]:
import sys
import pymongo
from pymongo import MongoClient
from collections import Counter

def mongo_db_insert_raw_text(connection_string, path):

    client = MongoClient(connection_string) 
    db = client.NLPIntern
    col = db.Resume
    
    to_insert = insert_text_into(path) 
    
    if type(to_insert) == dict : 
        col.insert_one(to_insert)

    elif type(to_insert) == list : 
        col.insert_many(to_insert)



""" 
        
def mongo_db_cat_text(connection_string):
    
    client = MongoClient(connection_string)
    db = client.NLPIntern
    C = db.text_cat
    Cursor = db.Resume.find({} ,{'raw_text' : '', 'name':'' })
    
    for i in Cursor :
        
        
        
        preproceced_text = main_PreProc(i.get('raw_text'))
        tokens = tokenizer(preproceced_text)
        counter = Counter(tokens)
        bigrams = bigrams_(preproceced_text,tokens)
        trigrams = trigrams_(preproceced_text,tokens)
        most_freq = counter.most_common(1)
        element = {
                    'name': i.get('name'),
                    'preproceced_text' : preproceced_text,
                    'tokens' : tokens,
                    'bigrams' : bigrams,
                    'trigrams' : trigrams,
                    'most_common_word' : most_freq,
                    'word_freq' : most_freq[0][1]/len(tokens),
                    
                   }
        
        C.insert_one(element)

      
def mongo_db_tokenizer_text(connection_string):

    client = MongoClient(connection_string)
    db = client.NLPIntern
    C = db.text_cat
    Cursor = db.text_cat.find({} ,{'preproceced_text' : '' })
    l = []
    for i in Cursor :
        tokens = tokenizer(i.get('preproceced_text'))
        element = {
                    'tokens' : tokens
                   }
        l.append(element)
    for i in range(len(l)):
        C.update( {},  { "$set": l[i] })
    
            
def mongo_db_bigrams(connection_string):
    client = MongoClient(connection_string) 
    db = client.NLPIntern
    C = db.text_cat
    Cursor = db.text_cat.find({} , { 'preproceced_text' : '' , 'tokens': [] })
    l=[]
    for i in Cursor :
        bigrams = bigrams_(i.get('preproceced_text'), i.get('tokens'))
        element = {
                    'bigrams' : bigrams
                   }
        l.append(element)
    for i in range(len(l)) : 
        C.update_one( {},  { "$set": l[i] })
        
            
def mongo_db_trigrams(connection_string):
    client = MongoClient(connection_string) 
    db = client.NLPIntern
    C = db.text_cat
    Cursor = db.text_cat.find({} , { 'preproceced_text' : '' , 'tokens': [] })
    
    for i in Cursor :
        trigrams = trigrams_(i.get('preproceced_text'), i.get('tokens'))
        element = {
                    'trigrams' : trigrams
                   }
        C.update_one(  {},  { "$set": trigrams})
            """

In [20]:
mongo_db_insert_raw_text("mongodb+srv://misooo123:miso1212@nlpintern.9ghch.mongodb.net/Resume?retryWrites=true&w=majority", r'..\stage\doubled')

In [675]:
#mongo_db_cat_text("mongodb+srv://misooo123:miso1212@nlpintern.9ghch.mongodb.net/Resume?retryWrites=true&w=majority")