# In [36]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import csv
import datetime

import gensim
from gensim.models import Word2Vec

from nltk.stem.snowball import SnowballStemmer
import re


def clear_txt(txt):
    """Normalise free text into a space-separated string of Russian stems.

    The text is lower-cased, then digits, newlines and the punctuation listed
    in the character class below are replaced by spaces. Latin and Cyrillic
    letters are left untouched. Every non-empty token is then passed through
    the Russian Snowball stemmer.

    Args:
        txt: Raw text to clean.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the cleaning.
    """
    russian_stemmer = SnowballStemmer('russian')
    cleaned = re.sub('[/+_!@#$0-9\n.,:()""«»;-]', ' ', txt.lower())
    # Tokens are delimited by the space character only, so runs of spaces
    # produce empty tokens that are dropped here.
    stems = [russian_stemmer.stem(token) for token in cleaned.split(' ') if token]
    return ' '.join(stems)


def match_description(df3, exp_len, exp_desc):
    """Score each vacancy description in *df3* against a candidate's experience.

    The Word2Vec model stored in ``w2v_description.model`` (relative path) is
    loaded on every call. Both the target text and each ``description_stem``
    value are split on single spaces, words unknown to the model are dropped,
    and the remaining word lists are compared with ``n_similarity`` (cosine
    similarity).

    Side effects:
        Missing values in ``df3['description_stem']`` are replaced in place
        with ``'--'``, and the normalised target sentence is printed.

    Args:
        df3: Vacancy frame with a ``description_stem`` column.
        exp_len: Unused; kept so existing callers keep working.
        exp_desc: Candidate experience text (commas are treated as spaces and
            the text is lower-cased before matching).

    Returns:
        DataFrame with one row per row of *df3*, in the same order:
        ``Indexes`` holds the row's label from ``df3.index`` and
        ``Desc_Similarity`` the similarity score. The score is 0.0 when
        either side has no word known to the model.
    """
    w2v_model = Word2Vec.load("w2v_description.model")
    word_vectors = w2v_model.wv
    # key_to_index is a dict, so membership tests are O(1); scanning the
    # index_to_key list for every word is linear in the vocabulary size.
    vocabulary = word_vectors.key_to_index

    df3['description_stem'] = df3['description_stem'].fillna('--')

    target_sentence = exp_desc.replace(',', ' ').lower()
    target_words = [w for w in target_sentence.split(' ') if w in vocabulary]

    similarities = np.zeros(len(df3))
    # n_similarity raises when either word list is empty, so such pairs keep
    # the default score of 0.0 instead of aborting the whole run.
    if target_words:
        for position, description in enumerate(df3['description_stem']):
            known_words = [w for w in description.split(' ') if w in vocabulary]
            if known_words:
                similarities[position] = word_vectors.n_similarity(target_words, known_words)

    print("Target:", target_sentence)

    # Take the labels straight from the index. Looking a row up by its text
    # returns the first row with equal text, which mislabels duplicates.
    return pd.DataFrame({
        'Indexes': df3.index.to_numpy(),
        'Desc_Similarity': similarities,
    })


def match_skills(df3, exp_len, skills):
    """Score each vacancy's skill list in *df3* against a candidate's skills.

    The Word2Vec model stored in ``w2v_skills.model`` (relative path) is
    loaded on every call. Both the target text and each ``skills`` value are
    split on single spaces, words unknown to the model are dropped, and the
    remaining word lists are compared with ``n_similarity``.

    Side effects:
        ``df3['skills']`` is rewritten in place: commas become spaces, text is
        lower-cased and missing values become ``'--'``. The normalised target
        sentence is printed.

    Args:
        df3: Vacancy frame with a ``skills`` column.
        exp_len: Unused; kept so existing callers keep working.
        skills: Candidate skills text (commas are treated as spaces and the
            text is lower-cased before matching).

    Returns:
        DataFrame with one row per row of *df3*, in the same order:
        ``Indexes`` holds the row's label from ``df3.index`` and
        ``Skill_Similarity`` the similarity score. The score is 0.0 when
        either side has no word known to the model.
    """
    df3['skills'] = df3['skills'].str.replace(',', ' ').str.lower().fillna('--')

    w2v_model = Word2Vec.load("w2v_skills.model")
    word_vectors = w2v_model.wv
    # key_to_index is a dict, so membership tests are O(1); scanning the
    # index_to_key list for every word is linear in the vocabulary size.
    vocabulary = word_vectors.key_to_index

    target_sentence = skills.replace(',', ' ').lower()
    target_words = [w for w in target_sentence.split(' ') if w in vocabulary]

    similarities = np.zeros(len(df3))
    # n_similarity raises when either word list is empty (e.g. the '--'
    # placeholder), so such pairs keep the default score of 0.0.
    if target_words:
        for position, skill_text in enumerate(df3['skills']):
            known_words = [w for w in skill_text.split(' ') if w in vocabulary]
            if known_words:
                similarities[position] = word_vectors.n_similarity(target_words, known_words)

    print("Target:", target_sentence)

    # Take the labels straight from the index. Looking a row up by its text
    # returns the first row with equal text, so every row sharing a skills
    # string (notably the '--' placeholder) would get the same label.
    return pd.DataFrame({
        'Indexes': df3.index.to_numpy(),
        'Skill_Similarity': similarities,
    })


def match_vacancy(person):
    """Return a text summary of the five vacancies that best match *person*.

    Vacancies are read from ``input/data_processed_add_stems.csv`` (UTF-16),
    filtered by required experience, randomly sampled, scored on description
    and skills similarity, and ranked by a 0.6 / 0.4 weighted combination of
    the skill and description scores. Progress information (timestamps, the
    parsed input, intermediate sizes and the selected vacancies) is printed.

    Args:
        person: Semicolon-separated string. Field 0 is the candidate's
            experience length (parsed with ``int``), field 1 the skills text
            and field 2 the free-text experience description.

    Returns:
        A string with one entry per selected vacancy: URL and name on the
        first line, then the experience label, then the requirement text,
        followed by a blank line. Lines are separated with ``os.linesep``.

    Raises:
        IndexError: If *person* has fewer than three fields.
        ValueError: If the first field is not an integer.
    """
    print(datetime.datetime.now())

    df3 = pd.read_csv(os.path.join('input', 'data_processed_add_stems.csv'), encoding='utf-16')

    # Map each experience bucket id to the number stored in exp_length_hh.
    # Rows with any other id are left untouched.
    years_by_bucket = {
        'between3And6': 3,
        'between1And3': 1,
        'noExperience': 0,
        'moreThan6': 6,
    }
    for bucket, years in years_by_bucket.items():
        df3.loc[df3['experience.id'] == bucket, 'exp_length_hh'] = years

    data = person.split(';')
    print(data)
    exp_len = data[0]
    skills = data[1]
    experience = data[2]

    # Collapse every run of non-word characters into one space before stemming.
    s1 = re.sub(r'\W+', ' ', experience)
    exp_processed = clear_txt(s1)

    print(df3['exp_length_hh'].unique())

    # Keep vacancies asking for at most one step more than the candidate has.
    df3 = df3[df3['exp_length_hh'] <= (int(exp_len) + 1)]

    # Score a random sample of at most 5000 vacancies. Capping at the frame
    # size avoids the ValueError sample() raises when n exceeds the row count.
    df3 = df3.sample(n=min(5000, len(df3)))

    print(len(df3))

    x = match_description(df3, exp_len, exp_processed)
    y = match_skills(df3, exp_len, skills)

    out_df = x.merge(y, on='Indexes', how='inner')
    out_df['Combined_Similarity'] = out_df['Skill_Similarity'] * 0.6 + out_df['Desc_Similarity'] * 0.4

    ids = out_df.sort_values(by=['Combined_Similarity'], ascending=False)[:5]['Indexes']

    def as_text(value):
        # Empty CSV cells are read as NaN (a float), which cannot be
        # concatenated to a str; render them as an empty string instead.
        return '' if pd.isna(value) else str(value)

    entries = []
    for i in ids:
        print(df3.loc[i, 'alternate_url'], df3.loc[i, 'name'])
        print(df3.loc[i, 'skills'])
        print(df3.loc[i, 'requirement'])
        print('\n')

        entries.append(
            as_text(df3.loc[i, 'alternate_url']) + ' ' + as_text(df3.loc[i, 'name']) + os.linesep
            + as_text(df3.loc[i, 'experience.name']) + os.linesep
            + as_text(df3.loc[i, 'requirement']) + os.linesep + os.linesep
        )
    out = ''.join(entries)

    print(datetime.datetime.now())

    return out