In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from string import punctuation
import nltk, pandas as pd, re


In [11]:
# Load the Stack Overflow job postings table from CSV.
df_jobs = pd.read_csv('data/Stack_Overflow_Jobs2.csv')
# Last expression of the cell: displays (rows, columns) of the loaded frame.
df_jobs.shape

(1050, 19)

In [12]:
# Preview the first rows to inspect the available columns.
df_jobs.head()

Unnamed: 0,title,tags,perks,country,location,company,company-size,company-type,description,experience-level,industry,job-type,link,posted,moeda_original,role,salary_min,salary_max,salary_mean
0,Data Scientist,"python, medical, data-science",Paid relocation,"Massachusetts, EUA",Boston,Etiometry Inc.\n,50,Private,\nWe are currently seeking a Data Scientist to...,Mid-Level,"Data & Analytics, Healthcare, Medical Software",Full-time,/jobs/199604/data-scientist-etiometry-inc,2019-01-07 19:51:19.033107,-,Data Scientist,0,0,0.0
1,Senior Platform Developer (Backend Developer),"ruby-on-rails, elixir",,"Califórnia, EUA",San Francisco,The Real Real\n,5000,Private,\nThe RealReal is leading the way in authentic...,"Mid-Level, Senior, Lead","eCommerce, Fashion, Retail",Full-time,/jobs/199609/senior-platform-developer-backend...,2019-01-07 21:51:19.064124,-,Backend Developer,0,0,0.0
2,Senior Platform Engineer,"node.js, mysql, rest, ecmascript-6, javascript",,"Califórnia, EUA",San Francisco,Pear Therapeutics\n,200,Private,"\nAbout Pear Therapeutics \nAt Pear, our missi...","Mid-Level, Senior",Biotechnology,Full-time,/jobs/204842/senior-platform-engineer-pear-the...,2019-01-07 21:51:19.095266,-,Full Stack Developer,0,0,0.0
3,Senior Java Developer,"java-ee, spring, oracle, weblogic, jboss",Remote,"Califórnia, EUA",Los Angeles,Integrated Data Services (IDS)\n,200,Private,"\nSenior Java Developer\nLos Angeles, CA; Wash...",Senior,"Financial Technology, Government, Software Dev...",Full-time,/jobs/157266/senior-java-developer-integrated-...,2019-01-07 21:51:19.125580,-,Full Stack Developer,0,0,0.0
4,Senior Oracle PLSQL Developer,"plsql, oracle, sql, rdbms, database, sysadmin",Remote,"Califórnia, EUA",Los Angeles,Integrated Data Services (IDS)\n,200,Private,"\nSenior Oracle PLSQL Developer\nLos Angeles, ...",Senior,"Financial Technology, Government, Software Dev...",Full-time,/jobs/141988/senior-oracle-plsql-developer-int...,2019-01-07 21:51:19.160225,-,Database Administrator,0,0,0.0


In [13]:
# Load the tags table; in the code visible here only its column names are used,
# as the list of known tags (see _inicializar).
df_tags = pd.read_csv('data/jobs_tags.csv')

In [14]:
# Module-level state shared by the helper functions below.
# `vectorizer` is created lazily by _aplicar_tfidf; the other three are
# populated by _inicializar.
vectorizer = stopwords = tags = stemmer = None

In [15]:

def _processar_texto(texto):
    """Tokenise, filter and stem a text, returning the stems joined by spaces.

    A token is dropped when it consists only of ASCII punctuation characters,
    when it is found in ``stopwords``, or when it contains a digit or an
    underscore and is not one of the known ``tags``. Every remaining token is
    stemmed with the module-level ``stemmer``.

    Relies on the module-level ``stemmer``, ``stopwords`` and ``tags`` having
    been set (see ``_inicializar``).

    Args:
        texto: Raw text to normalise.

    Returns:
        A single string with the stemmed tokens separated by one space.
    """
    def _manter(token):
        # Character-wise test: `token not in punctuation` would be a substring
        # test and let multi-character tokens such as "--" or "..." through.
        if all(ch in punctuation for ch in token):
            return False
        if token in stopwords:
            return False
        # Remove tokens containing numbers/underscores, but keep tags that
        # contain them. Raw string avoids the invalid "\d" escape warning.
        if re.search(r'[\d_]', token) and token not in tags:
            return False
        return True

    return ' '.join(
        stemmer.stem(w) for w in nltk.word_tokenize(texto) if _manter(w)
    )

def _get_vectorizer():
    """Create a new, unfitted TF-IDF vectorizer with this notebook's settings."""
    opcoes = {
        'lowercase': True,
        'use_idf': True,
        'max_df': 1.0,
        # Stop-word removal is left disabled, as in the original:
        # 'stop_words': 'english',
    }
    return TfidfVectorizer(**opcoes)

def _aplicar_tfidf(palavras=None):
    """Transform documents into a dense TF-IDF DataFrame.

    On the first call (while the module-level ``vectorizer`` is still None) a
    new vectorizer is created and fitted on ``palavras``; later calls reuse
    that fitted vectorizer and only transform.

    Args:
        palavras: Iterable of document strings. Defaults to an empty list.

    Returns:
        pandas.DataFrame with one row per document and one column per term of
        the fitted vocabulary.

    Raises:
        Whatever the underlying vectorizer raises, e.g. when it is fitted on
        an empty collection of documents.
    """
    global vectorizer

    # Avoid a shared mutable default argument.
    if palavras is None:
        palavras = []

    if vectorizer is None:
        # Fit on a local first so that a failed fit does not leave an
        # unfitted vectorizer cached in the global.
        novo = _get_vectorizer()
        novo.fit(palavras)
        vectorizer = novo

    matrix_tfidf = vectorizer.transform(palavras)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    obter_nomes = getattr(vectorizer, 'get_feature_names_out', None)
    if obter_nomes is None:
        obter_nomes = vectorizer.get_feature_names

    # toarray() gives a plain ndarray rather than the legacy np.matrix.
    return pd.DataFrame(matrix_tfidf.toarray(), columns=obter_nomes())

def jobs_similares(search, max_jobs):
    """Return the jobs most similar to a free-text search.

    The search text goes through the same processing and TF-IDF transform as
    the job documents, and jobs are ranked by cosine similarity to it.

    Relies on the module-level ``jobs_tfidf`` (TF-IDF matrix of the jobs) and
    ``df_jobs`` having been built beforehand.

    Args:
        search: Free-text query.
        max_jobs: Maximum number of jobs to return; values <= 0 give an empty
            result.

    Returns:
        The rows of ``df_jobs`` for the top ``max_jobs`` matches, ordered from
        most to least similar.
    """
    search = _processar_texto(search)
    search_tfidf = _aplicar_tfidf([search])

    # Align the query's columns by name with the job matrix, so the comparison
    # is term-by-term even if the two frames were built separately.
    search_tfidf = search_tfidf.reindex(columns=jobs_tfidf.columns, fill_value=0)

    # Compare the query against the jobs only. Appending the query to the job
    # matrix and skipping the first sorted entry assumed the query always
    # ranks first, which fails for an all-zero query vector or on ties, and
    # it also computed a full (n+1) x (n+1) similarity matrix.
    sim = cosine_similarity(search_tfidf, jobs_tfidf)[0]

    # Positional index, as before; mergesort keeps tied jobs in original order.
    index_jobs = (
        pd.Series(sim)
        .sort_values(ascending=False, kind='mergesort')
        .index
    )

    return df_jobs.reindex(index_jobs[:max(max_jobs, 0)])
    
    

def _inicializar():
    """Populate the module-level ``stopwords``, ``tags`` and ``stemmer``.

    Reads the tag names from the columns of the module-level ``df_tags``.
    """
    global stopwords, tags, stemmer

    # Stop-word filtering is effectively off: with an empty string only the
    # empty token matches. Original alternative kept for reference:
    # nltk.corpus.stopwords.words('english')
    stopwords = ''

    tags = list(df_tags.columns)

    # NOTE(review): RSLP is NLTK's stemmer for Portuguese, while the job texts
    # shown in this notebook are English; confirm whether an English stemmer
    # was intended before regenerating the saved artifacts.
    stemmer = nltk.stem.RSLPStemmer()
    
# Populate stopwords, tags and stemmer before any text is processed.
_inicializar()


In [16]:
# Replace missing values with empty strings so text fields concatenate cleanly.
df_jobs = df_jobs.fillna('')

In [17]:
# Build one document per posting: description, title, tags and role,
# separated by single spaces.
df_jobs['doc'] = [
    ' '.join(str(campo) for campo in campos)
    for campos in zip(
        df_jobs['description'],
        df_jobs['title'],
        df_jobs['tags'],
        df_jobs['role'],
    )
]

In [18]:
# Tokenise, filter and stem every document.
# NOTE: this overwrites 'doc' with its processed form, so re-running this cell
# on its own processes already-stemmed text; rebuild 'doc' first.
df_jobs['doc'] = df_jobs['doc'].apply(_processar_texto)
# Show the first 400 characters of the second document as a sanity check.
df_jobs['doc'].iloc[1][:400]

'the realre is leading the way in authenticated luxury consignment onlin and in real lif at our brick and mort locatiom founded in we ’ re growing fast and fundamentally changing the way peopl buy and sell luxury — a multi-billion doll industry with a te of in-hous expert who inspect every it we sell our commitment to authenticity set us apart and creat a foundation of trust with shopp and consign '

In [None]:
# On a fresh top-to-bottom run the global vectorizer is still None here, so
# this call fits it on the job documents and returns their TF-IDF DataFrame.
jobs_tfidf = _aplicar_tfidf(df_jobs['doc'])

In [None]:
# Persist the fitted vectorizer to disk.
# NOTE: `sklearn.externals.joblib` (imported at the top) was removed in
# scikit-learn 0.23; on newer versions use `import joblib` directly.
joblib.dump(vectorizer, 'data/vectorizer_jobs.dat')

In [None]:
# Save the jobs' dense TF-IDF matrix as CSV, without the row index.
jobs_tfidf.to_csv('data/jobs_tfidf.csv', index=False)