In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from string import punctuation, digits
import nltk, pandas as pd, re

In [7]:
# Course-similarity state; each name is assigned for real inside _inicializar().
vectorizer_curso = df_cursos = df_cursos_tfidf = None
stopwords = tags = None
# stemmer = None  (stemming is currently disabled)

# Tag-discovery state, also assigned inside _inicializar().
vectorizer_tags = df_tfidf_tags = None

In [13]:
def _processar_texto(texto):
    """Tokenize ``texto`` and return the surviving tokens joined by single spaces.

    A token is dropped when any of the following holds:
      * it is found in ``string.punctuation`` (a substring test against that string);
      * it is listed in the module-level ``stopwords``;
      * it contains a digit or an underscore and is not one of the module-level ``tags``.

    Depends on the globals ``stopwords`` and ``tags``, which ``_inicializar()``
    assigns. Tokens are compared as-is; no lower-casing happens here.

    Args:
        texto: raw text to clean.

    Returns:
        str: the filtered tokens separated by a single space.
    """
    def _manter(w):
        # Punctuation and stopwords are always discarded.
        if w in punctuation or w in stopwords:
            return False
        # Raw string literal: '\d' inside a plain literal is an invalid escape
        # sequence, which Python reports with a warning.
        tem_digito = re.match(r'.*[\d_].*', w)
        # Tokens with digits/underscores survive only when they are known tags.
        return not (tem_digito and w not in tags)

    return ' '.join(w for w in nltk.word_tokenize(texto) if _manter(w))

def _aplicar_tfidf(textos):
    """Vectorize ``textos`` with the module-level ``vectorizer_curso``.

    The resulting matrix is handed to ``_tfidf_to_dataframe`` and that
    DataFrame is returned.
    """
    return _tfidf_to_dataframe(vectorizer_curso.transform(textos), vectorizer_curso)
    

def _tfidf_to_dataframe(tfidf, vectorizer):
    return pd.DataFrame(
        tfidf.todense(),
        columns=vectorizer.get_feature_names()
    )

def _get_vectorizer():
    """Create the (unfitted) TF-IDF vectorizer used for the course documents."""
    opcoes = dict(lowercase=True, use_idf=True, max_df=1.0)
    # stop_words = 'english'  (disabled)
    return TfidfVectorizer(**opcoes)
    
def _get_vectorizer_tags():
    """Create the (unfitted) TF-IDF vectorizer restricted to the known tags.

    The vocabulary is fixed to the module-level ``tags`` list, so only those
    terms become features.
    """
    return TfidfVectorizer(
        lowercase=True,
        use_idf=True,
        max_df=1.0,
        vocabulary=tags,
        # Split on whitespace only. Written as a raw string because '\S' in a
        # plain literal is an invalid escape sequence.
        # NOTE(review): '\S\S+' needs at least two characters, so one-character
        # tags can never be matched -- confirm whether that is intended.
        token_pattern=r'(?u)\S\S+',
    )

def cursos_similares(search, max_cursos):
    """Return the courses whose documents are most similar to a text query.

    The query is cleaned with ``_processar_texto``, vectorized with the course
    vectorizer and compared by cosine similarity against every row of the
    module-level ``df_cursos_tfidf``.

    Args:
        search: free-text query.
        max_cursos: maximum number of courses to return; values below zero
            are treated as zero.

    Returns:
        list[dict]: rows of ``df_cursos`` as records, most similar first.
    """
    consulta = _processar_texto(search)
    consulta_tfidf = _aplicar_tfidf([consulta])

    # Compare the query directly against the course rows (1 x n) instead of
    # appending it to the matrix and computing all pairwise similarities. This
    # avoids DataFrame.append (removed in pandas 2.0) and no longer relies on
    # the query sorting first, which fails for an all-zero query vector or
    # when a course ties with the query.
    sim = cosine_similarity(consulta_tfidf, df_cursos_tfidf)[0]
    ranking = pd.Series(sim).sort_values(ascending=False).index

    # Rows of df_cursos_tfidf are positionally aligned with df_cursos.
    melhores = ranking[:max(max_cursos, 0)].tolist()
    return df_cursos.iloc[melhores].to_dict(orient='records')

def _fit_vectorizer_tags_curso():
    """Fit a new tag vectorizer on ``tags`` and vectorize ``tags`` with it.

    Returns:
        tuple: ``(DataFrame, vectorizer)`` -- the TF-IDF rows for the tags and
        the fitted vectorizer that produced them.
    """
    vec = _get_vectorizer_tags()
    matriz = vec.fit(tags).transform(tags)
    return _tfidf_to_dataframe(matriz, vec), vec
    
def descobrir_tags_do_curso(desc_curso, corte_sim=0.20, max_tags=8):
    """Find the known tags most similar to a course description.

    Args:
        desc_curso: the description, either as a single string or as an
            iterable of strings. When several documents are given, only the
            last one is scored.
        corte_sim: similarity threshold; only tags scoring strictly above it
            are kept.
        max_tags: maximum number of tags to return.

    Returns:
        pandas.Series: tag names ordered by decreasing similarity, indexed by
        their position in the module-level ``tags`` list.
    """
    # transform() rejects a bare string, so wrap a single description.
    docs = [desc_curso] if isinstance(desc_curso, str) else desc_curso
    tfidf = vectorizer_tags.transform(docs)
    df_curso = _tfidf_to_dataframe(tfidf, vectorizer_tags).iloc[[-1]]

    # Score the description against the tag rows only (1 x n_tags). This
    # avoids DataFrame.append (removed in pandas 2.0) and the full pairwise
    # matrix, and keeps the result index within range(len(tags)).
    sim = pd.Series(cosine_similarity(df_curso, df_tfidf_tags)[0])
    sim = sim[sim > corte_sim]

    index_tags_sim = sim.sort_values(ascending=False).index
    return pd.Series(tags)[index_tags_sim][:max_tags]
    

def _inicializar(caminho_tags='../StackOverflowJobs/data/jobs_tags.csv',
                 caminho_cursos='data/cursos_udemy.csv'):
    """Load the data files and fit both vectorizers, filling the module globals.

    Assigns ``stopwords``, ``tags``, ``df_cursos``, ``vectorizer_curso``,
    ``df_cursos_tfidf``, ``df_tfidf_tags`` and ``vectorizer_tags``.

    Args:
        caminho_tags: CSV whose column names are used as the tag list.
        caminho_cursos: CSV with the courses. When it has no ``doc`` column,
            one is built from ``title``, ``description`` and ``what_learn``.
    """
    global stopwords, df_cursos, df_cursos_tfidf, vectorizer_curso, tags, vectorizer_tags, df_tfidf_tags

    stopwords = nltk.corpus.stopwords.words('english')
    #stemmer = nltk.stem.RSLPStemmer()
    # Only the header is needed, so no data rows are parsed.
    tags = pd.read_csv(caminho_tags, nrows=0).columns.tolist()

    df_cursos = pd.read_csv(caminho_cursos).fillna('')
    if 'doc' not in df_cursos.columns:
        # Join the text fields with a single space. The previous code called
        # '%s %s %s'.join(...), which used the format string as the separator
        # and injected literal "%s %s %s" between the fields.
        df_cursos['doc'] = df_cursos.apply(
            lambda row: ' '.join([row['title'], row['description'], row['what_learn']]),
            axis=1,
        )
        df_cursos['doc'] = df_cursos['doc'].apply(_processar_texto)

    vectorizer_curso = _get_vectorizer()
    vectorizer_curso.fit(df_cursos['doc'])
    df_cursos_tfidf = _aplicar_tfidf(df_cursos['doc'])

    df_tfidf_tags, vectorizer_tags = _fit_vectorizer_tags_curso()
    
# Populate the module-level state (data frames, stopwords, tags and both vectorizers).
_inicializar()


In [14]:
# Ask for the 10 courses most similar to a free-text query.
# NOTE(review): the query looks misspelled ('pytho djando'); presumably
# 'python django' was intended -- confirm before relying on this output.
cursos_similares('pytho djando api', 10)

[{'description': "Are you tired of boring the outdated and incomplete courses , then let's dive in to the world of Rest Api's.Well I'm Asfend Microsoft Most Valuable Professional (MVP) and in this course I'll explain what basically the Rest Api is and how you can create the Rest Api's in Asp.Net and C#.\xa0\nThe main focus of this course is on the\xa0Restful Web Api via Asp.NET and\xa0C#. So if you're familiar with C# , Asp.Net and Entity Framework Or if you want to create the Restful web api's in\xa0Asp.Net\xa0then this is the right course for you\n\n\nIn this course you will learn how to create the REST\xa0API via Code First and Database First Approach using ASP.NET\xa0. Then you\xa0will learn how to test your web api via Postman , Later in this course\xa0you'll learn how to create Web Apps on Microsoft Azure and Publish your Restful Web Api's to Microsoft Azure Cloud. And then you'll\xa0cover all the advanced concepts of Rest Api's like Sorting ,\xa0Caching , Versioning , Security ,

In [15]:
# Save the course TF-IDF DataFrame as CSV, without the index column.
df_cursos_tfidf.to_csv('data/cursos_tfidf.csv', index=False)

In [16]:
# Serialize the fitted course vectorizer to disk with joblib.
# NOTE(review): `joblib` is imported from `sklearn.externals` in this notebook,
# which newer scikit-learn releases no longer ship -- confirm the pinned version.
joblib.dump(vectorizer_curso, 'data/vectorizer_cursos.dat')

['data/vectorizer_cursos.dat']

In [17]:
# Serialize the fitted tag vectorizer to disk with joblib.
joblib.dump(vectorizer_tags, 'data/vectorizer_tags.dat')

['data/vectorizer_tags.dat']

In [18]:
# Save the tag TF-IDF DataFrame as CSV, without the index column.
df_tfidf_tags.to_csv('data/tags_tfidf.csv', index=False)

In [21]:
# Shape of the first 20 course rows (rows, columns).
df_cursos.iloc[0:20].shape

(20, 9)