In [None]:
import math

import numpy as np
from scipy.sparse import csr_matrix
import scipy.sparse
from sklearn.utils.extmath import safe_sparse_dot
from gensim.models import Word2Vec, KeyedVectors 
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from preprocessing import prepare_doc
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def getW2V_model(data, embedding_dim=300, context_size=4, min_freq=4, 
                 iterations=5, skip_gram=0):
  """
  Creates and trains a word2vec model.

  Parameters
  ----------
  data :
    Training corpus. It is handed first to ``build_vocab`` and then to
    ``train``, so it is presumably a re-iterable collection of tokenised
    sentences (a one-shot generator would be exhausted after the first
    pass) -- TODO confirm against callers.
  embedding_dim : int
    Dimensionality of the word vectors (gensim ``vector_size`` / ``size``).
  context_size : int
    Forwarded as gensim's ``window``.
  min_freq : int
    Forwarded as gensim's ``min_count``.
  iterations : int
    Number of epochs passed to ``train``.
  skip_gram : int
    Forwarded as gensim's ``sg`` flag.

  Returns
  -------
  The trained ``Word2Vec`` instance.
  """

  shared_kwargs = dict(window=context_size, min_count=min_freq, sg=skip_gram)
  try:
    # gensim >= 4.0 calls the dimensionality argument `vector_size`.
    w2v_model = Word2Vec(vector_size=embedding_dim, **shared_kwargs)
  except TypeError:
    # Older gensim releases only accept the former `size` keyword.
    w2v_model = Word2Vec(size=embedding_dim, **shared_kwargs)

  # The vocabulary has to exist before training; corpus_count is set by it.
  w2v_model.build_vocab(data)
  w2v_model.train(data, total_examples=w2v_model.corpus_count, 
                  epochs=iterations)

  return w2v_model


In [None]:
def getD2V_model(docs: list, vector_size=5, window=2, min_count=1,
                 dist_memory=1, seed=42):
  """
  Build a doc2vec model from a list of documents.

  Each document is wrapped in a ``TaggedDocument`` whose single tag is the
  document's position in ``docs``; the tagged corpus is then handed to the
  ``Doc2Vec`` constructor.

  Parameters
  ----------
  docs : list
    Documents to embed (presumably token lists -- TODO confirm).
  vector_size, window, min_count, seed :
    Forwarded unchanged to ``Doc2Vec``.
  dist_memory : int
    Forwarded as ``Doc2Vec``'s ``dm`` flag.

  Returns
  -------
  The ``Doc2Vec`` instance built from the tagged corpus.
  """
  corpus = []
  for position, tokens in enumerate(docs):
    corpus.append(TaggedDocument(tokens, [position]))

  # The worker count is fixed at 4; everything else comes from the arguments.
  hyper_params = dict(
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=4,
    dm=dist_memory,
    seed=seed,
  )
  return Doc2Vec(corpus, **hyper_params)


In [None]:
def getTFIDF(data, max_df=0.95, min_df=2):
  """
  Calculates the tf-idf for the given data.

  Parameters
  ----------
  data :
    Collection of documents; it is passed as-is to
    ``TfidfVectorizer.fit_transform``.
  max_df, min_df :
    Document-frequency cut-offs forwarded to ``TfidfVectorizer``.

  Returns
  -------
  tuple
    ``(tfidf, tfidf_vectorizer)``: the result of ``fit_transform`` and the
    fitted vectorizer.
  """

  # `prepare_doc` is used as the analyzer. scikit-learn only applies its
  # `stop_words` option when analyzer == 'word', so passing
  # stop_words='english' here had no effect beyond a UserWarning; any
  # stop-word removal has to happen inside `prepare_doc`.
  tfidf_vectorizer = TfidfVectorizer(analyzer=prepare_doc,
                                     max_df=max_df, min_df=min_df)
  tfidf = tfidf_vectorizer.fit_transform(data)

  return tfidf, tfidf_vectorizer

In [None]:
def getTF(data, max_df=0.95, min_df=2):
  """
  Compute raw term counts for the given documents.

  A ``CountVectorizer`` is configured with the supplied document-frequency
  cut-offs and the built-in English stop-word list, then fitted on ``data``.

  Returns
  -------
  tuple
    ``(tf, tf_vectorizer)``: the result of ``fit_transform`` and the
    fitted vectorizer.
  """
  vectorizer_options = {
    "max_df": max_df,
    "min_df": min_df,
    "stop_words": 'english',
  }
  counter = CountVectorizer(**vectorizer_options)

  return counter.fit_transform(data), counter
