In [None]:
!pip -qq install plsa

[K     |████████████████████████████████| 1.5 MB 9.3 MB/s 
[K     |████████████████████████████████| 749 kB 39.9 MB/s 
[?25h

In [None]:
import numpy as np
import sys 
import plsa 
import re, unicodedata 
import nltk 
import inflect 
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords 
from nltk.stem import LancasterStemmer,WordNetLemmatizer 
import pandas as pd
from IPython.display import clear_output
#nltk.download('omw-1.4')

In [None]:
def remove_non_ascii(words):
    'Remove non-ASCII characters from list of tokenized words'
    new_words = [] 
    for word in words:
         new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
         new_words.append(new_word) 
    return new_words


def to_lowercase (words):
    'Convert all characters to lowercase from list of tokenized words' 
    new_words = [] 
    for word in words:
         new_word = word.lower()
         new_words.append(new_word) 
    return new_words


def remove_punctuation(words):
  'Remove punctuation from list of tokenized words' 
  new_words = [] 
  for word in words:
      new_word = re.sub(r'[^\w\s]', '', word) 
      if new_word != '':
         new_words.append(new_word) 
  return new_words


def replace_numbers(words):
   'Replace all interger occurrences in list of tokenized words with textual representation'
   p = inflect.engine() 
   new_words = [] 
   for word in words: 
      if word.isdigit():
         new_word = p.number_to_words(word)
         new_words.append(new_word) 
      else:
            new_words.append(word)  
   return new_words


def remove_stopwords(words):
 'Remove stop words from list of tokenized words'
 new_words = [] 
 for word in words: 
     if word not in stopwords.words('english'):
        new_words.append(word) 
 return new_words

  
def stem_words(words):
   'Stem words in list of tokenized words' 
   stemmer = LancasterStemmer() 
   stems = [] 
   for word in words:
     stem = stemmer.stem(word)
     stems.append(stem) 
   return stems



def lemmatize_verbs(words):
   'Lemmatize verbs in list of tokenized words' 
   lemmatizer = WordNetLemmatizer() 
   lemmas = [] 
   for word in words:
       lemma = lemmatizer.lemmatize(word, pos='v')
       lemmas.append(lemma) 
   return lemmas

def normalize(words):
    words = remove_non_ascii(words) 
    words = to_lowercase(words) 
    words = remove_punctuation(words) 
    words = replace_numbers(words) 
    words = remove_stopwords(words) 
    return words


def stem_and_lemmatize(words):
    stems = stem_words(words) 
    lemmas = lemmatize_verbs(words) 
    return stems, lemmas


In [None]:
#To read  pickle_file
data_file = pd.read_pickle(r"database20190924_parsed.pickle")
data_file[:2]

[('Machine learning for discovering missing or wrong protein function annotations',
  ['Felipe Kenji Nakano', 'Mathias Lietaert', 'Celine Vens'],
  'Hierarchical multi-label classification;Protein function prediction;Benchmark datasets',
  'Machine Learning and Artificial Intelligence in Bioinformatics',
  'A massive amount of proteomic data is generated on a daily basis, nonetheless annotating all sequences is costly and often unfeasible. As a countermeasure, machine learning methods have been used to automatically annotate new protein functions. More specifically, many studies have investigated hierarchical multi-label classification (HMC) methods to predict annotations, using the Functional Catalogue (FunCat) or Gene Ontology (GO) label hierarchies. Most of these studies employed benchmark datasets created more than a decade ago, and thus train their models on outdated information. In this work, we provide an updated version of these datasets. By querying recent versions of FunCat a

In [None]:
#### PLAN FOR DATA PREPARATION
# 1. Scan to Find all unique words in the whole document :  These will be column names
# 1.1 Initialize to zero for all cells, matrices
# 1.2 Count all words for each column in each article (row), update the initially initialized counts in cell from step 1.1


# 2. Remove words with a total count of less than 3
# 3. Remove 20% of the most common words???? most common?? how??? what is the threshold?

# 4. Continue with PLSA implementation

In [None]:
#data = [(title,[authors],keywords,classification,abstract),(),(),()]
def preprocessing(data=data_file):
    for article in data:
        complete_article = article[0] +" "+ " ".join(article[1]) + article[2] + " " + article[3] + " "+article[4]
        words = nltk.word_tokenize(complete_article)
        words = normalize(words)
        _,lemmas = stem_and_lemmatize(words)
        yield lemmas


def scan_unique_words(data):
    words = []
    for article in data:
        words.extend(set(article))
    return set(words)

def data_matrix(data,column_names,threshold):
    # initialize an empty matrix
    data = list(data)
    rows = len(data) # change later
    cols = len(column_names)
    empty_data = np.zeros((rows,cols))
    print(f"Start Shape: {(rows,cols)}")
    # Count words in each article
    for article in range(rows):
        clear_output(wait=True) 
        for key_word in range(cols):
            empty_data[article,key_word] += " ".join(data[article]).count(column_names[key_word])
        print(f'{100*article/rows:.0f}% articles processed {article} of {rows}')        
    # Generate Pandas df
    df = pd.DataFrame(data=empty_data,columns=column_names)
    # Final Touch file
    clear_output(wait=True) 
    print("final_preprocessed_file data")
    df = final_processed_file(df,threshold)
    df.to_csv("processed_data.csv",index=False)
    print("All done ...")
    print(f"Start Shape: {(rows,cols)}")
    print(f"End Shape: {df.shape}")
    return df

def final_processed_file(df,threshold):
    '''
    threshold = (0,1)
    '''
    # drop colums with less than 3 counts in total
    # randomly drop columns with 20% of the total counts exceeding threshold
    # Remove some noise, words with less than 3 letters for example
    cols = df.columns
    i=0
    for c in cols:
        if len(c) < 3:
            df.drop(c,axis=1,inplace=True)
            i+=1

        

    sums = df.sum()
    cols = sums[sums<3].index


    if list(cols):
        print(f"Dropping columns with less than 3 counts in total.....\n{list(cols)}\n")
        df.drop(cols,axis=1,inplace=True)

    sums = df.sum()
    max_sum = sums.values.max()
    exceed_th = sums[sums >= threshold*max_sum].index

    if list(exceed_th):
        indx = np.random.permutation(int(len(exceed_th)*0.2))
        exceed_th_20p = exceed_th[indx]
        print(f"Randomly drop columns with 20% of the total counts exceeding threshold:{threshold*max_sum:.0f}\n{list(exceed_th_20p)}\n") 
        df.drop(exceed_th_20p,axis=1,inplace=True)

    return df





In [None]:
preprocessed = preprocessing(data_file)
preprocessed

<generator object preprocessing at 0x7f1f9bb67b50>

In [None]:
unique_words = list(scan_unique_words(preprocessed))

In [None]:
len(unique_words)

41381

In [None]:
df2 = data_matrix(preprocessing(data_file),unique_words,0.05)

In [None]:
df2.head()

Unnamed: 0,unknown,plus,jquery,subgraph,minimum,mens,immunotherapy,bpms,linedrug,diurnal,benjamin,wormtable,mirs,exac,japanese,consumer,evfoldmfdca,mutant,iris,barcoding,theor,novice,bioshell,postprocess,unless,ravo,akash,serine,banga,hope,hide,twopart,ethical,syringae,erin,cyril,santella,harper,ams,modern,...,organism,bruce,sspace,liver,hamper,constraintbased,seventy,nine hundred and fifty-eight,joaquin,mya,fpgas,bifurcation,nonspecific,competitive,perhaps,generality,kingdom,circular,hcc,kefed,supercoiling,four,lad,woods,terminus,gapfill,efficiency,anticancer,lymphocytic,rnac,variate,georeferenced,alan,yee,gtex,algorithmbayesian,metallopeptidaselike,beanalyzer,mone,establishment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df2 = pd.read_csv("/content/processed_data.csv")

py2

In [None]:
import numpy as np 
from sklearn.preprocessing import normalize
import sys

class Plsa(object):
      def __init__(self, dataset):
          self.data_set = dataset.values #data set 
          self.size_articles = 10 #number of articles taken 
          self.k = 10  #number of topics, by default 18 
          self.size_M = self.data_set.size

          
          '''We have try to have the same naming with the formula that is going to be implemented 
             So we make the naming similar to the formulas.'''
           # probabilities 
          self.D_W = [] #P ( d| w) 
          self.Z_D = [] # P(Z / d) 
          self.Z_D_W = [] # P(z / d,w) 
          self.W_Z = [] # P(w /z) 
          self.P_D = [] # P(d ) 
          self.W_D = [] # P(w/d)

      def ini_matrix_prob(self): 
          #initialize empty matrices and also generate matrices
          self.Z_D = np.zeros([self.k, self.size_articles], dtype = np.float32) 
          self.W_Z = np.zeros([self.size_M, self.k], dtype= np.float32) 
          self.P_D = np.zeros((self.size_articles), dtype=np.float32) 
          self.Z_D_W = np.zeros([self.k, self.size_articles, self.size_M], dtype=np.float32) 
          self.W_D = np.zeros([self.size_M, self.size_articles], dtype=np.float32) 
          self.D_W = np.zeros([self.size_articles, self.size_M], dtype=np.float32)

          #assign randon values 
          self.Z_D =np.random.random(size=(self.k, self.size_articles)) 
          self.W_Z = np.random.random(size=(self.size_M, self.k))

          #probability 
          p_d= 1.0 / self.size_articles 
          #  for i in range(self.size_articles):
          #      self.P_D[i] = p_d
          self.P_D = np.full(self.size_articles,p_d,dtype=np.float32)

      def Log_likelihood(self, iteration): # method of calculating the log-likelihood
         Loglik = 0.0 
         for d in range(self.size_articles): 
            for w in range(self.size_M):
                for z in range(self.k):
                    Loglik = Loglik + self.D_W[d, w]*(np.log(self.Z_D[z,d] * self.W_Z[w, z]) 
                    if self.Z_D[z, d]* self.W_Z[w, z] != 0 else 0)

         print('ITERATION #', iteration,' :') 
         print('New Log likelihood:', Loglik)

      def Probabilities(self): # method to calculates probabilities (w/ d) and P(d / w) 
          for d in range(self.size_articles): 
              for w in range(self.size_M):
                  self.W_D[w, d] = np.sum(self.W_Z[w, :] *self.Z_D[:, d]) 
              normalize(self.W_D, norm='l1', axis=0, copy=False) #normalization 
              for w in range(self.size_M):
                  self.D_W[:, w] = self.P_D[:] * self.W_D[w, :]

              normalize(self.D_W, norm= 'l1', axis=0, copy=False) #normalization
              

      def E_step(self): #method of Estep 
         for w in range(self.size_M): 
              for d in range(self.size_articles):
                  self.Z_D_W[:, d, w] = self.W_Z[w,:] * self.Z_D[:, d] 
                  normalize(self.Z_D_W[:, :, w], norm='l1', axis=0 , copy=False) # normalization

      def M_step(self): # method of H step
          self.W_Z = np.zeros([self.size_M, self.k], dtype=np.float)# P(z/ d) 
          for z in range(self.k): 
            for w in range(self.size_M):
                self.W_Z[w, z] = np.sum(self.data_set[z][:self.k] *self.Z_D_W[z, :, w])
                #normalize(self.W_Z, norm='l1', axis=0, copy=False) # normalization
         
          for d in range(self.size_articles): # update P(z I d) 
              for z in range(self.k):
                   self.Z_D[z][d] = np.sum(self.Z_D_W[z, d, :self.data_set.shape[1]] *self.data_set[d][:]) 
                   self.Z_D[z][d] = self.Z_D[z][d] / np.sum(self.data_set[d])\
                        if np.sum(self.data_set[d]) != 0 \
                         else 0


In [None]:
def main(argv):
     plsa_cal = Plsa(df2[df2.columns[:500]][:200])
     plsa_cal.ini_matrix_prob()
     for i in range(6):
         plsa_cal.E_step()
         plsa_cal.M_step() 
         plsa_cal.Probabilities() 
         plsa_cal.Log_likelihood(i)



if __name__ == "__main__":
  main(sys.argv)

ITERATION # 0  :
New Log likelihood: -1553168.1866551286
ITERATION # 1  :
New Log likelihood: -903019.0699608877
ITERATION # 2  :
New Log likelihood: -1418368.9453970077
ITERATION # 3  :
New Log likelihood: -2442773.7445479087
ITERATION # 4  :
New Log likelihood: -4502577.125588146
ITERATION # 5  :
New Log likelihood: -8783554.44859244
