In [1]:
#Import essential tools.
import os
import re
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

#Generator that tokenizes each sentence in a given list into a list of words (punctuation is dropped).
def sent_to_words(sentences):
    """Lazily tokenize every item of *sentences*.

    Each item is coerced with ``str()`` and passed to gensim's
    ``simple_preprocess``; one token list is yielded per item, in input
    order. ``deacc=True`` asks gensim to strip accent marks from the tokens
    (punctuation is already discarded by the tokenizer itself).
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

#Function removes stop_words from given list, returns filtered list.
def remove_stopwords(texts):
    """Return a copy of *texts* with every stop word filtered out.

    Each document is stringified and re-tokenized with ``simple_preprocess``;
    tokens found in the module-level ``stop_words`` list are dropped. The
    result holds one token list per input document, in the same order.
    """
    filtered_docs = []
    for doc in texts:
        kept_words = []
        for word in simple_preprocess(str(doc)):
            if word in stop_words:
                continue
            kept_words.append(word)
        filtered_docs.append(kept_words)
    return filtered_docs

#Build the stop_words filter used to drop uninformative words from the text:
#NLTK's English stop-word list plus a few extra terms appended at the end.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']
#Module-level set of keywords; LDAModel adds the keywords it finds to it.
keywords_set = set()

#LDAModel function: takes a CSV file and returns a list of keywords.
'''!!!!HARD-CODED-REQUIREMENT!!!! → CSV File MUST have 'Title' and 'Abstract' columns/fields.'''
def _clean_text(value):
    """Return str(value) lower-cased, with the characters , / . ! ? removed."""
    return re.sub(r'[,/\.!?]', '', str(value)).lower()


def _topn_words_by_topic(column, num_of_topics, topn_keywords):
    """Fit an LDA model on one text column and return its top words per topic.

    Args:
        column: Iterable of raw cell values (each is coerced to str).
        num_of_topics: Number of topics the LDA model is asked to find.
        topn_keywords: Number of words to keep for each topic.

    Returns:
        Dict mapping 'Topic_<i>' to the list of words gensim reports for
        topic i via ``show_topic(i, topn=topn_keywords)``.
    """
    cleaned = [_clean_text(value) for value in column]
    tokens = remove_stopwords(list(sent_to_words(cleaned)))

    #Dictionary assigns an id to every token; doc2bow turns each document
    #into its bag-of-words representation over those ids.
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc) for doc in tokens]

    lda = gensim.models.LdaModel(corpus=corpus, id2word=dictionary,
                                 num_topics=num_of_topics)
    return {
        'Topic_' + str(topic_id): [
            word for word, _ in lda.show_topic(topic_id, topn=topn_keywords)
        ]
        for topic_id in range(lda.num_topics)
    }


def LDAModel(CSV_FILE, num_of_topics, topn_keywords):
    """Extract topic keywords from the 'Title' and 'Abstract' columns of a CSV.

    One LDA model is trained on the titles and another on the abstracts; the
    top words of every topic from both models are merged into one keyword
    list. The per-topic title keywords are printed as a side effect.

    Args:
        CSV_FILE: Path of the CSV file to read. It MUST contain 'Title' and
            'Abstract' columns.
        num_of_topics: How many topics each LDA model sets for itself.
        topn_keywords: How many words are taken from each topic.

    Returns:
        Sorted list of the distinct keywords found for THIS file only.

    Raises:
        ValueError: If the 'Title' or the 'Abstract' column is missing.
    """
    #Read in dataset from CSV file.
    dataframe = pd.read_csv(CSV_FILE, encoding='unicode_escape')

    #Check each required column on its own. The previous test
    #("'Titles' and 'Abstract' not in ...") only ever looked at 'Abstract'.
    missing = [name for name in ('Title', 'Abstract')
               if name not in dataframe.columns]
    if missing:
        raise ValueError("CSV file must have 'Title' and 'Abstract' columns; "
                         "missing: " + ", ".join(missing))

    #Titles are modelled first, then abstracts (same order as before).
    topn_title_words = _topn_words_by_topic(
        dataframe['Title'], num_of_topics, topn_keywords)
    topn_abstract_words = _topn_words_by_topic(
        dataframe['Abstract'], num_of_topics, topn_keywords)

    #Collect into a set local to this call, so the result no longer carries
    #keywords left over from CSV files processed by earlier calls.
    keywords = set()
    for topics in (topn_title_words, topn_abstract_words):
        for words in topics.values():
            keywords.update(words)

    #Print clusters found in LDA.
    print(topn_title_words)

    #The module-level keywords_set keeps accumulating across calls for any
    #code that still reads it; only the return value is per-file.
    keywords_set.update(keywords)

    #Return list by converting set (sorted for a deterministic order).
    return sorted(keywords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
#Output topic keywords to textfile.
#NOTE: the snippet below is disabled by wrapping it in a string literal, so no
#file is written; the string is only an expression statement in this cell.
'''with open('CICI_Keywords.txt','w') as tfile:
	tfile.write('\n'.join(x))'''

"with open('CICI_Keywords.txt','w') as tfile:\n\ttfile.write('\n'.join(x))"

In [None]:
#Scan the working directory for CSV files, run LDAModel on each dataset, and
#write the keywords to "<file name without extension>_keywords.txt".
for entry in os.listdir(os.getcwd()):
    stem, extension = os.path.splitext(entry)
    #Only regular files whose extension is exactly '.csv' are processed
    #(a substring test would also pick up names like 'data.csv.bak').
    if extension != '.csv' or not os.path.isfile(entry):
        continue
    print(entry)
    keywords = LDAModel(entry, 10, 5)
    #splitext removes just the extension. str.strip('.csv') treats its
    #argument as a set of characters and also eats leading/trailing
    #'.', 'c', 's' and 'v' of the name itself ('scores.csv' -> 'ore').
    with open(stem + '_keywords.txt', 'w', encoding='utf-8') as tfile:
        tfile.write('\n'.join(keywords))