Skip to content

Commit

Permalink
Merge pull request aneesha#9 from drkatnz/master
Browse files Browse the repository at this point in the history
Added setup.py to make module easily installable
  • Loading branch information
zelandiya committed Mar 2, 2018
2 parents b578b0b + eccade6 commit 14e153a
Show file tree
Hide file tree
Showing 12 changed files with 50 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ __pycache__/

# Distribution / packaging
.Python
.DS_STORE
env/
bin/
build/
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include data/stoplists/*
28 changes: 26 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,30 @@
RAKE
====
# RAKE
---

A Python implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm as described in: Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic Keyword Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.), Text Mining: Applications and Theory: John Wiley & Sons.

The source code is released under the MIT License.

## Installing rake

To install rake as a package, run:

`python setup.py install`

## Example use

```python
from nlp_rake import rake

stoppath = 'data/stoplists/SmartStoplist.txt'

rake_object = rake.Rake(stoppath, 5, 3, 4)

sample_file = open("data/docs/fao_test/w2167e.txt", 'r', encoding="iso-8859-1")
text = sample_file.read()

keywords = rake_object.run(text)

# print results
print("Keywords:", keywords)
```
1 change: 1 addition & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
import rake
Binary file added data/stoplists/.DS_Store
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
10 changes: 5 additions & 5 deletions rake.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.
#
Expand All @@ -25,7 +25,7 @@
from collections import Counter

debug = False
test = True
test = False


def is_number(s):
Expand Down Expand Up @@ -270,13 +270,13 @@ def run(self, text):
return sorted_keywords


if test:
if test and __name__ == '__main__':
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

# Split text into sentences
sentenceList = split_sentences(text)
# stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
stoppath = "SmartStoplist.txt" # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
stoppath = "data/stoplists/SmartStoplist.txt" # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
stopwordpattern = build_stop_word_regex(stoppath)

# generate candidate keywords
Expand All @@ -296,6 +296,6 @@ def run(self, text):
if debug: print(totalKeywords)
print(sortedKeywords[0:(totalKeywords // 3)])

rake = Rake("SmartStoplist.txt")
rake = Rake("data/stoplists/SmartStoplist.txt")
keywords = rake.run(text)
print(keywords)
2 changes: 1 addition & 1 deletion rake_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import io

# EXAMPLE ONE - SIMPLE
stoppath = "SmartStoplist.txt"
stoppath = "data/stoplists/SmartStoplist.txt"

# 1. initialize RAKE by providing a path to a stopwords file
rake_object = rake.Rake(stoppath, 5, 3, 4)
Expand Down
15 changes: 15 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env python
"""Packaging script that installs this repository as the ``nlp_rake`` package.

The repository root itself is mapped to the ``nlp_rake`` package, and the
stoplist files under ``data/stoplists/`` are shipped as package data.
"""

from setuptools import setup

setup(
    name='nlp_rake',
    version='1.0',
    description='Rapid Automatic Keyword Extraction (RAKE) algorithm',
    long_description=(
        'A Python implementation of the Rapid Automatic Keyword Extraction '
        '(RAKE) algorithm as described in: Rose, S., Engel, D., Cramer, N., '
        '& Cowley, W. (2010). Automatic Keyword Extraction from Individual '
        'Documents. In M. W. Berry & J. Kogan (Eds.), Text Mining: '
        'Applications and Theory: John Wiley & Sons.'
    ),
    author='zelandiya, aneesha',
    # NOTE(review): this looks like the placeholder URL from the distutils
    # documentation example -- confirm and replace with the project's URL.
    url='https://www.python.org/sigs/distutils-sig/',
    packages=['nlp_rake'],
    # The package sources live in the repository root, not in a subdirectory.
    package_dir={'nlp_rake': './'},
    # package_data entries are file glob patterns relative to the package
    # directory; a bare directory name such as 'data/' selects no files.
    # This pattern mirrors the one declared in MANIFEST.in.
    package_data={'nlp_rake': ['data/stoplists/*']},
    include_package_data=True,
)

0 comments on commit 14e153a

Please sign in to comment.