Lots of bug fixes and upgrades to Julia 0.2
johnmyleswhite committed May 24, 2013
1 parent 6082f8a commit f022c37
Showing 6 changed files with 70 additions and 20 deletions.
README.md (3 changes: 1 addition & 2 deletions)
@@ -458,5 +458,4 @@ as part of the State of the Union Address tradition.

T = tf_idf(D)

-cl = k_means(D, 5)
-cl = k_means(T, 5)
+cl = kmeans(T, 5)
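
The README now clusters the tf-idf matrix with kmeans rather than k_means, presumably tracking the rename in the clustering code used alongside Julia 0.2. A minimal sketch of the updated call, assuming the kmeans in scope comes from Clustering.jl and that T is the tf-idf matrix built in the preceding README steps (Clustering.jl may expect observations as columns, so a transpose could be needed):

    using TextAnalysis, Clustering
    T = tf_idf(D)        # D is the document-term matrix from earlier in the README example
    cl = kmeans(T, 5)    # group the State of the Union addresses into 5 clusters
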
src/TextAnalysis.jl (1 change: 1 addition & 0 deletions)
@@ -24,6 +24,7 @@ module TextAnalysis
export remove_corrupt_utf8!
export remove_punctuation, remove_numbers, remove_case, remove_whitespace
export remove_punctuation!, remove_numbers!, remove_case!, remove_whitespace!
+export remove_nonletters, remove_nonletters!
export remove_words, remove_stop_words, remove_articles
export remove_words!, remove_stop_words!, remove_articles!
export remove_definite_articles, remove_indefinite_articles
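
With this export in place, the non-letter helpers defined later in src/preprocessing.jl become part of the package's public surface. A minimal sketch of the string-level call (the sample input is made up, and the exact output depends on how the regex defined below is interpreted):

    using TextAnalysis
    remove_nonletters("Julia 0.2!")   # strips characters that are neither letters nor whitespace
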
src/corpus.jl (4 changes: 2 additions & 2 deletions)
@@ -51,7 +51,7 @@ function DirectoryCorpus(dirname::String)
push!(docs, FileDocument(abspath(filename)))
end
if isdir(filename) && !islink(filename)
-add_files(filename, f)
+add_files(filename)
end
end
cd(starting_dir)
@@ -198,7 +198,7 @@ function update_inverse_index!(crps::Corpus)
doc = crps[i]
ngs = ngrams(doc)
for ngram in keys(ngs)
-if has(crps.inverse_index, ngram)
+if haskey(crps.inverse_index, ngram)
push!(crps.inverse_index[ngram], i)
else
crps.inverse_index[ngram] = [i]
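
Both corpus.jl changes are straightforward Julia 0.2 upgrades: the extra second argument to the recursive add_files call is dropped, and the deprecated has(dict, key) becomes haskey(dict, key). As a rough sketch of how the rebuilt inverse index is meant to be used (the directory name is hypothetical, and this assumes DirectoryCorpus and update_inverse_index! are exported as elsewhere in the package):

    crps = DirectoryCorpus("speeches")     # read every file under a hypothetical directory
    update_inverse_index!(crps)            # crps.inverse_index maps each n-gram to the indices of documents containing it
    haskey(crps.inverse_index, "union")    # true once some document contains the token "union"
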
src/document.jl (8 changes: 6 additions & 2 deletions)
@@ -5,7 +5,7 @@
##############################################################################

type DocumentMetadata
-language::AbstractKind
+language::DataType
name::UTF8String
author::UTF8String
timestamp::UTF8String
@@ -33,7 +33,11 @@ type FileDocument <: AbstractDocument
filename::UTF8String
metadata::DocumentMetadata
end
-FileDocument(f::String) = FileDocument(utf8(f), DocumentMetadata())
+function FileDocument(f::String)
+d = FileDocument(utf8(f), DocumentMetadata())
+d.metadata.name = f
+return d
+end

##############################################################################
#
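
DocumentMetadata now stores the document language as a DataType rather than the old AbstractKind, and FileDocument gains a proper constructor that records the source path as the document's name. A minimal sketch (the path is made up):

    d = FileDocument("sotu/1961-Kennedy.txt")   # hypothetical file on disk
    d.metadata.name                             # now holds the path instead of the unset default
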
src/dtm.jl (6 changes: 3 additions & 3 deletions)
@@ -18,7 +18,7 @@ end

function DocumentTermMatrix(crps::Corpus)
lex = lexicon(crps)
-terms = sort(keys(lex))
+terms = sort(collect(keys(lex)))
column_indices = Dict{UTF8String, Int}()
for i in 1:length(terms)
term = terms[i]
@@ -84,14 +84,14 @@ function dtm_entries(d::AbstractDocument, lex::Dict{UTF8String, Int})
ngs = ngrams(d)
indices = Array(Int, 0)
values = Array(Int, 0)
-terms = sort(keys(lex))
+terms = sort(collect(keys(lex)))
column_indices = Dict{UTF8String, Int}()
for i in 1:length(terms)
term = terms[i]
column_indices[term] = i
end
for ngram in keys(ngs)
-if has(column_indices, ngram)
+if haskey(column_indices, ngram)
push!(indices, column_indices[ngram])
push!(values, ngs[ngram])
end
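
Both dtm.jl edits wrap keys(lex) in collect because, as of Julia 0.2, keys on a Dict returns an iterator rather than an Array, so it has to be materialized before sorting. A throwaway illustration (the lexicon contents are invented):

    lex = Dict{UTF8String, Int}()
    lex[utf8("the")] = 100
    lex[utf8("union")] = 12
    terms = sort(collect(keys(lex)))   # collect(...) turns the key iterator into an Array that sort accepts
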
src/preprocessing.jl (68 changes: 57 additions & 11 deletions)
@@ -5,7 +5,7 @@
##############################################################################

function remove_corrupt_utf8(s::String)
-r = Array(Char, length(s))
+r = Array(Char, endof(s))
i = 0
for chr in s
i += 1
@@ -34,7 +34,7 @@ function remove_corrupt_utf8!(d::NGramDocument)
for token in keys(d.ngrams)
new_token = remove_corrupt_utf8(token)
if new_token != token
-if has(d.ngrams, new_token)
+if haskey(d.ngrams, new_token)
d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
else
d.ngrams[new_token] = d.ngrams[token]
@@ -50,7 +50,9 @@ end
#
##############################################################################

remove_whitespace(s::String) = replace(s, r"\s+", " ")
const WHITESPACE_REGEX = Regex("\s+", 0)
#remove_whitespace(s::String) = replace(s, r"\s+", " ")
remove_whitespace(s::String) = replace(s, WHITESPACE_REGEX, " ")

function remove_whitespace!(d::FileDocument)
error("FileDocument's cannot be modified")
@@ -70,7 +72,7 @@ function remove_whitespace!(d::NGramDocument)
for token in keys(d.ngrams)
new_token = remove_whitespace(token)
if new_token != token
-if has(d.ngrams, new_token)
+if haskey(d.ngrams, new_token)
d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
else
d.ngrams[new_token] = d.ngrams[token]
@@ -86,7 +88,9 @@ end
#
##############################################################################

remove_punctuation(s::String) = replace(s, r"[,;:.!?()]+", "")
const PUNCTUATION_REGEX = Regex("[,;:.!?()]+", 0)
#remove_punctuation(s::String) = replace(s, r"[,;:.!?()]+", "")
remove_punctuation(s::String) = replace(s, PUNCTUATION_REGEX, "")

function remove_punctuation!(d::FileDocument)
error("FileDocument's cannot be modified")
@@ -106,7 +110,45 @@ function remove_punctuation!(d::NGramDocument)
for token in keys(d.ngrams)
new_token = remove_punctuation(token)
if new_token != token
-if has(d.ngrams, new_token)
+if haskey(d.ngrams, new_token)
d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
else
d.ngrams[new_token] = d.ngrams[token]
+end
+delete!(d.ngrams, token)
+end
+end
+end
+
+##############################################################################
+#
+# Remove non-letters
+#
+##############################################################################
+
+const NONLETTER_REGEX = Regex("[^a-zA-Z\s]", 0)
+#remove_nonletters(s::String) = replace(s, r"[^a-zA-Z]", "")
+remove_nonletters(s::String) = replace(s, NONLETTER_REGEX, "")
+
+function remove_nonletters!(d::FileDocument)
+error("FileDocument's cannot be modified")
+end
+
+function remove_nonletters!(d::StringDocument)
+d.text = remove_nonletters(d.text)
+end
+
+function remove_nonletters!(d::TokenDocument)
+for i in 1:length(d.tokens)
+d.tokens[i] = remove_nonletters(d.tokens[i])
+end
+end
+
+function remove_nonletters!(d::NGramDocument)
+for token in keys(d.ngrams)
+new_token = remove_nonletters(token)
+if new_token != token
+if haskey(d.ngrams, new_token)
+d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
+else
+d.ngrams[new_token] = d.ngrams[token]
@@ -142,7 +184,7 @@ function remove_case!(d::NGramDocument)
for token in keys(d.ngrams)
new_token = remove_case(token)
if new_token != token
-if has(d.ngrams, new_token)
+if haskey(d.ngrams, new_token)
d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
else
d.ngrams[new_token] = d.ngrams[token]
@@ -161,7 +203,9 @@ end
#
##############################################################################

-remove_numbers(s::String) = replace(s, r"\d", "")
+const NUMBER_REGEX = Regex("\d+", 0)
+#remove_numbers(s::String) = replace(s, r"\d+", "")
+remove_numbers(s::String) = replace(s, NUMBER_REGEX, "")

function remove_numbers!(d::FileDocument)
error("FileDocument's cannot be modified")
@@ -181,7 +225,7 @@ function remove_numbers!(d::NGramDocument)
for token in keys(d.ngrams)
new_token = remove_numbers(token)
if new_token != token
-if has(d.ngrams, new_token)
+if haskey(d.ngrams, new_token)
d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
else
d.ngrams[new_token] = d.ngrams[token]
@@ -199,7 +243,8 @@ end

function remove_words{T <: String}(s::String, words::Vector{T})
for word in words
-s = replace(s, Regex(strcat("\\b", word, "\\b")), " ")
+#s = replace(s, Regex(strcat("\\b", word, "\\b")), " ")
+s = replace(s, Regex(string("\\b", word, "\\b"), 0), " ")
end
return s
end
@@ -222,7 +267,7 @@ function remove_words!{T <: String}(d::NGramDocument, words::Vector{T})
for token in keys(d.ngrams)
new_token = remove_words(token, words)
if new_token != token
-if has(d.ngrams, new_token)
+if haskey(d.ngrams, new_token)
d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
else
d.ngrams[new_token] = d.ngrams[token]
@@ -290,6 +335,7 @@ for f in (:remove_whitespace!,
:remove_punctuation!,
:remove_case!,
:remove_numbers!,
+:remove_nonletters!,
:stem!,
:tag_pos!,
:remove_articles!,
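
The preprocessing changes follow the same pattern throughout: has gives way to haskey, inline regex literals are hoisted into module-level Regex constants, strcat is replaced by string, and a remove_nonletters family is added and wired into the corpus-level method loop at the bottom of the file. A rough sketch of the document-level calls, assuming StringDocument wraps raw text as elsewhere in the package and that remove_case! normalizes case in place (the sample sentence is invented):

    doc = StringDocument("Senators and Representatives, 1961!")
    remove_case!(doc)          # normalize case in place
    remove_nonletters!(doc)    # drop digits and punctuation from doc.text, keeping letters and whitespace
    remove_whitespace!(doc)    # collapse runs of whitespace to single spaces
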
