Skip to content

Commit

Permalink
Minimal changes to work with 0.3
Browse files Browse the repository at this point in the history
  • Loading branch information
johnmyleswhite committed Aug 31, 2014
1 parent a7e55e4 commit d40a74e
Show file tree
Hide file tree
Showing 14 changed files with 374 additions and 263 deletions.
22 changes: 0 additions & 22 deletions run_tests.jl

This file was deleted.

63 changes: 36 additions & 27 deletions src/corpus.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,34 @@
#
##############################################################################

# TODO: Make this a parametric type?

# A mutable collection of documents together with corpus-level bookkeeping.
# The fields are filled positionally by the default constructor, so their
# order must match the five-argument Corpus(...) calls made elsewhere.
type Corpus
# The documents held by the corpus; indexing and iteration read this vector
documents::Vector{GenericDocument}
# Integer term total (presumably a running count of all terms -- TODO confirm)
total_terms::Int
# Term => Int table (presumably per-term counts -- TODO confirm)
lexicon::Dict{UTF8String, Int}
# Term => Vector{Int} table consulted by string indexing into the corpus
# (presumably the positions of the documents containing the term -- confirm)
inverse_index::Dict{UTF8String, Vector{Int}}
# Hash function object read by hash_function and replaced by hash_function!
h::TextHashFunction
end

# Build a Corpus around an existing vector of documents.
#
# The term total starts at 0, the lexicon and the inverse index start out
# empty, and a freshly constructed TextHashFunction is attached. The `docs`
# vector is stored as passed in.
function Corpus(docs::Vector{GenericDocument})
    return Corpus(
        docs,
        0,
        Dict{UTF8String, Int}(),
        Dict{UTF8String, Vector{Int}}(),
        TextHashFunction()
    )
end

# Build a Corpus from an untyped vector.
#
# The elements are first converted to a Vector{GenericDocument}; the
# conversion raises if an element is not one of the supported document
# types. All other fields start from the same defaults as the typed
# constructor: a zero term total, empty lexicon and inverse index, and a
# new TextHashFunction.
function Corpus(docs::Vector{Any})
    return Corpus(
        convert(Array{GenericDocument,1}, docs),
        0,
        Dict{UTF8String, Int}(),
        Dict{UTF8String, Vector{Int}}(),
        TextHashFunction()
    )
end

##############################################################################
Expand Down Expand Up @@ -69,15 +77,15 @@ end
##############################################################################

# Accessor for the corpus's document vector. The field itself is returned,
# not a copy.
function documents(c::Corpus)
    return c.documents
end
# The length of a corpus is the number of documents it holds. The method is
# attached to Base.length explicitly so that the generic `length` used on
# the right-hand side (and everywhere else) gains a Corpus method instead
# of being shadowed by a module-local function.
Base.length(crps::Corpus) = length(crps.documents)

##############################################################################
#
# Convert a Corpus to a DataFrame
#
##############################################################################

function convert(::Type{DataFrame}, crps::Corpus)
function Base.convert(::Type{DataFrame}, crps::Corpus)
df = DataFrame()
n = length(crps)
df["Language"] = DataArray(UTF8String, n)
Expand All @@ -104,48 +112,48 @@ end
#
##############################################################################

# Iteration protocol (start/next/done). The iteration state is the 1-based
# position of the next document to hand out:
#   start -> 1
#   next  -> (document at the current position, position + 1)
#   done  -> true once the position has moved past the last document
Base.start(crps::Corpus) = 1
Base.next(crps::Corpus, ind::Int) = (crps.documents[ind], ind + 1)
Base.done(crps::Corpus, ind::Int) = ind > length(crps.documents)

##############################################################################
#
# Treat a Corpus as a container
#
##############################################################################

# Each of these forwards to the same operation on the underlying document
# vector and returns whatever that call returns.

# Add / remove a document at the end of the corpus.
Base.push!(crps::Corpus, d::AbstractDocument) = push!(crps.documents, d)
Base.pop!(crps::Corpus) = pop!(crps.documents)

# Add / remove a document at the front of the corpus.
Base.unshift!(crps::Corpus, d::AbstractDocument) = unshift!(crps.documents, d)
Base.shift!(crps::Corpus) = shift!(crps.documents)

# Insert document `d` at position `index` of the corpus by forwarding to
# insert! on the underlying document vector; that call's result is returned.
function Base.insert!(crps::Corpus, index::Int, d::AbstractDocument)
    return insert!(crps.documents, index, d)
end
# Remove the document at position `index` by forwarding to delete! on the
# underlying document vector.
# NOTE(review): in Julia 0.3 positional removal from a Vector is spelled
# deleteat!/splice!; confirm that delete! still has a Vector method here.
Base.delete!(crps::Corpus, index::Integer) = delete!(crps.documents, index)

##############################################################################
#
# Indexing into a Corpus
#
#
# (a) Numeric indexing just provides the n-th document
# (b) String indexing is effectively a trivial search engine
#
##############################################################################

# Numeric forms index straight into the document vector: a single index
# yields one document, a vector of indices or a range yields the selected
# documents.
# NOTE(review): `Ranges` is the pre-0.3 name of the range supertype --
# confirm it is still the intended spelling for the targeted Julia version.
Base.getindex(crps::Corpus, ind::Real) = crps.documents[ind]
Base.getindex{T <: Real}(crps::Corpus, inds::Vector{T}) = crps.documents[inds]
Base.getindex(crps::Corpus, r::Ranges) = crps.documents[r]

# String form: look the term up in the inverse index. A term that is not in
# the index yields an empty Int[] rather than raising a KeyError.
Base.getindex(crps::Corpus, term::String) = get(crps.inverse_index, term, Int[])

##############################################################################
#
# Assignment into a Corpus
#
##############################################################################

# Replace the document stored at position `ind` with `d`
# (i.e. `crps[ind] = d`). Returns the document that was assigned.
function Base.setindex!(crps::Corpus, d::AbstractDocument, ind::Real)
    crps.documents[ind] = d
    return d
end
Expand Down Expand Up @@ -218,6 +226,7 @@ index_size(crps::Corpus) = length(keys(crps.inverse_index))
# Getter for the TextHashFunction currently attached to the corpus.
hash_function(crps::Corpus) = crps.h

# Setter: attach `f` as the corpus's TextHashFunction. The assigned
# function object is the return value.
function hash_function!(crps::Corpus, f::TextHashFunction)
    crps.h = f
    return f
end
Expand Down
51 changes: 36 additions & 15 deletions src/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@ type DocumentMetadata
author::UTF8String
timestamp::UTF8String
end

# Default metadata: English as the language plus placeholder strings for
# the remaining fields, ending with the author and the timestamp. Every
# string is stored as a UTF8String.
DocumentMetadata() = DocumentMetadata(
    EnglishLanguage,
    utf8("Unnamed Document"),
    utf8("Unknown Author"),
    utf8("Unknown Time")
)

##############################################################################
#
Expand All @@ -33,6 +36,7 @@ type FileDocument <: AbstractDocument
filename::UTF8String
metadata::DocumentMetadata
end

function FileDocument(f::String)
d = FileDocument(utf8(f), DocumentMetadata())
d.metadata.name = f
Expand All @@ -49,6 +53,7 @@ type StringDocument <: AbstractDocument
text::UTF8String
metadata::DocumentMetadata
end

# Wrap a piece of text in a StringDocument. The text is converted to UTF-8
# and paired with default metadata.
function StringDocument(txt::String)
    return StringDocument(utf8(txt), DocumentMetadata())
end

##############################################################################
Expand All @@ -61,13 +66,16 @@ type TokenDocument <: AbstractDocument
tokens::Vector{UTF8String}
metadata::DocumentMetadata
end

# Tokenize `txt` (converted to UTF-8) using the language recorded in `dm`
# and wrap the tokens, together with `dm` itself, in a TokenDocument.
function TokenDocument(txt::String, dm::DocumentMetadata)
    tkns = tokenize(dm.language, utf8(txt))
    return TokenDocument(tkns, dm)
end

# Tokenize `txt` (converted to UTF-8) as English text and wrap the tokens
# in a TokenDocument that carries default metadata.
function TokenDocument(txt::String)
    meta = DocumentMetadata()
    tkns = tokenize(EnglishLanguage, utf8(txt))
    return TokenDocument(tkns, meta)
end

function TokenDocument{T <: String}(tkns::Vector{T})
dm = DocumentMetadata()
TokenDocument(convert(Vector{UTF8String}, tkns), dm)
Expand All @@ -84,27 +92,32 @@ type NGramDocument <: AbstractDocument
n::Int
metadata::DocumentMetadata
end

# Build a unigram (n = 1) NGramDocument from raw text, using the language
# recorded in `dm` and attaching `dm` as the document's metadata.
function NGramDocument(txt::String, dm::DocumentMetadata)
    # The text has to be tokenized before it is n-gramized, exactly as the
    # other text-based NGramDocument constructors do; handing the raw
    # string to ngramize skips that step.
    tkns = tokenize(dm.language, utf8(txt))
    return NGramDocument(ngramize(dm.language, tkns, 1), 1, dm)
end

# Build an NGramDocument of complexity `n` from raw text with default
# metadata. Tokenization uses the default metadata's language; the n-grams
# themselves are produced with EnglishLanguage.
function NGramDocument(txt::String, n::Integer)
    dm = DocumentMetadata()
    tkns = tokenize(dm.language, utf8(txt))
    grams = ngramize(EnglishLanguage, tkns, n)
    return NGramDocument(grams, n, dm)
end

# Build a unigram (n = 1) NGramDocument from raw text with default
# metadata. Tokenization uses the default metadata's language; the n-grams
# themselves are produced with EnglishLanguage.
function NGramDocument(txt::String)
    dm = DocumentMetadata()
    tkns = tokenize(dm.language, utf8(txt))
    grams = ngramize(EnglishLanguage, tkns, 1)
    return NGramDocument(grams, 1, dm)
end

# Wrap an existing n-gram => Int table in an NGramDocument of complexity
# `n`. The table's keys are converted to UTF8String and default metadata
# is attached.
function NGramDocument{T <: String}(ng::Dict{T, Int}, n::Int)
    meta = DocumentMetadata()
    grams = convert(Dict{UTF8String, Int}, ng)
    return NGramDocument(grams, n, meta)
end

function NGramDocument{T <: String}(ng::Dict{T, Int})
dm = DocumentMetadata()
NGramDocument(convert(Dict{UTF8String, Int}, ng),
Expand Down Expand Up @@ -209,10 +222,11 @@ end
#
##############################################################################

# An NGramDocument only stores n-grams, so asking for its length is
# rejected with an error instead of returning a made-up number.
function Base.length(d::NGramDocument)
    error("NGramDocument's do not have a well-defined length")
end

# Every other document type reports the length of its text.
Base.length(d::AbstractDocument) = length(text(d))

##############################################################################
#
Expand All @@ -221,6 +235,7 @@ length(d::AbstractDocument) = length(text(d))
##############################################################################

# The n-gram complexity of an NGramDocument is the `n` stored on it.
function ngram_complexity(ngd::NGramDocument)
    return ngd.n
end

# Fallback for every other document type: always raises, naming the
# concrete type of the offending document in the message.
ngram_complexity(d::AbstractDocument) =
    error("$(typeof(d))'s have no n-gram complexity")
Expand All @@ -231,8 +246,12 @@ end
#
##############################################################################

# Union of the four concrete document types. It is used where a single
# element type covering "any document" is needed, e.g. the element type of
# a Corpus's document vector.
typealias GenericDocument Union(
    FileDocument,
    StringDocument,
    TokenDocument,
    NGramDocument
)

##############################################################################
#
Expand Down Expand Up @@ -262,33 +281,35 @@ end
#
##############################################################################

# FileDocument -> StringDocument: wrap the file document's text in a new
# StringDocument. The new document takes over the source's metadata object
# itself (it is assigned, not copied).
function Base.convert(::Type{StringDocument}, d::FileDocument)
    new_d = StringDocument(text(d))
    new_d.metadata = d.metadata
    return new_d
end

# FileDocument / StringDocument -> TokenDocument: build a TokenDocument
# from the source's tokens. The new document takes over the source's
# metadata object itself (it is assigned, not copied).
function Base.convert(
    ::Type{TokenDocument},
    d::Union(FileDocument, StringDocument)
)
    new_d = TokenDocument(tokens(d))
    new_d.metadata = d.metadata
    return new_d
end

# Converting a TokenDocument to its own type returns the same object.
Base.convert(::Type{TokenDocument}, d::TokenDocument) = d

# FileDocument / StringDocument / TokenDocument -> NGramDocument: build an
# NGramDocument from the source's n-grams. The new document takes over the
# source's metadata object itself (it is assigned, not copied).
function Base.convert(
    ::Type{NGramDocument},
    d::Union(FileDocument, StringDocument, TokenDocument)
)
    new_d = NGramDocument(ngrams(d))
    new_d.metadata = d.metadata
    return new_d
end

# Converting an NGramDocument to its own type returns the same object.
Base.convert(::Type{NGramDocument}, d::NGramDocument) = d

##############################################################################
#
# getindex() methods (formerly ref()): StringDocument("This is text and that is not")["is"]
#
##############################################################################

# Indexing a document with a term looks that term up in the document's
# n-gram table, i.e. `d[term]` is `ngrams(d)[term]`. No default is
# supplied, so a term missing from the table raises the table's own lookup
# error.
Base.getindex(d::AbstractDocument, term::String) = ngrams(d)[term]
Loading

0 comments on commit d40a74e

Please sign in to comment.