Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions SYMBOLS_MANIFEST.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
Compress`ImportZIP
HTML`DataImport
HTML`FullDataImport
HTML`HyperlinksImport
Expand All @@ -12,7 +13,7 @@ ImportExport`RegisterExport
ImportExport`RegisterImport
Internal`RealValuedNumberQ
Internal`RealValuedNumericQ
JSON`Import`JSONImport
JSON`ImportJSON
System`$Aborted
System`$Assumptions
System`$BaseDirectory
Expand Down Expand Up @@ -582,9 +583,7 @@ System`ImageTake
System`ImageType
System`Implies
System`Import
System`ImportJSON
System`ImportString
System`ImportZIP
System`In
System`Increment
System`Indeterminate
Expand Down
10 changes: 7 additions & 3 deletions mathics/SystemFiles/Formats/JSON/Import.wl
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
(* ::Package:: *)

(* JSON Javascript Object Notation or JSON web service description Importer.
This is used by Import[] and ImportString[].
*)

Begin["System`Convert`JSONDump`"]

(* JSON legacy element is Data even if Expression would be better. *)
$AvailableElements = {"Data", "Dataset"};

ImportExport`RegisterImport[
"JSON",
ImportJSON,
"JSON", (* WMA mime-type name *)
JSON`ImportJSON, (* Default Function name that handles this. *)
{},
"AvailableElements" -> $AvailableElements,
"AvailableElements" -> $AvailableElements, (* names returned by "Elements" query *)
"FunctionChannels" -> {"FileNames"},
"DefaultElement" -> "Data"
]
Expand Down
10 changes: 5 additions & 5 deletions mathics/SystemFiles/Formats/ZIP/Import.wl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
(* ::Package:: *)

(* ZIP compressed file and file archive Importer.
This is used by Import[].
(* Windows ZIP archive, ZIP compressed file and file archive Importer.
This is used by Import[] and ImportString[].
*)

Begin["System`Convert`CommonArchiveDump`"]
Expand All @@ -21,13 +21,13 @@ GetElements[___] :=
];

ImportExport`RegisterImport[
"ZIP",
ImportZIP,
"ZIP", (* WMA mime-type name *)
Compress`ImportZIP, (* Default Function name that handles this. *)
{}, (* Post importer function(s) *)
FunctionChannels -> {"FileNames"},
(* WMA has this, but I (rocky) am not sure why or what it means:
AvailableElements -> $ZIPAvailableElements, *)
AvailableElements -> {"Filenames", "Summary"},
AvailableElements -> {"Filenames", "Summary"}, (* names returned by "Elements" query *)
BinaryFormat -> True,
DefaultElement -> "FileNames",
HiddenElements -> $ZIPHiddenElements,
Expand Down
42 changes: 36 additions & 6 deletions mathics/builtin/fileformats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,40 @@
"""
File Formats
r"""Import/Export File Formats, Importers and Exporters

The data in files on a filesystem or retrieved from the Internet is often structured \
according to specific structures and rules. For example, consider the different kinds of \
structuring used in a JSON file versus an HTML file or a compressed GZIP file.

In some cases, such as archive files, e.g., ZIP, TAR, and JAR, the file contains component parts, \
which in WMA terminology are called "members"; these are part of the broader set of metadata items \
called "elements".

A MIME type is typically associated with each kind of format. \Mathics3, following WMA, \
uses a shortened name for this MIME type. For example \Mathics3 uses "HTML" as a shorthand \
for the MIME type "text/html".

Below is a list of supported file types for which we have built-in importers or exporters written \
in Python. Other importers, however, are written in \Mathics3.

Variable <url>
:\$ExportFormats:
/doc/reference-of-built-in-symbols/inputoutput-files-and-filesystem/importing-and-exporting/\$exportformats</url> \
contains a list of file formats that are supported by <url>
:Export:
/doc/reference-of-built-in-symbols/inputoutput-files-and-filesystem/importing-and-exporting/export</url>, \
while <url>
:\$ImportFormats:
/doc/reference-of-built-in-symbols/inputoutput-files-and-filesystem/importing-and-exporting/\$importformats</url> \
does the corresponding thing for <url>
:Import:
/doc/reference-of-built-in-symbols/inputoutput-files-and-filesystem/importing-and-exporting/import</url>.

Built-in Importers.
Many Import/Export functions are registered in SystemFiles/Formats/*.wl which is \
autoloaded on startup.

The Built-in Functions are defined in a separate context.
For example, HTML` or Compress`. This is done to not pollute the System` namespace.
"""

# The Built-in Functions are defined in a separate context under the
# System`. For example System`HTML` and System`XML. This is done to not
# pollute the System` namespace.
# This tells documentation how to sort this module
# Here we are also hiding "file_io" since this can erroneously appear at the top level.
sort_order = "mathics.builtin.importing-export-file-formats"
32 changes: 32 additions & 0 deletions mathics/builtin/fileformats/compression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Compression & Archive Formats
"""

from mathics.core.builtin import Builtin, String
from mathics.core.evaluation import Evaluation
from mathics.eval.fileformats.compression import eval_ImportZIP

# See comment in __init__.py regarding the whacky way this gets called


class ImportZIP(Builtin):
    """
    <url>:WMA link:https://reference.wolfram.com/language/ref/format/ZIP.html</url>

    <dl>
    <dt>'Compress`ImportZIP[path]'
    <dd>Run zip for archive file $path$
    </dl>

    """

    # This builtin is defined in the Compress` context rather than System`.
    context = "Compress`"
    summary_text = "import a ZIP file"

    # NOTE(review): the one-line docstrings of the eval methods below appear to
    # be the Mathics3 patterns these methods are dispatched on -- confirm before
    # rewording them.

    # One-argument form: hand the path straight to eval_ImportZIP() without
    # any element selection.
    def eval(self, path: String, evaluation: Evaluation):
        "Compress`ImportZIP[path_String]"
        return eval_ImportZIP(path, evaluation)

    # Two-argument form: same as eval(), but forwards "elements" as the third
    # argument of eval_ImportZIP().
    def eval_with_elements(self, path: String, elements, evaluation: Evaluation):
        "Compress`ImportZIP[path_String, elements_]"
        return eval_ImportZIP(path, evaluation, elements)
29 changes: 24 additions & 5 deletions mathics/builtin/fileformats/htmlformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
HTML

Basic implementation for a HTML importer.
HTML importer.
"""


Expand All @@ -15,6 +15,7 @@
from mathics.core.builtin import Builtin, MessageException
from mathics.core.convert.expression import to_expression, to_mathics_list
from mathics.core.convert.python import from_python
from mathics.core.evaluation import Evaluation
from mathics.core.expression import Expression
from mathics.core.list import ListExpression
from mathics.core.symbols import Symbol
Expand Down Expand Up @@ -126,7 +127,7 @@ class _TagImport(_HTMLBuiltin):
def _import(self, tree):
raise NotImplementedError

def eval(self, text, evaluation):
def eval(self, text: String, evaluation: Evaluation):
"""%(name)s[text_String]"""
tree = parse_html(parse_html_file, text, evaluation)
if isinstance(tree, Symbol): # $Failed?
Expand All @@ -135,6 +136,12 @@ def eval(self, text, evaluation):
to_expression(SymbolRule, self.tag_name, self._import(tree))
)

def eval_with_element(self, text, element, evaluation: Evaluation):
    """%(name)s[text_String, element_]"""
    # Placeholder for the two-argument form: "element" is accepted but ignored,
    # so the result is exactly what eval() returns for the same text.
    # FIXME: right now we aren't using element, and should use this to more
    # efficiently extract part of the HTML file that we want.
    return self.eval(text, evaluation)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mmatera Is there a better way to combine this with the def eval() above?

Even if that is the case, it might be useful to have this broken out as a stub for when this is revised to be able to handle the element passed.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not using the parameter element. In a proper implementation, this function should be more general than self.eval. Also, parse_html should have an extra attribute to filter a given element.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not using the parameter element.

That is exactly what FIXME says.

In a proper implementation, this function should be more general than self.eval. Also, parse_html should have an extra attribute to filter a given element.

Yep. Revising HTML and XML is left for later. I will be happy when we are able to "Import" and extract a JSON file from a ZIP import which is needed for being able to install paclets from the public paclet server.

This is the main reason why any work on this is currently being done.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, but the output with the second element produce something different.

In[8]:= Import["http://www2.fisica.unlp.edu.ar/index.html",{"Elements"}]        

Out[8]= {Data, FullData, Hyperlinks, ImageLinks, Images, Plaintext, Source, 
 
>    Title, XMLObject}

In[9]:= Import["http://www2.fisica.unlp.edu.ar/index.html",{"Title"}]           

Out[9]= Dto. de Fisica | Facultad de Cs. Exactas | UNLP

In[10]:= Import["http://www2.fisica.unlp.edu.ar/index.html",{"FullData"}]       

Out[10]= {{{}, {}, {}}, {{}, {}, {}, {}}, 
 
>    {{}, {{ }, { CC 67, 1900 La Plata, Argentina - Tel: +54-221-4246062 Fax:\
 
>        +54-221-4252006 - secre@fisica.unlp.edu.ar }}}, {{}}, 
 
>    {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}}, {}}


In[11]:= Import["http://www2.fisica.unlp.edu.ar/index.html",{"Plaintext"}]      

Out[11]= Departamento de Física
           Facultad de Ciencias Exactas ,  Universidad Nacional de La Plata  
            
             
              CC 67, 1900 La Plata, Argentina - Tel: +54-221-4246062
          Fax: +54-221-4252006 - secre@fisica.unlp.edu.ar    
           Ciclo anual de charlas para alumnos  
           Ingreso a Webmail
           AFA Filial La Plata
           Biblioteca y Hemeroteca
          Museo de Física
           Instituto de Física La Plata

In[12]:= Import["http://www2.fisica.unlp.edu.ar/index.html",{"XMLObject"}]      

Out[12]= XMLObject[Document][{XMLObject[Declaration][Version -> 1.0, 
 
>      Standalone -> yes]}, XMLElement[html, 
 
>     {{http://www.w3.org/2000/xmlns/, xmlns} -> 
 
>       http://www.w3.org/1999/xhtml}, 
 
>     {XMLElement[head, {}, {XMLElement[link, 
 
>         {rel -> stylesheet, href -> dropdown.css, type -> text/css}, {}], 
 
>        XMLElement[script, {language -> JavaScript1.2, src -> menu_data.js}, 
 
>         {}], XMLElement[title, {}, 
 
>         {Dto. de Fisica | Facultad de Cs. Exactas | UNLP}]}], 
 
>      XMLElement[body, {link -> #428266, vlink -> #428266, 
 
>        bgcolor -> #ffffff}, {, 

 
>        XMLElement[font, {face -> verdana,arial,helvetica, size -> -1}, 
 
>         {, XMLElement[font, {size -> +2}, {Departamento de Física}], 


 
>          XMLElement[br, {clear -> none}, {}], , 

 
>          XMLElement[a, {shape -> rect, 
 
>            href -> http://www.exactas.unlp.edu.ar}, 
 
>           {Facultad de Ciencias Exactas}], ,, 

 
>          XMLElement[a, {shape -> rect, href -> http://www.unlp.edu.ar}, 
 
>           {Universidad Nacional de La Plata}], }], 

 
>        XMLElement[p, {}, {, XMLElement[script, 


 
>           {language -> JavaScript1.2, src -> menu_script.js}, {}], , 


 
>          XMLElement[table, {border -> 0}, 
 
>           {XMLElement[tr, {}, 
 
>             {XMLElement[td, {colspan -> 1, rowspan -> 1}, 
 
>               { , XMLElement[img, {src -> fisica.jpg}, {}],  }]}], 
                                                               
 
>            XMLElement[tr, {}, 
 
>             {XMLElement[td, {colspan -> 1, rowspan -> 1}, 
 
>               {, XMLElement[center, {}, 

 
>                 {, XMLElement[font, {size -> -3}, 

 
>                   {                                                      , 
                     CC 67, 1900 La Plata, Argentina - Tel: +54-221-4246062
 
>                    XMLElement[br, {clear -> none}, {}], 
 
>                                           , 
                     Fax: +54-221-4252006 - 
 
>                    XMLElement[a, 
 
>                     {shape -> rect, 
 
>                      href -> mailto:secre@fisica.unlp.edu.ar}, 
 
>                     {secre@fisica.unlp.edu.ar}], }]}]}]}]}], }], 


 
>        XMLElement[p, {}, {, XMLElement[a, 

 
>           {shape -> rect, href -> semin/alumnos}, 
 
>           {Ciclo anual de charlas para alumnos}], }], 


 
>        XMLElement[p, {}, {, XMLElement[a, 

 
>           {shape -> rect, href -> http://mail.fisica.unlp.edu.ar}, 
 
>           {Ingreso a Webmail}],  , XMLElement[br, {clear -> none}, {}], , 

 
>          XMLElement[a, {shape -> rect, 
 
>            href -> http://www2.fisica.unlp.edu.ar/filial/}, 
 
>           {AFA Filial La Plata}],  , XMLElement[br, {clear -> none}, {}], , 

 
>          XMLElement[a, {shape -> rect, 
 
>            href -> http://biblio.fisica.unlp.edu.ar/}, 
 
>           {Biblioteca y Hemeroteca}],  , 
 
>          XMLElement[br, {clear -> none}, {}],  , 

 
>          XMLElement[a, {shape -> rect, 
 
>            href -> http://museofisica.exactas.unlp.edu.ar/}, 
 
>           {Museo de Física}], XMLElement[br, {clear -> none}, {}], , 

 
>          XMLElement[a, {shape -> rect, 
 
>            href -> http://iflp.fisica.unlp.edu.ar/}, 
 
>           {Instituto de Física La Plata}], 
 
>          XMLElement[br, {clear -> none}, {}], }], XMLElement[p, {}, {}]}]}], 
 
>    {}]

In the Mathics3 master branch, this seems to work:

In[1]:= Import["http://www2.fisica.unlp.edu.ar/index.html",{"Title"}]
Out[1]= "Dto. de Fisica | Facultad de Cs. Exactas | UNLP"

In[2]:= Import["http://www2.fisica.unlp.edu.ar/index.html",{"Plaintext"}]
Out[2]= "Dto. de Fisica | Facultad de Cs. Exactas | UNLP
        Departamento de Física
        Facultad de Ciencias Exactas
        ,
        Universidad Nacional de La Plata
        CC 67, 1900 La Plata, Argentina - Tel: +54-221-4246062
        Fax: +54-221-4252006 -
        secre@fisica.unlp.edu.ar
        Ciclo anual de charlas para alumnos
        Ingreso a Webmail
        AFA Filial La Plata
        Biblioteca y Hemeroteca
        Museo de Física
        Instituto de Física La Plata"

So I do not see what this new method provides.

@rocky rocky Jun 27, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, but the output with the second element produce something different.

I don't see that. Below, I see lots of output, and I believe this is the same that you'd get from Wolframscript. The formatting of the text is different, but that's to be expected until we match up StandardForm output better.

If there is a specific difference, exactly what's different?

(Please try to give a small example of a difference.)

So I do not see what this new method provides.

It is a placeholder function (that indicates FIXME) and it is there to indicate that it should be filled out to remove gross inefficiency that can arise by reading in lots of stuff and then throwing away or filtering most of it.

Instead, that code should be filled out to pass information to other eval routines that handle element retrieval in a better way.



class _Get(_HTMLBuiltin):
context = "HTML`Parser`"
Expand Down Expand Up @@ -401,7 +408,7 @@ class SourceImport(_HTMLBuiltin):

summary_text = "import source code from a HTML file"

def eval(self, text, evaluation):
def eval(self, text, evaluation: Evaluation):
"""%(name)s[text_String]"""

def source(filename):
Expand All @@ -412,6 +419,12 @@ def source(filename):

return parse_html(source, text, evaluation)

def eval_with_element(self, text, element, evaluation: Evaluation):

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"""%(name)s[text_String, element_]"""
# FIXME: right now we aren't using element, and should use this to more
# efficiently extract part of the HTML file that we want.
return self.eval(text, evaluation)


class TitleImport(_TagImport):
"""
Expand All @@ -437,7 +450,7 @@ def _import(self, tree):

class XMLObjectImport(_HTMLBuiltin):
"""
## <url>:native internal:</url>
<url>:WMA link:https://reference.wolfram.com/language/ref/XMLObject.html</url>

<dl>
<dt>'HTML`XMLObjectImport["filename"]'
Expand All @@ -450,7 +463,13 @@ class XMLObjectImport(_HTMLBuiltin):

summary_text = "import XML objects from a HTML file"

def eval(self, text, evaluation):
def eval(self, text, evaluation: Evaluation):
"""%(name)s[text_String]"""
xml = to_expression("HTML`Parser`HTMLGet", text).evaluate(evaluation)
return ListExpression(Expression(SymbolRule, String("XMLObject"), xml))

def eval_with_element(self, text, element, evaluation: Evaluation):
    """%(name)s[text_String, element_]"""
    # Placeholder for the two-argument form: "element" is accepted but ignored,
    # so the result is exactly what eval() returns for the same text.
    # FIXME: right now we aren't using element, and should use this to more
    # efficiently extract part of the HTML file that we want.
    return self.eval(text, evaluation)
29 changes: 13 additions & 16 deletions mathics/builtin/fileformats/jsonformat.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,29 @@
# -*- coding: utf-8 -*-

"""
JSON
JSON File Format

Basic implementation for an JSON importer.
JSON importer (via Python's "json" module).
"""

from mathics.core.builtin import Builtin
from mathics.core.expression import Evaluation
from mathics.core.builtin import Builtin, String
from mathics.core.evaluation import Evaluation
from mathics.eval.fileformats.jsonformat import eval_JSONImport


class JSONImport(Builtin):
class ImportJSON(Builtin):
"""
## <url>:native internal:</url>
<url>:WMA link:https://reference.wolfram.com/language/ref/format/JSON.html</url>

<dl>
<dt>'JSON`Import`JSONImport["file"]'
<dd>parses "string" as a JSON file, and returns the data as a nested \
list of rules.
<dt>'JSON`ImportJSON[path]'
<dd>Read $path$ as JSON and convert that to its corresponding Mathics3 equivalent.
</dl>

"""

summary_text = "import elements from json"
context = "JSON`Import`"
context = "JSON`"
messages = {"dec": "Decoding Error at `1`"}
summary_text = "import JSON file"

def eval(self, filename, evaluation: Evaluation):
"""%(name)s[filename_String]"""
return eval_JSONImport(filename.value, evaluation)
def eval(self, path: String, evaluation: Evaluation):
"JSON`ImportJSON[path_String]"
return eval_JSONImport(path, evaluation)
22 changes: 20 additions & 2 deletions mathics/builtin/fileformats/xmlformat.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-

"""
XML
XML File Format

Basic implementation for an XML importer.
XML importer (via lxml).
"""


Expand Down Expand Up @@ -345,6 +345,12 @@ def lines():
plaintext = String("\n".join(lines()))
return to_mathics_list(to_expression("Rule", "Plaintext", plaintext))

def eval_with_element(self, text, element, evaluation: Evaluation):
    """%(name)s[text_String, element_]"""
    # Placeholder for the two-argument form: "element" is accepted but ignored,
    # so the result is exactly what eval() returns for the same text.
    # FIXME: right now we aren't using element, and should use this to more
    # efficiently extract part of the XML file that we want.
    return self.eval(text, evaluation)


class TagsImport(Builtin):
"""
Expand Down Expand Up @@ -381,6 +387,12 @@ def eval(self, text, evaluation: Evaluation):
return root
return to_mathics_list(to_expression("Rule", "Tags", self._tags(root)))

def eval_with_element(self, text, element, evaluation: Evaluation):
    """%(name)s[text_String, element_]"""
    # Placeholder for the two-argument form: "element" is accepted but ignored,
    # so the result is exactly what eval() returns for the same text.
    # FIXME: right now we aren't using element, and should use this to more
    # efficiently extract part of the XML file that we want.
    return self.eval(text, evaluation)


class XMLObjectImport(Builtin):
"""
Expand All @@ -405,3 +417,9 @@ def eval(self, text, evaluation: Evaluation):
"""%(name)s[text_String]"""
xml = to_expression("XML`Parser`XMLGet", text).evaluate(evaluation)
return to_mathics_list(to_expression("Rule", "XMLObject", xml))

def eval_with_element(self, text, element, evaluation: Evaluation):
    """%(name)s[text_String, element_]"""
    # Placeholder for the two-argument form: "element" is accepted but ignored,
    # so the result is exactly what eval() returns for the same text.
    # FIXME: right now we aren't using element, and should use this to more
    # efficiently extract part of the XML file that we want.
    return self.eval(text, evaluation)
32 changes: 0 additions & 32 deletions mathics/builtin/import_export/compression.py

This file was deleted.

Loading
Loading