In [111]:
import lxml.etree
import os
from IPython.display import display
import pandas as pd

In [122]:
class BLText:
    """Metadata for a single text directory.

    The MODS metadata file is expected at
    ``<datadir>/<textdir>/metadata/<textdir>.xml``.

    Attributes:
        textdir: name of the text's directory (also used as its identifier).
        datadir: directory that contains all text directories.
        local_path: path to this text's directory (``<datadir>/<textdir>``).
        tree: parsed metadata document, as returned by ``lxml.etree.parse``.
        title: result of ``getTitle()``.
        author: result of ``getAuthor()``.
    """

    # Namespace prefix map shared by every XPath query issued by getText().
    MODS_NAMESPACES = {'MODS': 'http://www.loc.gov/mods/v3'}

    # Class-level default so parseMetadata() also works on an instance whose
    # __init__ was bypassed.
    datadir = 'data'

    def __init__(self, textdir, datadir='data'):
        """Parse the metadata of `textdir` and cache its title and author.

        Args:
            textdir: name of the text's directory inside `datadir`.
            datadir: root directory of the corpus; defaults to ``'data'``,
                which was previously hard-coded.
        """
        self.textdir = textdir
        self.datadir = datadir
        self.local_path = os.path.join(datadir, textdir)
        self.tree = self.parseMetadata(textdir)
        self.title = self.getTitle()
        self.author = self.getAuthor()

    def parseMetadata(self, textdir):
        """Return the parsed metadata XML document for `textdir`."""
        fullpath = os.path.join(self.datadir, textdir, 'metadata', textdir + '.xml')
        return lxml.etree.parse(fullpath)

    def getText(self, xpath):
        """Evaluate `xpath` + ``/text()`` against the metadata tree.

        Returns:
            The single matching string when exactly one text node matches;
            otherwise the XPath result unchanged (a list that may be empty
            or hold several strings). Callers must handle both shapes.
        """
        out = self.tree.xpath(xpath + '/text()', namespaces=self.MODS_NAMESPACES)
        if isinstance(out, list) and len(out) == 1:
            # No sense having a list of length one. Get just the string.
            out = out[0]
        return out

    def getTitle(self):
        """Return the text of the ``MODS:title`` element(s)."""
        return self.getText('//MODS:title')

    def getAuthor(self):
        """Return the raw ``MODS:namePart`` text of the personal name(s)."""
        rawAuthor = self.getText('//MODS:name[@type="personal"]/MODS:namePart')
        # TODO: do some transformations to the text here. Get it in the appropriate case.
        return rawAuthor

In [109]:

# A collection of BLText objects. 
class BLCorpus():
    """A collection of BLText objects built from the ``data`` directory.

    Attributes:
        texts: one BLText per sub-directory of ``data``, in sorted order.
        metadata: list of ``[ID, title, author]`` rows, one per text.
        df: the same rows as a DataFrame with columns ID / Title / Author.

    Raises:
        Exception: if there is no ``data`` directory in the current
            working directory.
    """
    def __init__(self):
        # isdir() also rejects a regular file that happens to be named "data".
        if not os.path.isdir('data'):
            raise Exception("Can't find data directory.")

        # Only directories are texts; stray files (.DS_Store, notes, ...)
        # would otherwise make the metadata parse fail. Sorting makes the
        # row order reproducible, since os.listdir() order is arbitrary.
        textdirs = sorted(
            entry for entry in os.listdir('data')
            if os.path.isdir(os.path.join('data', entry))
        )

        self.texts = [ BLText(textdir) for textdir in textdirs ]

        self.metadata = [ [ text.textdir, text.title, text.author ] for text in self.texts ]

        self.df = pd.DataFrame(self.metadata, columns=['ID', 'Title', 'Author'])

In [117]:
# Build the corpus from the data/ directory; the metadata table is then
# available as c.df.
c = BLCorpus()

003233320
002670581
001718349
000624240


In [152]:
"""
Borrowed from the GITenberg project.
Makes an organized git repo of a book folder.
"""

import codecs
import logging
import os
from os.path import abspath, dirname

import jinja2
import sh

import logging
# Configure the root logger so that the logging.debug(...) calls made by the
# classes below are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [153]:
# File extensions (with the leading dot, as returned by os.path.splitext)
# that LocalRepo.add_file() must skip. This has to be a collection, not a
# string: with a string, `ext in IGNORE_FILES` is a substring test and the
# empty extension of files such as LICENSE always "matches".
IGNORE_FILES = ()

class CdContext():
    """ A context manager that changes into a directory and back again
        `with CdContext(new path to go to)`

        Uses os.getcwd()/os.chdir() instead of shelling out to `pwd`/`cd`
        through `sh`: no subprocess is spawned, and it does not depend on
        `sh` providing a `cd` builtin (NOTE(review): `sh.cd` appears to have
        been dropped in sh 2.x -- confirm against the installed version).
    """
    def __init__(self, path):
        # Kept for backward compatibility; refreshed again in __enter__.
        self._og_directory = os.getcwd()
        self._dest_directory = path

    def __enter__(self):
        # Record the directory at entry time, so an instance created earlier
        # (or reused) returns to where the `with` block actually started.
        self._og_directory = os.getcwd()
        os.chdir(self._dest_directory)
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        # Always go back, including when the body raised; returning None
        # lets any exception propagate.
        os.chdir(self._og_directory)

In [123]:
      
class LocalRepo():
    """Turns a text's local directory into a git repository.

    Args:
        bltext: object describing the text. It must expose `title` and
            `local_path`; its identifier is read from the first of `ID`,
            `textdir` or `book_id` that is present.
    """

    def __init__(self, bltext):
        self.bltext = bltext
        # The methods below refer to the text as `self.book`; keep both names
        # pointing at the same object.
        self.book = bltext
        logging.debug(
            "Now attempting to initialize a local git repository for text: %s a.k.a. %s",
            self._text_id(), self.bltext.title)

    def _text_id(self):
        """Return the text's identifier, or None if it exposes none."""
        for attr in ('ID', 'textdir', 'book_id'):
            value = getattr(self.bltext, attr, None)
            if value is not None:
                return value
        return None

    def add_file(self, filename):
        """`git add` `filename` unless its extension is in IGNORE_FILES."""
        filetype = os.path.splitext(filename)[1]
        if filetype not in IGNORE_FILES:
            sh.git('add', filename)

    def add_all_files(self):
        """Run `git init` in the text's directory and add its visible files."""
        with CdContext(self.book.local_path):
            sh.git.init('.')

            # List the directory directly rather than splitting `ls` output
            # on whitespace, which breaks names containing spaces. Dot-files
            # (including .git) are skipped, as plain `ls` would do.
            # NOTE: like the previous `ls` approach, this ignores .gitignore.
            filenames = sorted(
                name for name in os.listdir('.') if not name.startswith('.')
            )
            logging.debug("Files to add: %s", filenames)

            for filename in filenames:
                logging.debug("Adding file: %s", filename)
                self.add_file(filename)

    def commit(self, message):
        """Commit the staged files with `message`.

        A git exit status of 1 (e.g. nothing to commit) is reported and
        otherwise ignored.
        """
        with CdContext(self.book.local_path):
            try:
                # `sh` passes arguments straight to git without a shell, so
                # the message must not be wrapped in quotes: they would end
                # up inside the commit message.
                sh.git('commit', '-m', message)
            except sh.ErrorReturnCode_1:
                print("Commit aborted for {0} with msg {1}".format(self._text_id(), message))


class NewFilesHandler():
    """ NewFilesHandler - renders the README template and copies the
        boilerplate files into a book's local repository.

        NOTE(review): this class was written for GITenberg book objects. It
        reads `book.meta.title`, `book.meta.author`, `book.meta.rdf_path`,
        `book.book_id` and `book.local_path`, and loads templates from the
        `gitenberg` package -- confirm these exist for the objects used here.
    """
    README_FILENAME = 'README.rst'

    def __init__(self, book):
        self.book = book

        self.env = jinja2.Environment(
            loader=jinja2.PackageLoader('gitenberg', 'templates')
        )

    def add_new_files(self):
        """ Write the README, then copy the static files. """
        self.template_readme()
        self.copy_files()

    def template_readme(self):
        """ Render README.rst.j2 and write it as UTF-8 into the book folder. """
        readme_template = self.env.get_template('README.rst.j2')
        context = {
            'title': self.book.meta.title,
            'author': self.book.meta.author,
            'book_id': self.book.book_id,
        }
        rendered = readme_template.render(**context)

        target = "{0}/{1}".format(self.book.local_path, self.README_FILENAME)
        with codecs.open(target, 'w', 'utf-8') as handle:
            handle.write(rendered)

    def copy_files(self):
        """ Copy the LICENSE and CONTRIBUTING files to each folder repo """
        # NOTE(review): `__file__` is normally undefined in a notebook
        # kernel -- confirm where the templates directory should come from.
        templates_root = dirname(abspath(__file__))
        for name in (u'LICENSE', u'CONTRIBUTING.rst'):
            source = '{0}/templates/{1}'.format(templates_root, name)
            destination = '{0}/'.format(self.book.local_path)
            sh.cp(source, destination)

        # the metadata rdf file goes next to them
        rdf_source = self.book.meta.rdf_path
        sh.cp(rdf_source, '{0}/'.format(self.book.local_path))


def make(book):
    """Create the git repository for `book` in two commits.

    The first commit holds the book's files as found on disk; the second
    holds the README, license and contributing files that NewFilesHandler
    generates.

    Args:
        book: the object handed to LocalRepo and NewFilesHandler.
    """
    # Initial commit of book files
    local_repo = LocalRepo(book)
    local_repo.add_all_files()
    local_repo.commit("Initial import from Project Gutenberg")

    # New files commit. Constructing the handler alone writes nothing; the
    # files only appear once add_new_files() is called.
    NewFilesHandler(book).add_new_files()

    local_repo.add_all_files()
    local_repo.commit("Adds Readme, contributing and license files to book repo")


In [13]:
# Borrowed from the GITenberg project

"""
Syncs a local git book repo to a remote git repo (by default, github)
"""

import logging
from re import sub
import time

import github3
import sh

from .util.catalog import CdContext
try:
    from .secrets import GH_USER, GH_PASSWORD
except ImportError:
    # Only a missing/unimportable secrets module is expected here; a bare
    # `except:` would also hide KeyboardInterrupt and genuine bugs.
    # Define the names anyway so later references fail on the missing
    # credentials rather than with a NameError.
    GH_USER = GH_PASSWORD = None
    print("no secrets file found, continuing without")


class GithubRepo():
    """ Creates a GitHub repository for a book under the GITenberg
        organization and pushes the book's local git repository to it.

        The book object must provide `local_path`, `book_id`, `meta.title`
        and `meta.author`.
    """

    def __init__(self, book):
        self.book = book
        self.create_api_handler()

    def create_and_push(self):
        """ Create the remote repo, register it as `origin`, then push. """
        self.create_repo()
        self.add_remote_origin_to_local_repo()
        self.push_to_github()

    def create_api_handler(self):
        """ Logs in to GitHub and stores the client and the organization
            on self (as `github` and `org`) """
        client = github3.login(username=GH_USER, password=GH_PASSWORD)
        self.github = client
        if hasattr(client, 'set_user_agent'):
            client.set_user_agent('Project GITenberg: https://gitenberg.github.io/')
        self.org = client.organization(login='GITenberg')
        # FIXME: logging
        print("ratelimit: " + str(self.org.ratelimit_remaining))

    def format_desc(self):
        """ Returns the repository description built from title and author """
        meta = self.book.meta
        return u'{0} by {1}\n is a Project Gutenberg book, now on Github.'.format(
            meta.title, meta.author
        )

    def format_title(self):
        """ Builds the repository name: the title with runs of spaces,
            apostrophes and commas replaced by '-', followed by the book id """
        slug = sub("[ ',]+", '-', self.book.meta.title)
        book_id = self.book.book_id
        max_slug_length = 99 - len(str(book_id)) - 1
        # a double underscore before the id marks a title that was cut short
        if len(slug) > max_slug_length:
            template = "{0}__{1}"
        else:
            template = "{0}_{1}"
        repo_title = template.format(slug[:max_slug_length], book_id)
        # FIXME: log debug, title creation
        print(len(repo_title), repo_title)
        return repo_title

    def create_repo(self):
        """ Creates the public repository in the organization and keeps
            the result on self.repo """
        repo_name = self.format_title()
        settings = dict(
            description=self.format_desc(),
            homepage=u'https://GITenberg.github.io/',
            private=False,
            has_issues=True,
            has_wiki=False,
            has_downloads=True,
        )
        self.repo = self.org.create_repo(repo_name, **settings)

    def add_remote_origin_to_local_repo(self):
        """ Adds the new repository's ssh url as remote `origin` """
        with CdContext(self.book.local_path):
            try:
                sh.git('remote', 'add', 'origin', self.repo.ssh_url)
            except sh.ErrorReturnCode_128:
                print("We may have already added a remote origin to this repo")

    def push_to_github(self):
        """ Pushes `master` to `origin`; if git exits with status 128,
            waits ten seconds and tries exactly once more """
        push_args = ('push', 'origin', 'master')
        with CdContext(self.book.local_path):
            try:
                sh.git(*push_args)
            except sh.ErrorReturnCode_128:
                logging.error(u"github repo not ready yet")
                time.sleep(10)
                sh.git(*push_args)

SyntaxError: invalid syntax (<ipython-input-13-64321bdbe0a0>, line 1)