# NLP - Project 1
## Rinehart Analysis
**Team**: *Jean Merlet, Konstantinos Georgiou, Matt Lane*

## Load Libraries and setup

In [22]:
import traceback
import argparse
from pprint import pprint

# Custom libs
from nlp_libs import Configuration, ColorizedLogger, ProcessedBook

### Libraries Overview
All the custom libraries are located under *"\<project root>/nlp_libs"*; the locations below are relative to that directory.
- ***ProcessedBook***: Location: **books/processed_book.py**, Description: *Book pre-processor*
- ***Configuration***: Location: **configuration/configuration.py**, Description: *Configuration loader*
- ***ColorizedLogger***: Location: **fancy_logger/colorized_logger.py**, Description: *Logger with formatted-text capabilities*

In [4]:
# Where the log file is written (handed to the logger setup in a later cell)
log_path = "logs/logs_proj_1.log"
# Where the configuration file is read from (handed to Configuration in a later cell)
config_path = "confs/proj_1.yml"

In [13]:
# Notebook-level logger instance, named 'Notebook' and colored blue
logger = ColorizedLogger(
    color='blue',
    logger_name='Notebook',
)
# Class-level logger setup: log to log_path, debug off, clear_log on
# NOTE(review): clear_log=True presumably empties an existing log file first - confirm in nlp_libs
ColorizedLogger.setup_logger(
    clear_log=True,
    debug=False,
    log_path=log_path,
)

2021-09-19 22:33:19 FancyLogger  INFO     [1m[37mLogger is set. Log file path: /home/drkostas/GDrive/Projects/UTK/rinehartAnalysis/logs/logs_proj_1.log[0m


In [14]:
# Build the Configuration object from the file at config_path
conf = Configuration(config_src=config_path)
# Drill down to the books dict: first 'data_loader' entry -> 'config' -> 'urls'
data_loader_entries = conf.get_config('data_loader')
books = data_loader_entries[0]['config']['urls']
# Pretty-print the books dict so each entry lands on its own line
pprint(books)

2021-09-19 22:33:21 Config       INFO     [1m[37mConfiguration file loaded successfully from path: /home/drkostas/GDrive/Projects/UTK/rinehartAnalysis/confs/proj_1.yml[0m
2021-09-19 22:33:21 Config       INFO     [1m[37mConfiguration Tag: proj1[0m


{'A_Poor_Wise_Man': 'https://www.gutenberg.org/files/1970/1970-0.txt',
 'Affinities and Other Stories': 'https://www.gutenberg.org/cache/epub/41408/pg41408.txt',
 'Bab:_A_Sub-Deb': 'https://www.gutenberg.org/cache/epub/366/pg366.txt',
 'Dangerous_Days': 'https://www.gutenberg.org/files/1693/1693-0.txt',
 'K': 'https://www.gutenberg.org/files/9931/9931-0.txt',
 'Locked_Doors': 'https://www.gutenberg.org/files/54273/54273-0.txt',
 'Long_Live_the_King': 'https://www.gutenberg.org/files/2714/2714-0.txt',
 'More_Tish': 'https://www.gutenberg.org/cache/epub/19851/pg19851.txt',
 'Oh,_Well,_You_Know_How_Women_Are!': 'https://www.gutenberg.org/cache/epub/24259/pg24259.txt',
 'Sight_Unseen': 'https://www.gutenberg.org/files/1960/1960-0.txt',
 'Tenting_To-night': 'https://www.gutenberg.org/cache/epub/19475/pg19475.txt',
 'The_After_House': 'https://www.gutenberg.org/files/2358/2358-0.txt',
 'The_Amazing_Interlude': 'https://www.gutenberg.org/cache/epub/1590/pg1590.txt',
 'The_Bat': 'https://www.g

## Exploration

In [21]:
# Instantiate the ProcessedBook from the 'The_Circular_Staircase' entry of the books dict
staircase = ProcessedBook(title=books['The_Circular_Staircase'])

# First chapter, plus a newline-joined copy of it
chapt_1 = staircase.get_chapter(chapter=1)
chapt_1_joined = '\n'.join(chapt_1)
# Second chapter, same treatment
chapt_2 = staircase.get_chapter(chapter=2)
chapt_2_joined = '\n'.join(chapt_2)

# Log the sizes of the raw and clean attributes, underlined in green
raw_size = len(staircase.raw)
logger.info(f"Length of staircase raw: {raw_size}", color='green', attrs=['underline'])
clean_size = len(staircase.clean)
logger.info(f"Length of staircase clean: {clean_size}", color='green', attrs=['underline'])

# Log only the first 45 characters of each joined chapter as a short preview
chapt_1_preview = chapt_1_joined[:45]
logger.info(f"Chapter 1:\n{chapt_1_preview} (..)")
chapt_2_preview = chapt_2_joined[:45]
logger.info(f"Chapter 2:\n{chapt_2_preview} (..)")

2021-09-19 22:34:50 Notebook     INFO     [4m[32mLength of staircase raw: 410135[0m
2021-09-19 22:34:50 Notebook     INFO     [4m[32mLength of staircase clean: 6262[0m
2021-09-19 22:34:50 Notebook     INFO     [1m[34mChapter 1:
chapter i.
i take a country house
this is the (..)[0m
2021-09-19 22:34:50 Notebook     INFO     [1m[34mChapter 2:
chapter ii.
a link cuff-button
liddy’s knees  (..)[0m
