# NLP - Project 1
## Rinehart Analysis
**Team**: *Jean Merlet, Konstantinos Georgiou, Matt Lane*

In [2]:
# Import Jupyter Widgets
import os
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from IPython.display import display

In [3]:
# Clone the repository if you're in Google Collab
def clone_project(is_collab: bool = False):
    print("Cloning Project..")
    !git clone https://github.com/NLPaladins/rinehartAnalysis.git
    print("Project cloned.")
    print("Changing dir..")
    os.chdir('rinehartAnalysis')
    
# Ask before cloning: the clone only runs when the button is clicked.
# The parenthetical states what the click does (clone + change of working dir).
print("Are you on Google Collab? Clone project? (If you do this, the repo is cloned into the current dir and becomes the working dir)")
btn = widgets.Button(description="Yes, clone")
# on_click invokes the callback with the clicked Button as its single argument
btn.on_click(clone_project)
display(btn)

Are you on Google Collab? Clone project?


Button(description='Yes, clone', style=ButtonStyle())

If you are on Google Colab, to save changes, click **File > Save a copy in GitHub**

In [4]:
# Show where the notebook is running and what that directory contains,
# as a quick check that the working directory is the project root.
print("Current dir:", os.getcwd(), sep="\n")
print("Dir Contents:", os.listdir(), sep="\n")

Current dir:
/home/drkostas/GDrive/Projects/UTK/rinehartAnalysis
Dir Contents:
['.gitignore', 'nlp_libs', 'requirements.txt', '.ipynb_checkpoints', 'logs', 'main.py', '.git', '.idea', 'Makefile', 'setup.py', 'TODO.md', 'README.md', 'main.ipynb', 'confs']


## Load Libraries and setup

In [5]:
import traceback
import argparse
from importlib import reload as reload_lib
from pprint import pprint

# Custom libs
from nlp_libs import Configuration, ColorizedLogger, ProcessedBook
# Import this way the libs you want to dynamically change and reload 
import nlp_libs.books.processed_book as books_lib


### Libraries Overview
All the libraries are located under *"\<project root>/nlp_libs"*
- ***ProcessedBook***: Loc: **books/processed_book.py**, Desc: *Book Pre-processor*
- ***Configuration***: Loc: **configuration/configuration.py**, Desc: *Configuration Loader*
- ***ColorizedLogger***: Loc: **fancy_logger/colorized_logger.py**, Desc: *Logger with formatted text capabilities*

In [12]:
# The path of configuration and log save path
config_path = "confs/proj_1.yml"  # Open files > confs > proj_1.yml to edit temporalily. Commit to save permanently
!cat "$config_path"
log_path = "logs/proj_1.log"  # Open files > logs > proj_1.log to debug logs of previous runs

tag: proj1
data_loader:
  config:
    books:
      The_Circular_Staircase:
        url: https://www.gutenberg.org/files/434/434-0.txt
        detectives:
          - Rachel Innes
        suspects:
          - Liddy
          - Halsey
          - Gertrude
          - Paul Armstrong
          - Doctor Walker
          - Louise Armstrong
          - Arnold Armstrong
          - Mrs. Ralston
          - Thomas Johnson
          - Aunt Ray
          - Mary Anne
          - Burke
          - Joe Jefferson
          - Anne Watson
          - Eliza Klinefelter
          - Beulah
          - Jack Bailey
          - Mr. Jarvis
          - Warner
          - Mr. Jamieson
          - Mr. Harton
          - Rosie
          - Sam Bohannon
          - Beatrice Fairfax
          - Mrs. Ogden Fitzhugh
          - Mr. Trautman
          - Doctor Stewart
          - Doctor Wainwright
          - Alexander Graham
          - Nina Carrington
          - Doctor Willoug

In [10]:
# The logger
# Logger instance used by the notebook cells; it is created with the name
# 'Notebook' and the color 'cyan'.
logger = ColorizedLogger(logger_name='Notebook', color='cyan')
# Configure logging on the class (not on the instance above) to use `log_path`,
# which is defined in the configuration-paths cell, so that cell must run first.
# NOTE(review): clear_log=True presumably empties the log file from previous
# runs and debug=False presumably hides debug-level messages; confirm in
# nlp_libs/fancy_logger/colorized_logger.py.
ColorizedLogger.setup_logger(log_path=log_path, debug=False, clear_log=True)

2021-09-20 21:32:01 FancyLogger  INFO     [1m[37mLogger is set. Log file path: /home/drkostas/GDrive/Projects/UTK/rinehartAnalysis/logs/proj_1.log[0m


In [8]:
# Build the configuration object from the file at `config_path`
conf = Configuration(config_src=config_path)
# Drill down to the per-book metadata: data_loader -> config -> books
books = (
    conf.get_config('data_loader')
    ['config']
    ['books']
)
# Pretty print the books dict (keyed by book title)
pprint(books)

2021-09-20 21:31:10 Config       INFO     [1m[37mConfiguration file loaded successfully from path: /home/drkostas/GDrive/Projects/UTK/rinehartAnalysis/confs/proj_1.yml[0m
2021-09-20 21:31:10 Config       INFO     [1m[37mConfiguration Tag: proj1[0m


{'Oh,_Well,_You_Know_How_Women_Are!': {'crime_type': 'stabbing',
                                       'detectives': ['man1', 'man2'],
                                       'suspects': ['man3', 'man4'],
                                       'url': 'https://www.gutenberg.org/cache/epub/24259/pg24259.txt'},
 'The_Breaking_Point': {'crime_type': 'stabbing',
                        'detectives': ['man1', 'man2'],
                        'suspects': ['man3', 'man4'],
                        'url': 'https://www.gutenberg.org/files/1601/1601-0.txt'},
 'The_Circular_Staircase': {'crime_type': 'stabbing',
                            'detectives': ['man1', 'man2'],
                            'suspects': ['man3', 'man4'],
                            'url': 'https://www.gutenberg.org/files/434/434-0.txt'},
 'The_Man_in_Lower_Ten': {'crime_type': 'stabbing',
                          'detectives': ['man1', 'man2'],
                          'suspects': ['man3', 'man4'],
                        

## Exploration

In [11]:
# Reload the module so that edits to nlp_libs/books/processed_book.py take effect
reload_lib(books_lib)
# -- Create ProcessedBook Object for each book -- #
# The class is looked up on the reloaded module on purpose: the name
# `ProcessedBook` bound by `from nlp_libs import ...` still refers to the class
# object that existed before the reload, so using it would ignore the new code.
processed_books = {}
for title, metadata in books.items():
    logger.info(f"Book: {title}, Metadata: {metadata}")
    processed_book = books_lib.ProcessedBook(title=title, metadata=metadata)
    processed_books[title] = processed_book
# -- The_Circular_Staircase -- #
current_book = processed_books['The_Circular_Staircase']

# Fetch the first two chapters.
# NOTE(review): get_chapter presumably returns a sequence of text lines; confirm in ProcessedBook.
chapter_1 = current_book.get_chapter(chapter=1)
chapter_2 = current_book.get_chapter(chapter=2)

# Collapse each chapter into a single newline-separated string
chapter_1_joined = '\n'.join(chapter_1)
chapter_2_joined = '\n'.join(chapter_2)

# Report the size of the raw and cleaned text (green, underlined)
logger.info(
    f"Length of staircase raw: {len(current_book.raw)}",
    color='green',
    attrs=['underline'],
)
logger.info(
    f"Length of staircase clean: {len(current_book.clean)}",
    color='green',
    attrs=['underline'],
)
# Preview only the first 45 characters of each chapter
logger.info(f"Chapter 1:\n{chapter_1_joined[:45]} (..)")
logger.info(f"Chapter 2:\n{chapter_2_joined[:45]} (..)")

2021-09-20 21:32:10 Notebook     INFO     [1m[36mBook: The_Circular_Staircase, Metadata: {'url': 'https://www.gutenberg.org/files/434/434-0.txt', 'detectives': ['man1', 'man2'], 'suspects': ['man3', 'man4'], 'crime_type': 'stabbing'}[0m
2021-09-20 21:32:10 Notebook     INFO     [1m[36mBook: The_Man_in_Lower_Ten, Metadata: {'url': 'https://www.gutenberg.org/files/1869/1869-0.txt', 'detectives': ['man1', 'man2'], 'suspects': ['man3', 'man4'], 'crime_type': 'stabbing'}[0m
2021-09-20 21:32:11 Notebook     INFO     [1m[36mBook: The_Breaking_Point, Metadata: {'url': 'https://www.gutenberg.org/files/1601/1601-0.txt', 'detectives': ['man1', 'man2'], 'suspects': ['man3', 'man4'], 'crime_type': 'stabbing'}[0m
2021-09-20 21:32:11 Notebook     INFO     [1m[36mBook: Oh,_Well,_You_Know_How_Women_Are!, Metadata: {'url': 'https://www.gutenberg.org/cache/epub/24259/pg24259.txt', 'detectives': ['man1', 'man2'], 'suspects': ['man3', 'man4'], 'crime_type': 'stabbing'}[0m
2021-09-20 21:32:11 No

[]
