In [1]:
#!/usr/bin/python3
# -*- coding: UTF8 -*-
# Author: Nicolas Flandrois
# Date: Monday, May 20th, 2020

In [2]:
import os
import pandas as pd

pd.set_option("max_colwidth", None)

In [3]:
def fortune2dataframe(source_file):
    """Given the Source file path/name, this function will transform the raw
    data into a Pandas' dataframe. This function is specifically made to suite
    a fortune-mod standardized format, as an input."""
    with open(source_file, 'r') as f:
        data = f.read()
        quote_list = data.replace('\n', '').split('%')
        clean_list = [x.strip() for x in quote_list if x]
        splited_quotes = [n.split('     — ') for n in clean_list]

        df = pd.DataFrame(splited_quotes, columns=['Quotes', 'Authors'])

        return df

**Define the Dataframe** & **Sampling**

In [4]:
df = fortune2dataframe(os.path.abspath("zen"))
df

Unnamed: 0,Quotes,Authors
0,"Zen is not some kind of excitement, but concentration on our usual everyday routine.",Shunryu Suzuki
1,"Since the time we were born from our mother's womb, the only thing we have seen is the present. We have never seen the past and we have never seen the future. Wherever we are, whatever time it is, it is only the present.",Khenpo Tsultrim Rinpoche
2,"Rather than being your thoughts and emotions, be the awareness behind them.",Eckhart Tolle
3,Your suffering is never caused by the person you are blaming.,Byron Katie
4,Death is not an error. It is not a failure. It is taking off a tight shoe.,Ram Dass
...,...,...
711,The judgment of time is always done in the past tense.,Zen Proverb
712,He who clings to his work will create nothing that endures.,Lao Tzu
713,"Until you can see everything in the world as your friend, your work is not done.",Byron Katie
714,Be content with what you have; rejoice in the way things are.,Lao Tzu


# A bit of Checkup for cleanning
- Checking for **Duplicate rows** (same Quote & same Author)

In [5]:
df[df.duplicated()]

Unnamed: 0,Quotes,Authors


- Checking for **Duplicate Quotes** (Same Quote, Any Author)

In [6]:
df[df.duplicated(['Quotes'])]

Unnamed: 0,Quotes,Authors


- Checking for **unwanted data** (typical issues from initial parsing Twitter2Fortune Bot)

In [7]:
df[df['Quotes'].str.contains('http')]

df[df['Authors'].str.contains('http')]

df[df['Authors'].str.match('None')]

Unnamed: 0,Quotes,Authors


# Statistics
- **Mini Report**

In [8]:
total_quotes = df['Quotes'].count()
yr = int(total_quotes // 365.25)
wks = int(total_quotes % 365.25 // 7)
mth = f'{wks // 4} month(s), {wks % 4} week(s)'  
days = int(round(total_quotes % 365.25 % 7))
lasting = f'{yr} year(s), {mth}, and {days} day(s).'
auth_grp = df.groupby('Authors').size()
nb_auth = auth_grp.count()

print(f'Quotes:\t\t{total_quotes}\nAuthors:\t{nb_auth}\n1 quote a day, everyday, all year long would last:\t{lasting}')

Quotes:		716
Authors:	188
1 quote a day, everyday, all year long would last:	1 year(s), 12 month(s), 2 week(s), and 1 day(s).


- **Which Author is quoted the most frequently?**

In [9]:
with pd.option_context('display.max_rows', None):
    print(auth_grp.sort_values(ascending=False))

Authors
Lao Tzu                                                              66
Eckhart Tolle                                                        64
Alan Watts                                                           46
Zen Proverb                                                          39
Thich Nhat Hanh                                                      34
Dōgen Zenji                                                          27
Byron Katie                                                          25
B.D. Schiers                                                         20
Dalai Lama XIV                                                       18
Shunryu Suzuki                                                       17
Zhuangzi                                                             15
Bodhidharma                                                          14
Bruce Lee                                                            12
Buddhist Proverb                                        

- List of Authors in *alphabetical order*, with quotation frequency

In [10]:
with pd.option_context('display.max_rows', None):
    print(auth_grp)

Authors
17th Karmapa                                                          2
Adyashanti                                                            7
Aesop                                                                 1
African Proverb, Swahili                                              1
Ajahn Brahm                                                           3
Ajahn Chah                                                           10
Ajahn Sumedho                                                         1
Alan Watts                                                           46
Alan Wilson Watts                                                     1
Alfred Korzybski                                                      1
Alfred North Whitehead                                                1
Allen Ginsberg                                                        2
Anne Scottlin                                                         1
Anonymous                                               