In [1]:
#!/usr/bin/python3
# -*- coding: UTF8 -*-
# Author: Nicolas Flandrois
# Date: Monday, May 20th, 2020

In [2]:
import os
import pandas as pd

# Never truncate long cell text (quotes) when a DataFrame is displayed.
# The full option key is spelled out: abbreviated patterns are matched by
# regex and can become ambiguous when pandas adds similarly named options.
pd.set_option("display.max_colwidth", None)

In [3]:
def fortune2dataframe(source_file):
    """Parse a fortune-mod formatted file into a pandas DataFrame.

    The file is expected to follow the fortune-mod layout: entries are
    separated by a ``%`` character, and each entry is a quote followed by its
    author, the two being separated by five spaces, an em dash and a space
    (``'     — '``).

    Parameters
    ----------
    source_file : str or path-like
        Path of the fortune file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per non-empty entry, with columns ``'Quotes'`` and
        ``'Authors'``. ``'Authors'`` is missing for an entry that carries no
        author separator.
    """
    separator = '     — '

    # The data holds non-ASCII characters (the em dash separator, accented
    # names), so decode explicitly instead of relying on the platform default.
    with open(source_file, 'r', encoding='utf-8') as f:
        data = f.read()

    # Line breaks are dropped (not replaced by a space), so a wrapped entry is
    # rejoined exactly as its lines were written.
    chunks = data.replace('\n', '').split('%')

    rows = []
    for chunk in chunks:
        entry = chunk.strip()
        # Filter AFTER stripping so whitespace-only chunks do not become rows.
        if not entry:
            continue
        # Split on the LAST separator only: every row then has exactly two
        # fields, even if the quote text itself contains the separator.
        quote, found, author = entry.rpartition(separator)
        if found:
            rows.append([quote.strip(), author.strip()])
        else:
            # No author given: keep the quote and leave the author missing.
            rows.append([entry, None])

    return pd.DataFrame(rows, columns=['Quotes', 'Authors'])

**Define the Dataframe** & **Sampling**

In [4]:
# Parse the fortune file named "zen" (the relative name is resolved against
# the current working directory) into the notebook-wide frame `df`.
df = fortune2dataframe(os.path.abspath("zen"))
# Bare expression: the frame is shown as this cell's output.
df

Unnamed: 0,Quotes,Authors
0,"Zen is not some kind of excitement, but concentration on our usual everyday routine.",Shunryu Suzuki
1,"Since the time we were born from our mother's womb, the only thing we have seen is the present. We have never seen the past and we have never seen the future. Wherever we are, whatever time it is, it is only the present.",Khenpo Tsultrim Rinpoche
2,"Rather than being your thoughts and emotions, be the awareness behind them.",Eckhart Tolle
3,Your suffering is never caused by the person you are blaming.,Byron Katie
4,Death is not an error. It is not a failure. It is taking off a tight shoe.,Ram Dass
...,...,...
365,"Beyond all vanities, fights, and desires, omnipotent silence lies.",Dejan Stojanovic
366,"The nearer a man comes to a calm mind, the closer he is to strength.",Marcus Aurelius
367,Without silence there cannot be any real appreciation in life.,Deepak Chopra
368,"Whatever the circumstance, bodily movement or stillness, feeling well or distressed, with good concentration or scattered attention, everything can be brought back to awareness.",Kittisaro


# A Bit of Checkup for Cleaning
- Checking for **Duplicate rows** (same Quote & same Author)

In [5]:
# Rows whose values repeat an earlier row on every column; an empty result
# means there is no fully duplicated entry.
df.loc[df.duplicated()]

Unnamed: 0,Quotes,Authors


- Checking for **Duplicate Quotes** (Same Quote, Any Author)

In [6]:
# Rows whose quote text repeats an earlier row, whatever author is attached.
df.loc[df.duplicated(subset=['Quotes'])]

Unnamed: 0,Quotes,Authors


- Checking for **unwanted data** (typical issues left over from the initial parsing by the Twitter2Fortune bot)

In [7]:
# Only the last bare expression of a cell is displayed, so the three checks
# are combined into a single mask instead of three separate filters.
# na=False treats a missing value as "no match", which keeps each mask
# strictly boolean even when an entry has no author.
url_in_quote = df['Quotes'].str.contains('http', na=False)
url_in_author = df['Authors'].str.contains('http', na=False)
# str.match anchors at the start: authors whose text begins with 'None'.
author_is_none = df['Authors'].str.match('None', na=False)

# Rows failing at least one of the checks; an empty frame means clean data.
df[url_in_quote | url_in_author | author_is_none]

Unnamed: 0,Quotes,Authors


# Statistics
- **How Many Quotes do we have?**

In [8]:
# Series.count() tallies non-missing values only, so a row with a missing
# quote would not be included in this total.
df['Quotes'].count()

370

- **How many Authors do we have?**

In [9]:
# Number of rows (quotes) per author; kept in `auth_grp` because the
# frequency listings in the following cells print it.
auth_grp = df.groupby('Authors').size()
# `auth_grp` has one entry per author group, so counting its entries gives
# the number of authors.
auth_grp.count()

124

- **Which Author is quoted the most frequently?**

In [10]:
# Lift the row-display limit for this block only, so the whole ranking is
# printed instead of a truncated preview.
with pd.option_context('display.max_rows', None):
    # print() is required here: a bare expression nested inside a `with`
    # block is not rendered as cell output.
    print(auth_grp.sort_values(ascending=False))

Authors
Eckhart Tolle                                                        45
Alan Watts                                                           28
Thich Nhat Hanh                                                      25
Dōgen Zenji                                                          20
Shunryu Suzuki                                                       16
Dalai Lama XIV                                                       12
Byron Katie                                                          12
Bruce Lee                                                            11
Lao Tzu                                                               9
Bodhidharma                                                           9
Ajahn Chah                                                            7
Adyashanti                                                            7
Haemin Sunim                                                          6
Lao Tzu, Tao Te Ching                                   

- List of Authors in *alphabetical order*, with quotation frequency

In [11]:
# Same unlimited-rows context as the ranking, but `auth_grp` is printed
# as-is, i.e. in the order of its author index rather than by frequency.
with pd.option_context('display.max_rows', None):
    print(auth_grp)

Authors
17th Karmapa                                                          1
Adyashanti                                                            7
Ajahn Brahm                                                           3
Ajahn Chah                                                            7
Alan Watts                                                           28
Alfred Korzybski                                                      1
Alfred North Whitehead                                                1
Allen Ginsberg                                                        1
B. D. Schiers                                                         3
Bodhidharma                                                           9
Bokar Rinpoche                                                        1
Brad Warner                                                           1
Bruce Lee                                                            11
Buddha                                                  