In [10]:
#!/usr/bin/python3
# -*- coding: UTF8 -*-
# Author: Nicolas Flandrois
# Date: Monday, May 20th, 2020

In [11]:
import os
import pandas as pd

# Show full cell contents instead of truncating long quotes. The fully
# qualified option key is used because abbreviated keys can become ambiguous
# if pandas adds another option with a similar name.
pd.set_option("display.max_colwidth", None)

In [12]:
def fortune2dataframe(source_file, separator='     — ', encoding='utf-8'):
    """Parse a fortune-mod style quote file into a pandas DataFrame.

    Records are delimited by lines that contain only ``%``. The lines of a
    record are joined with single spaces; the text after the last occurrence
    of ``separator`` is taken as the author, everything before it as the quote.

    Parameters
    ----------
    source_file : str or path-like
        Path of the fortune file to read.
    separator : str, optional
        Marker between quote and author. Defaults to five spaces, an em dash
        and one space.
    encoding : str, optional
        Text encoding used to read the file. Given explicitly (UTF-8 by
        default) because the separator itself is a non-ASCII character and
        the platform default encoding is not guaranteed to decode it.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty record, with columns ``'Quotes'`` and
        ``'Authors'``. ``'Authors'`` is missing for records that do not
        contain ``separator``.
    """
    with open(source_file, 'r', encoding=encoding) as f:
        lines = f.read().split('\n')

    # Group lines into records. Only a line consisting solely of '%' ends a
    # record, so a '%' inside a quote (e.g. "100%") does not split it.
    records = [[]]
    for line in lines:
        if line.strip() == '%':
            records.append([])
        else:
            records[-1].append(line)

    rows = []
    for record in records:
        # Join with a space so words of a wrapped quote are not glued together.
        text = ' '.join(record).strip()
        if not text:
            # Skip empty / whitespace-only records (e.g. after the last '%').
            continue
        # Split on the LAST separator only: every row then has exactly two
        # fields, even if the quote text itself contains the separator.
        quote, found, author = text.rpartition(separator)
        if found:
            rows.append((quote.strip(), author.strip()))
        else:
            rows.append((text, None))

    return pd.DataFrame(rows, columns=['Quotes', 'Authors'])

**Define the Dataframe** & **Sampling**

In [13]:
# Resolve the "zen" fortune file against the current working directory,
# parse it, and show the resulting frame.
zen_source = os.path.abspath("zen")
df = fortune2dataframe(zen_source)
df

Unnamed: 0,Quotes,Authors
0,"Zen is not some kind of excitement, but concentration on our usual everyday routine.",Shunryu Suzuki
1,"Since the time we were born from our mother's womb, the only thing we have seen is the present. We have never seen the past and we have never seen the future. Wherever we are, whatever time it is, it is only the present.",Khenpo Tsultrim Rinpoche
2,"Rather than being your thoughts and emotions, be the awareness behind them.",Eckhart Tolle
3,Your suffering is never caused by the person you are blaming.,Byron Katie
4,Death is not an error. It is not a failure. It is taking off a tight shoe.,Ram Dass
...,...,...
354,Washing a dish can be an act of enlightenment. It's delightful to wash the dishes!!,Thich Nhat Hanh
355,The absolute can only be comprehended through experience.,Mingyur Rinpoche
356,"What you think in your own mind to be good, or what people of the world think is good, is not necessarily good.",Dōgen Zenji
357,"Why seek a doctrine? As soon as you have a doctrine, you fall into dualistic thought.",Huang Po


# A Quick Checkup Before Cleaning
- Checking for **Duplicate rows** (same Quote & same Author)

In [14]:
# Rows that repeat an earlier (quote, author) pair; the first occurrence of
# each pair is not flagged.
duplicate_rows = df.duplicated(keep='first')
df.loc[duplicate_rows]

Unnamed: 0,Quotes,Authors
351,Those who seek the easy way do not seek the true way.,Dōgen Zenji
352,Those who seek the truth by means of intellect and learning only get further and further away from it.,Huang Po
353,Be aware of your breathing. Notice how this takes attention away from your thinking and creates space.,Eckhart Tolle
354,Washing a dish can be an act of enlightenment. It's delightful to wash the dishes!!,Thich Nhat Hanh
355,The absolute can only be comprehended through experience.,Mingyur Rinpoche
356,"What you think in your own mind to be good, or what people of the world think is good, is not necessarily good.",Dōgen Zenji
357,"Why seek a doctrine? As soon as you have a doctrine, you fall into dualistic thought.",Huang Po
358,Let life live you for a while instead of trying to make yourself live life.,Alan Watts


- Checking for **Duplicate Quotes** (Same Quote, Any Author)

In [15]:
# Rows whose quote text already appeared earlier, whoever the author is.
repeated_quotes = df.duplicated(subset=['Quotes'])
df.loc[repeated_quotes]

Unnamed: 0,Quotes,Authors
351,Those who seek the easy way do not seek the true way.,Dōgen Zenji
352,Those who seek the truth by means of intellect and learning only get further and further away from it.,Huang Po
353,Be aware of your breathing. Notice how this takes attention away from your thinking and creates space.,Eckhart Tolle
354,Washing a dish can be an act of enlightenment. It's delightful to wash the dishes!!,Thich Nhat Hanh
355,The absolute can only be comprehended through experience.,Mingyur Rinpoche
356,"What you think in your own mind to be good, or what people of the world think is good, is not necessarily good.",Dōgen Zenji
357,"Why seek a doctrine? As soon as you have a doctrine, you fall into dualistic thought.",Huang Po
358,Let life live you for a while instead of trying to make yourself live life.,Alan Watts


- Checking for **unwanted data** (typical issues left over from the initial parsing by the Twitter2Fortune bot)

In [16]:
# A notebook cell only renders its LAST expression, so the three checks are
# combined into a single mask; otherwise the two 'http' lookups would be
# evaluated and silently discarded.
# na=False treats a missing quote/author as "no match" so the mask stays
# purely boolean and can be used for indexing.
unwanted_rows = (
    df['Quotes'].str.contains('http', na=False)      # link left in the quote
    | df['Authors'].str.contains('http', na=False)   # link left in the author
    | df['Authors'].str.match('None', na=False)      # author starts with 'None'
)
df[unwanted_rows]

Unnamed: 0,Quotes,Authors


# Statistics
- **How Many Quotes do we have?**

In [17]:
# Number of non-missing entries in each column.
df.notna().sum()

Quotes     359
Authors    359
dtype: int64

- **Which Author is quoted the most frequently?**

In [18]:
# Count quotes per author, most quoted first, and print the complete list;
# the row limit is lifted only for the duration of this print.
author_counts = df['Authors'].value_counts()
with pd.option_context('display.max_rows', None):
    print(author_counts)

Eckhart Tolle                                                        43
Alan Watts                                                           28
Thich Nhat Hanh                                                      24
Dōgen Zenji                                                          22
Shunryu Suzuki                                                       16
Dalai Lama XIV                                                       12
Byron Katie                                                          12
Bodhidharma                                                           9
Lao Tzu                                                               8
Ajahn Chah                                                            7
Lao Tzu, Tao Te Ching                                                 6
Adyashanti                                                            6
Huang Po                                                              6
Haemin Sunim                                                    