In [12]:
#!/usr/bin/python3
# -*- coding: UTF8 -*-
# Author: Nicolas Flandrois
# Date: Monday, May 20th, 2020

In [13]:
import os
import pandas as pd

# Never truncate cell contents, so that long quotes are displayed in full.
# The fully qualified option key is used: abbreviated keys are resolved by
# pattern matching and stop working if they ever match more than one option.
pd.set_option("display.max_colwidth", None)

In [14]:
def fortune2dataframe(source_file):
    """Parse a fortune-mod formatted file into a pandas DataFrame.

    The file is expected to hold one record per quote, consecutive records
    being separated by a line that contains only ``%``. Inside a record, the
    quote and its attribution are separated by five spaces, an em dash and a
    space (``'     — '``). The lines of a record are concatenated without any
    joining character before the record is split.

    Parameters
    ----------
    source_file : str or path-like
        Path of the fortune file to read. The file is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty record, with the columns ``'Quotes'`` and
        ``'Authors'``. ``'Authors'`` is ``None`` when the record carries no
        attribution separator.
    """
    separator = '     — '

    # Decode explicitly: the separator itself is non-ASCII, so relying on the
    # platform's default encoding would make parsing machine-dependent.
    with open(source_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    # A record ends at a line holding only '%'. Cutting on the delimiter line
    # instead of on every '%' character keeps quotes such as "100% sure" whole.
    records = []
    current = []
    for line in lines:
        if line.strip() == '%':
            records.append(''.join(current))
            current = []
        else:
            current.append(line)
    records.append(''.join(current))

    rows = []
    for record in records:
        record = record.strip()
        # Filter after stripping so whitespace-only records do not become
        # empty rows.
        if not record:
            continue
        # Split on the LAST separator only: the attribution closes the record,
        # and a quote that happens to contain the separator must still yield
        # exactly two fields.
        quote, found, author = record.rpartition(separator)
        if found:
            rows.append([quote.strip(), author.strip()])
        else:
            rows.append([record, None])

    return pd.DataFrame(rows, columns=['Quotes', 'Authors'])

**Define the Dataframe** & **Sampling**

In [15]:
# Parse the fortune file named "zen", resolved against the current working
# directory, into the DataFrame that every following cell works on.
df = fortune2dataframe(os.path.abspath("zen"))
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,Quotes,Authors
0,"Zen is not some kind of excitement, but concentration on our usual everyday routine.",Shunryu Suzuki
1,"Since the time we were born from our mother's womb, the only thing we have seen is the present. We have never seen the past and we have never seen the future. Wherever we are, whatever time it is, it is only the present.",Khenpo Tsultrim Rinpoche
2,"Rather than being your thoughts and emotions, be the awareness behind them.",Eckhart Tolle
3,Your suffering is never caused by the person you are blaming.,Byron Katie
4,Death is not an error. It is not a failure. It is taking off a tight shoe.,Ram Dass
...,...,...
380,"Life and love generate effort, but effort will not generate them.",Alan Watts
381,"Much suffering, much unhappiness arises when you take each thought that comes into your head for the truth.",Eckhart Tolle
382,Prefer to be defeated in the presence of the wise than to excel among fools.,Dōgen Zenji
383,"Normally, we do not so much look at things as overlook them.",Alan Watts


# A bit of checkup for cleaning
- Checking for **Duplicate rows** (same Quote & same Author)

In [16]:
# Rows that repeat an earlier row in full (same quote and same author).
df.loc[df.duplicated()]

Unnamed: 0,Quotes,Authors


- Checking for **Duplicate Quotes** (Same Quote, Any Author)

In [17]:
# Rows whose quote text already appeared earlier, whoever the author is.
df.loc[df.duplicated(subset=['Quotes'])]

Unnamed: 0,Quotes,Authors


- Checking for **unwanted data** (typical issues from the initial parsing by the Twitter2Fortune bot)

In [18]:
# Rows showing a typical parsing artefact: a URL left in the quote or in the
# author, or an author starting with the literal text 'None'.
# The three checks are OR-ed into one mask because a cell only renders its
# last expression; written as separate statements, the first two results
# were never shown. na=False makes rows without an author count as
# non-matching instead of yielding NaN, which cannot be used in a boolean mask.
df[
    df['Quotes'].str.contains('http', na=False)
    | df['Authors'].str.contains('http', na=False)
    | df['Authors'].str.match('None', na=False)
]

Unnamed: 0,Quotes,Authors


# Statistics
- **How Many Quotes do we have?**

In [19]:
# Number of rows holding an actual (non-missing) quote.
df['Quotes'].notna().sum()

385

- **How many Authors do we have?**

In [20]:
# Number of quotes per distinct value of the 'Authors' column; the result is
# reused by the following cells.
# NOTE(review): authors are grouped on their exact text, and the printed
# ranking lists 'Lao Tzu' and 'Lao Tzu, Tao Te Ching' as two entries, so the
# figure below counts spellings rather than people - confirm whether such
# variants should be merged first.
# NOTE(review): presumably rows without an author are left out by groupby's
# default NaN handling - verify if unattributed quotes must be counted.
auth_grp = df.groupby('Authors').size()
# One entry per group, so this is the number of distinct author labels.
auth_grp.count()

128

- **Which Author is quoted the most frequently?**

In [21]:
# Temporarily lift the row display limit so the whole ranking is printed,
# most quoted author first. print() is needed because an expression inside a
# `with` block is not rendered as the cell output.
with pd.option_context('display.max_rows', None):
    print(auth_grp.sort_values(ascending=False))

Authors
Eckhart Tolle                                                        48
Alan Watts                                                           30
Thich Nhat Hanh                                                      26
Dōgen Zenji                                                          21
Shunryu Suzuki                                                       16
Dalai Lama XIV                                                       14
Byron Katie                                                          13
Bruce Lee                                                            11
Lao Tzu                                                               9
Bodhidharma                                                           9
Adyashanti                                                            7
Ajahn Chah                                                            7
Lao Tzu, Tao Te Ching                                                 6
Haemin Sunim                                            

- List of Authors in *alphabetical order*, with quotation frequency

In [22]:
# Print every author with its quote count, in the order produced by the
# groupby (alphabetical, as the output below shows), with the row display
# limit lifted for the duration of the block.
with pd.option_context('display.max_rows', None):
    print(auth_grp)

Authors
17th Karmapa                                                          1
Adyashanti                                                            7
Ajahn Brahm                                                           3
Ajahn Chah                                                            7
Alan Watts                                                           30
Alfred Korzybski                                                      1
Alfred North Whitehead                                                1
Allen Ginsberg                                                        1
B. D. Schiers                                                         3
Basho                                                                 1
Bodhidharma                                                           9
Bokar Rinpoche                                                        1
Brad Warner                                                           1
Bruce Lee                                               