In [1]:
#!/usr/bin/python3
# -*- coding: UTF8 -*-
# Author: Nicolas Flandrois
# Date: Monday, May 20th, 2020

In [2]:
import os
import pandas as pd

# Show full cell contents: quotes are long and would otherwise be truncated.
pd.options.display.max_colwidth = None

In [3]:
def fortune2dataframe(source_file):
    """Parse a fortune-mod formatted file into a pandas DataFrame.

    Records are expected to be separated by a line that contains only ``%``.
    Inside a record, the quote and its author are separated by five spaces,
    an em dash and one space (``'     — '``). Line breaks inside a record are
    removed without inserting a space.

    Parameters
    ----------
    source_file : str or path-like
        Path of the fortune file to read; it is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty record, with the columns ``Quotes`` and
        ``Authors``. ``Authors`` is ``None`` for a record that has no author
        separator. Both fields are stripped of surrounding whitespace.
    """
    author_separator = '     — '

    # Explicit encoding: the data holds non-ASCII characters (em dash,
    # accented names), so the platform default must not be relied upon.
    with open(source_file, 'r', encoding='utf-8') as f:
        data = f.read()

    # Only a line made of a lone '%' closes a record, so a percent sign that
    # is part of a quote ("100%") no longer cuts the quote in two.
    records = []
    current_lines = []
    for line in data.split('\n'):
        if line.rstrip() == '%':
            records.append(''.join(current_lines))
            current_lines = []
        else:
            current_lines.append(line)
    records.append(''.join(current_lines))

    rows = []
    for record in records:
        record = record.strip()
        if not record:
            # Skip blank or whitespace-only records instead of emitting
            # an empty row.
            continue
        # Split on the LAST separator only, so every row has exactly two
        # fields even if the separator pattern also occurs inside the quote.
        quote, found, author = record.rpartition(author_separator)
        if found:
            rows.append((quote.strip(), author.strip()))
        else:
            rows.append((record, None))

    return pd.DataFrame(rows, columns=['Quotes', 'Authors'])

**Define the Dataframe** & **Sampling**

In [4]:
# Resolve the fortune file (named "zen", in the current working directory)
# to an absolute path, parse it, and display the resulting frame.
zen_path = os.path.abspath("zen")
df = fortune2dataframe(zen_path)
df

Unnamed: 0,Quotes,Authors
0,"Zen is not some kind of excitement, but concentration on our usual everyday routine.",Shunryu Suzuki
1,"Since the time we were born from our mother's womb, the only thing we have seen is the present. We have never seen the past and we have never seen the future. Wherever we are, whatever time it is, it is only the present.",Khenpo Tsultrim Rinpoche
2,"Rather than being your thoughts and emotions, be the awareness behind them.",Eckhart Tolle
3,Your suffering is never caused by the person you are blaming.,Byron Katie
4,Death is not an error. It is not a failure. It is taking off a tight shoe.,Ram Dass
...,...,...
342,"Whatever comes up, see what is without calling it right or wrong.",Pema Chödrön
343,You can't live at all unless you can live fully now.,Alan Watts
344,"My speaking is meant to shake you awake, not to tell you how to dream better.",Adyashanti
345,Tackling destructive emotions and practicing loving-kindness is how we should live in the here and now.,Dalai Lama XIV


# A quick check-up before cleaning
- Checking for **Duplicate rows** (same Quote & same Author)

In [5]:
# Rows that repeat an earlier row on every column (same quote AND same author).
df.loc[df.duplicated()]

Unnamed: 0,Quotes,Authors


- Checking for **Duplicate Quotes** (Same Quote, Any Author)

In [6]:
# Rows whose quote text repeats an earlier row, whoever the author is.
df.loc[df.duplicated(subset=['Quotes'])]

Unnamed: 0,Quotes,Authors


- Checking for **unwanted data** (typical issues left over from the initial parsing by the Twitter2Fortune bot)

In [7]:
# A notebook cell only renders its LAST bare expression, so three separate
# filters would silently hide the first two results. Build one mask per
# check and display their union instead.
# regex=False: 'http' is a literal substring, not a pattern.
# na=False: a missing value counts as "no match" instead of putting NA in
# the mask, which boolean indexing rejects.
url_in_quote = df['Quotes'].str.contains('http', regex=False, na=False)
url_in_author = df['Authors'].str.contains('http', regex=False, na=False)
# Author is either really missing or the literal text 'None'
# (exact comparison, not a prefix match).
author_missing = df['Authors'].isna() | df['Authors'].eq('None')

df[url_in_quote | url_in_author | author_missing]

Unnamed: 0,Quotes,Authors


# Statistics
- **How Many Quotes do we have?**

In [8]:
# Number of non-null entries in each column.
df.notna().sum()

Quotes     347
Authors    347
dtype: int64

- **Which Author is quoted the most frequently?**

In [10]:
# Number of quotes per author, most quoted first.
author_counts = df['Authors'].value_counts()

# Lift the row limit for this print only, so every author is listed.
with pd.option_context('display.max_rows', None):
    print(author_counts)

Eckhart Tolle                                                        42
Alan Watts                                                           27
Thich Nhat Hanh                                                      23
Dōgen Zenji                                                          20
Shunryu Suzuki                                                       15
Byron Katie                                                          12
Dalai Lama XIV                                                       12
Bodhidharma                                                           9
Lao Tzu                                                               8
Ajahn Chah                                                            7
Adyashanti                                                            6
Lao Tzu, Tao Te Ching                                                 6
Haemin Sunim                                                          6
Jack Kornfield                                                  