# Cleansing Meditations

In [None]:
import os
import pandas as pd
import re
from datetime import timedelta

In [None]:
# Column layout that every meditation-detail batch file must match exactly:
# the same names in the same order.
expected_columns = [
    'teacher_id',
    'meditation_id',
    'med_list_batch_id',
    'med_list_scrape_date',
    'med_list_scrape_status',
    'med_detail_batch_id',
    'med_detail_scrape_date',
    'med_detail_scrape_status',
    'title',
    'upload_date',
    'duration',
    'plays',
    'rating',
    'reviews',
    'track_type',
    'activity',
    'suitable_for',
    'topics',
    'description',
    'meditation_url',
    'image_url',  # need to distinguish between teacher and meditation images
]

In [None]:
# Load every batch data file in the med_detail_batch_files subdirectory, keep
# only the batches whose columns match expected_columns (same names, same
# order), and combine them into a single DataFrame.
batch_dir = '../data/med_detail_batch_files'

# Sort the listing so the batches are loaded in a reproducible order;
# os.listdir returns entries in arbitrary order.
batch_files_list = sorted(os.listdir(batch_dir))

batch_df_list = []

for batch_file in batch_files_list:
    batch_path = os.path.join(batch_dir, batch_file)

    # Skip sub-directories: only regular files can be batch data, and
    # read_csv would fail on a directory.
    if not os.path.isfile(batch_path):
        continue

    batch_df = pd.read_csv(batch_path, index_col=0)

    # Verify the batch has the correct columns in the correct order.
    # List equality checks both the length and every position.
    if list(batch_df.columns) == expected_columns:
        batch_df_list.append(batch_df)
    else:
        print('ERROR COLUMNS NOT AS EXPECTED', batch_file)

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when no usable batch was loaded.
if not batch_df_list:
    raise ValueError('No batch files with the expected columns found in ' + batch_dir)

meditations_df = pd.concat(batch_df_list)

In [None]:
# Show the (rows, columns) size of the combined batch data before cleansing.
meditations_df.shape

## Cleansing Rows
- Remove duplicate meditations
- Remove rows whose detail page was not found
- Remove rows whose title was not found

### Page not found

In [None]:
# Show meditations whose detail page could not be found.
is_page_not_found = meditations_df['med_detail_scrape_status'] == 'page not found'
page_not_found_rows = meditations_df.loc[is_page_not_found].copy()
page_not_found_rows

In [None]:
# Remove meditations whose detail page could not be found, then show the
# remaining (rows, columns).
page_was_found = meditations_df['med_detail_scrape_status'] != 'page not found'
meditations_df = meditations_df.loc[page_was_found]
meditations_df.shape

### Title not found

In [None]:
# Show meditations whose title could not be found.
# Try rerunning their batches to try to get their info again.
is_title_not_found = meditations_df['med_detail_scrape_status'] == 'title not found'
title_not_found_rows = meditations_df.loc[is_title_not_found].copy()
title_not_found_rows

In [None]:
# Remove meditations whose title could not be found, then show the remaining
# (rows, columns).
title_was_found = meditations_df['med_detail_scrape_status'] != 'title not found'
meditations_df = meditations_df.loc[title_was_found]
meditations_df.shape

### Duplicates

In [None]:
# Order the rows by meditation_id, then show every row whose meditation_id
# already appeared on an earlier row (the first occurrence is not flagged).
meditations_df = meditations_df.sort_values(by='meditation_id')
is_repeated_id = meditations_df.duplicated(subset=['meditation_id'])
meditations_df.loc[is_repeated_id]

In [None]:
# Keep only the first row seen for each meditation_id, then show the
# remaining (rows, columns).
is_repeated_id = meditations_df.duplicated(subset=['meditation_id'])
meditations_df = meditations_df.loc[~is_repeated_id]
meditations_df.shape

## Cleansing Columns

### Take subset of columns

In [None]:
# Keep only the columns listed below. Dropped from the raw batches:
# med_list_batch_id, med_list_scrape_date, med_list_scrape_status and
# med_detail_scrape_status.
# NOTE(review): med_detail_batch_id is still selected here even though its
# original inline comment read "Removing this column" -- confirm which is
# intended.
columns_to_keep = [
    'teacher_id',
    'meditation_id',
    'med_detail_batch_id',
    'med_detail_scrape_date',
    'title',
    'upload_date',
    'duration',
    'plays',
    'rating',
    'reviews',
    'track_type',
    'activity',
    'suitable_for',
    'topics',
    'description',
    'meditation_url',
    'image_url',
]
meditations_df = meditations_df[columns_to_keep]

In [None]:
# Cleanse the values column by column.
#
# Whole columns are transformed at once instead of looping with iterrows()
# and writing back through .loc[index, column]: a label-based write touches
# every row that shares the index label, and the frame is concatenated from
# separately indexed batch files, so labels may repeat and a row-by-row write
# could overwrite other rows.

MEDITATION_ID_PREFIX = '/guided-meditations/'
PLAYS_SUFFIX_MULTIPLIERS = {'k': 1000, 'm': 1000000}


def _strip_meditation_id_prefix(meditation_id):
    """Return meditation_id without a leading '/guided-meditations/'.

    Values that are not strings (e.g. NaN) or that do not start with the
    prefix are returned unchanged.
    """
    if isinstance(meditation_id, str) and meditation_id.startswith(MEDITATION_ID_PREFIX):
        return meditation_id[len(MEDITATION_ID_PREFIX):]
    return meditation_id


def _parse_play_count(raw_plays):
    """Convert a scraped play count to an int.

    Example values: 465, 45k, 3m. A missing value (NaN/None) becomes 0.
    Raises ValueError if the value is not a number with an optional
    'k' or 'm' suffix.
    """
    if pd.isna(raw_plays):
        return 0
    plays_text = str(raw_plays)
    multiplier = PLAYS_SUFFIX_MULTIPLIERS.get(plays_text[-1:])
    if multiplier is None:
        return int(raw_plays)
    # round() rather than int() so a float artefact such as 1000.9999999999999
    # is not truncated to the wrong count.
    return round(float(plays_text[:-1]) * multiplier)


#teacher_id               -- no changes

#meditation_id            -- remove prefix
meditations_df['meditation_id'] = meditations_df['meditation_id'].map(_strip_meditation_id_prefix)

#med_detail_scrape_date   -- no changes
#title                    -- no changes
#upload_date              -- no changes -- example value: 2017-02-10T10:40:33.000Z

#duration                 -- TODO: convert to a timedelta (hours, minutes, seconds).
#                            Example values: PT41M2S, PT6M28S, hours?, less than a minute?

#plays                    -- Example values: 465, 45k, 3m
meditations_df['plays'] = meditations_df['plays'].map(_parse_play_count)

#rating                   -- Example value: 4.63  Handling for null when change type to int?
#reviews                  -- no changes
#track_type               -- no changes -- Values: guided, talks, music
#activity                 -- no changes -- value; meditation  (Are there others?)
#suitable_for             -- no changes -- values: Everyone, Children, Experienced, Beginners
#topics                   -- separate table, will do later in notebook
#description              -- no changes
#meditation_url           -- no changes
#image_url                -- Will need to decide what, if anything, to store

In [None]:
#TEMP JUST TO TRY THINGS OUT
# Display the raw duration column, which is not yet converted.
meditations_df.duration

### Set correct data types

In [None]:
# Datatypes dictionary: intended dtype for each meditation column.
#
# Not currently used.
# NOTE(review): med_list_batch_id and med_list_scrape_status are keys here but
# are dropped when the column subset is taken, and med_detail_batch_id /
# med_detail_scrape_date have no entry -- reconcile before passing this to
# astype().
meditation_datatypes = {
    'teacher_id': 'str',
    'meditation_id': 'str',
    'med_list_batch_id': 'int',
    # 'med_list_scrape_date': 'str',
    'med_list_scrape_status': 'str',
    'title': 'str',
    # 'upload_date': 'str',
    # 'duration': 'str',  A timediff column?
    'plays': 'int',
    'rating': 'float',  # but None value will cause error?
    'reviews': 'int',
    'track_type': 'str',
    'activity': 'str',
    'suitable_for': 'str',
    # 'topics': 'str',
    'description': 'str',
    'meditation_url': 'str',
    'image_url': 'str',
}

In [None]:
# Parse the two date columns into pandas datetimes.
for date_column in ('med_detail_scrape_date', 'upload_date'):
    meditations_df[date_column] = pd.to_datetime(meditations_df[date_column])
# Casting the remaining columns with meditation_datatypes is not enabled yet:
#meditations_df = meditations_df.astype(dtype=meditation_datatypes)

### Save meditations data to csv

In [None]:
# Write the cleansed meditations to ../data/meditations_df.csv.
meditations_df.to_csv('../data/meditations_df.csv')

In [None]:
# Print pandas' summary of the cleansed DataFrame to check the result.
meditations_df.info()