In [0]:
# Start from a clean slate: recursively delete the 'raw' and 'refined'
# layer roots, which also removes every dataset stored beneath them
# (topSubscribed, channels, subscriptions, videos, categories)
for layer_root in ('dbfs:/FileStore/raw', 'dbfs:/FileStore/refined'):
    dbutils.fs.rm(layer_root, True)  # second argument: recurse into the directory

# Confirm that the cleanup cell ran to completion
print('All previously created layers removed!')

All previously created layers removed!


In [0]:
# Import libraries
import numpy as np
import pandas as pd
import pyspark as ps
from pyspark.sql import functions as F



# Extraction

## Import CSV file as a PySpark dataframe

In [0]:
# Path of the source CSV file
path_to_csv = '/FileStore/tables/topSubscribed.csv'

# Configure a CSV reader: first row is the header, fields are comma-separated,
# and schema inference is off, so every column is loaded as a string
csv_reader = spark.read.format('csv').options(
    inferSchema='false',
    header='true',
    sep=',',
)

# Load the file into a Spark dataframe
df = csv_reader.load(path_to_csv)

# Show the raw rows, then the raw (all-string) schema
display(df)
df.printSchema()

Rank,Youtube Channel,Subscribers,Video Views,Video Count,Category,Started
1,T-Series,234000000,212900271553,18515,Music,2006
2,YouTube Movies,161000000,0,0,Film & Animation,2015
3,Cocomelon - Nursery Rhymes,152000000,149084178448,846,Education,2006
4,SET India,150000000,137828094104,103200,Shows,2006
5,MrBeast,128000000,21549128785,733,Entertainment,2012
6,Music,118000000,0,0,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels,2013
7,PewDiePie,111000000,28851883250,4694,Gaming,2010
8,✿ Kids Diana Show,106000000,86638570921,1056,People & Blogs,2015
9,Like Nastya,104000000,87202935675,754,People & Blogs,2016
10,Gaming,93300000,0,0,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels,2013


root
 |-- Rank: string (nullable = true)
 |-- Youtube Channel: string (nullable = true)
 |-- Subscribers: string (nullable = true)
 |-- Video Views: string (nullable = true)
 |-- Video Count: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Started: string (nullable = true)



## Save raw dataframe as a parquet file (1st layer)

In [0]:
# Persist the untouched raw dataframe as parquet (1st layer).
# 'overwrite' keeps this cell re-runnable when the target directory already
# exists, matching the write mode used for the refined layer. The CSV-only
# 'header' option is dropped because it has no effect on parquet output.
df.write.format('parquet')\
        .mode('overwrite')\
        .save('dbfs:/FileStore/raw/topSubscribed')

# Transformation

## Clean column headers

In [0]:
# Print the schema to inspect the current column headers and types
# before the headers are renamed
df.printSchema()

root
 |-- Rank: string (nullable = true)
 |-- Youtube Channel: string (nullable = true)
 |-- Subscribers: string (nullable = true)
 |-- Video Views: string (nullable = true)
 |-- Video Count: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Started: string (nullable = true)



In [0]:
def _snake_case(header):
    """Return `header` stripped of surrounding whitespace, lowercased,
    and with each space replaced by an underscore."""
    return header.strip().lower().replace(' ', '_')


# Build the cleaned header list, preserving the original column order
newColumns = [_snake_case(header) for header in df.columns]

# Rebuild the dataframe under the cleaned headers
df = df.toDF(*newColumns)

# Print the schema to confirm the renamed columns
df.printSchema()

root
 |-- rank: string (nullable = true)
 |-- youtube_channel: string (nullable = true)
 |-- subscribers: string (nullable = true)
 |-- video_views: string (nullable = true)
 |-- video_count: string (nullable = true)
 |-- category: string (nullable = true)
 |-- started: string (nullable = true)



## Clean column values

In [0]:
# Display the dataframe rows to inspect the values before they are cleaned
display(df)

rank,youtube_channel,subscribers,video_views,video_count,category,started
1,T-Series,234000000,212900271553,18515,Music,2006
2,YouTube Movies,161000000,0,0,Film & Animation,2015
3,Cocomelon - Nursery Rhymes,152000000,149084178448,846,Education,2006
4,SET India,150000000,137828094104,103200,Shows,2006
5,MrBeast,128000000,21549128785,733,Entertainment,2012
6,Music,118000000,0,0,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels,2013
7,PewDiePie,111000000,28851883250,4694,Gaming,2010
8,✿ Kids Diana Show,106000000,86638570921,1056,People & Blogs,2015
9,Like Nastya,104000000,87202935675,754,People & Blogs,2016
10,Gaming,93300000,0,0,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels,2013


In [0]:
# Clean the column values:
#   - text columns (youtube_channel, category): trim surrounding whitespace,
#     then lowercase
#   - numeric-looking columns (subscribers, video_views, video_count): strip
#     commas so the later casts do not fail on thousands separators
# 'rank' and 'started' are left untouched here.
# NOTE(review): some rows carry a URL in 'category' instead of a category
# name; this step only lowercases it — decide whether those should be nulled.
df = (
    df
    .withColumn('youtube_channel', F.lower(F.trim(F.col('youtube_channel'))))
    .withColumn('subscribers', F.translate('subscribers', ',', ''))
    .withColumn('video_views', F.translate('video_views', ',', ''))
    .withColumn('video_count', F.translate('video_count', ',', ''))
    .withColumn('category', F.lower(F.trim(F.col('category'))))
)

# Show dataframe with updated values
display(df)

rank,youtube_channel,subscribers,video_views,video_count,category,started
1,t-series,234000000,212900271553,18515,music,2006
2,youtube movies,161000000,0,0,film & animation,2015
3,cocomelon - nursery rhymes,152000000,149084178448,846,education,2006
4,set india,150000000,137828094104,103200,shows,2006
5,mrbeast,128000000,21549128785,733,entertainment,2012
6,music,118000000,0,0,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels,2013
7,pewdiepie,111000000,28851883250,4694,gaming,2010
8,✿ kids diana show,106000000,86638570921,1056,people & blogs,2015
9,like nastya,104000000,87202935675,754,people & blogs,2016
10,gaming,93300000,0,0,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels,2013


## Update data types

In [0]:
# Print the schema to inspect the column data types before casting
df.printSchema()

root
 |-- rank: string (nullable = true)
 |-- youtube_channel: string (nullable = true)
 |-- subscribers: string (nullable = true)
 |-- video_views: string (nullable = true)
 |-- video_count: string (nullable = true)
 |-- category: string (nullable = true)
 |-- started: string (nullable = true)



In [0]:
# Target Spark type for each numeric column; 'video_views' is the only one
# cast to 'long', the rest are cast to 'int'
column_types = {
    'rank': 'int',
    'subscribers': 'int',
    'video_views': 'long',
    'video_count': 'int',
    'started': 'int',
}

# Cast the columns one by one; withColumn replaces each column in place,
# so the column order is unchanged
for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, F.col(column_name).cast(spark_type))

# Print the schema to confirm the new data types
df.printSchema()

root
 |-- rank: integer (nullable = true)
 |-- youtube_channel: string (nullable = true)
 |-- subscribers: integer (nullable = true)
 |-- video_views: long (nullable = true)
 |-- video_count: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- started: integer (nullable = true)



## Split dataframe into smaller dataframes (column subsets keyed by rank)

In [0]:
# Each refined dataframe is a column subset of `df`; 'rank' is included in
# every subset so the pieces share a common column

# 'channels': channel name and start year
df_chans_cols = ['rank', 'youtube_channel', 'started']
df_chans = df.select(*df_chans_cols)

# 'subscriptions': subscriber count
df_subs_cols = ['rank', 'subscribers']
df_subs = df.select(*df_subs_cols)

# 'videos': view and video counts
df_vids_cols = ['rank', 'video_views', 'video_count']
df_vids = df.select(*df_vids_cols)

# 'categories': channel category
df_cats_cols = ['rank', 'category']
df_cats = df.select(*df_cats_cols)

# Display the new dataframes in the order they were created
for refined_frame in (df_chans, df_subs, df_vids, df_cats):
    display(refined_frame)

rank,youtube_channel,started
1,t-series,2006
2,youtube movies,2015
3,cocomelon - nursery rhymes,2006
4,set india,2006
5,mrbeast,2012
6,music,2013
7,pewdiepie,2010
8,✿ kids diana show,2015
9,like nastya,2016
10,gaming,2013


rank,subscribers
1,234000000
2,161000000
3,152000000
4,150000000
5,128000000
6,118000000
7,111000000
8,106000000
9,104000000
10,93300000


rank,video_views,video_count
1,212900271553,18515
2,0,0
3,149084178448,846
4,137828094104,103200
5,21549128785,733
6,0,0
7,28851883250,4694
8,86638570921,1056
9,87202935675,754
10,0,0


rank,category
1,music
2,film & animation
3,education
4,shows
5,entertainment
6,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels
7,gaming
8,people & blogs
9,people & blogs
10,https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels


## Save new, refined dataframes as parquet files (2nd layer)

In [0]:
# Persist every refined dataframe as parquet (2nd layer).
# Each entry pairs a dataframe with its target directory; one loop replaces
# four copy-pasted write chains so the write settings cannot drift apart.
# 'overwrite' replaces any existing output, which keeps the cell re-runnable.
# The CSV-only 'header' option is dropped because it has no effect on
# parquet output.
refined_outputs = [
    (df_chans, 'dbfs:/FileStore/refined/channels'),
    (df_subs, 'dbfs:/FileStore/refined/subscriptions'),
    (df_vids, 'dbfs:/FileStore/refined/videos'),
    (df_cats, 'dbfs:/FileStore/refined/categories'),
]

for refined_df, target_path in refined_outputs:
    refined_df.write\
        .format('parquet')\
        .mode('overwrite')\
        .save(target_path)