In [0]:
from pyspark.sql import functions as F

# 1. Carregando dados do DBFS

In [0]:
# Base DBFS directory that holds every refined-layer parquet dataset used below.
REFINED_DIR = 'dbfs:/FileStore/refined'


def load_refined(name):
    """Read one refined-layer parquet dataset into a Spark DataFrame.

    Args:
        name: Dataset folder name under REFINED_DIR (e.g. 'topChannels').

    Returns:
        The DataFrame produced by spark.read.parquet for that folder.
    """
    return spark.read.parquet(REFINED_DIR + '/' + name)


# Rankings and their companion metrics tables.
topChannels = load_refined('topChannels')
topChannelsMetrics = load_refined('topChannelsMetrics')
topVideos = load_refined('topVideos')
topVideosMetrics = load_refined('topVideosMetrics')
mostSubscribed = load_refined('mostSubscribed')
mostSubscribedMetrics = load_refined('mostSubscribedMetrics')
mostPopularChannels = load_refined('mostPopularChannels')
mostPopularChannelsMetrics = load_refined('mostPopularChannelsMetrics')
mostPopularVideos = load_refined('mostPopularVideos')
mostPopularVideosMetrics = load_refined('mostPopularVideosMetrics')
# Lookup table joined on youtuber_id further down to obtain the channel name.
youtuberName = load_refined('youtuberName')

# 2. Visualização dos dados

In [0]:
# Preview every loaded DataFrame, each preceded by a blank line and its variable name.
loaded_frames = [
    ('topChannels', topChannels),
    ('topChannelsMetrics', topChannelsMetrics),
    ('topVideos', topVideos),
    ('topVideosMetrics', topVideosMetrics),
    ('mostSubscribed', mostSubscribed),
    ('mostSubscribedMetrics', mostSubscribedMetrics),
    ('mostPopularChannels', mostPopularChannels),
    ('mostPopularChannelsMetrics', mostPopularChannelsMetrics),
    ('mostPopularVideos', mostPopularVideos),
    ('mostPopularVideosMetrics', mostPopularVideosMetrics),
    ('youtuberName', youtuberName),
]

for label, frame in loaded_frames:
    print('\n' + label)
    display(frame)

# 3. Análise descritiva

## Top canais que também estão na lista de mais inscritos

In [0]:
# Top channels that also appear in the most-subscribed list.
# A left_semi join keeps only topChannels' columns (an inner join would add a
# second youtuber_id column) and returns each matching top channel once, even if
# its id were repeated on the right side. It is the exact complement of the
# left_anti queries used in the following sections.
subscribed_ids = mostSubscribed.select('youtuber_id')
top_and_subscribed = topChannels.join(
    subscribed_ids,
    topChannels.youtuber_id == mostSubscribed.youtuber_id,
    'left_semi',
)
display(top_and_subscribed.orderBy('rank'))

## Canais que ocupam a mesma posição no ranking em Mais Inscritos e Top Canais

In [0]:
# Channels that hold the same rank in both topChannels and mostSubscribed:
# the join requires the rank AND the youtuber_id to match.
same_position = (
    (topChannels.rank == mostSubscribed.rank)
    & (topChannels.youtuber_id == mostSubscribed.youtuber_id)
)
rank_igual = (
    topChannels
    .join(mostSubscribed, same_position, 'inner')
    .select(topChannels['youtuber_id'], topChannels['rank'])
    .orderBy(F.asc('rank'))
)
display(rank_igual)

## Top canais que não estão na lista de mais inscritos

In [0]:
# Top channels whose youtuber_id has no match in the most-subscribed list (anti join).
subscribed_ids = mostSubscribed.select('youtuber_id')
top_not_subscribed = topChannels.join(
    subscribed_ids,
    topChannels.youtuber_id == mostSubscribed.youtuber_id,
    'left_anti',
)
display(top_not_subscribed.orderBy('rank'))

## Canais mais inscritos que não são Top canais

In [0]:
# Most-subscribed channels whose youtuber_id has no match in topChannels (anti join).
top_ids = topChannels.select('youtuber_id')
subscribed_not_top = mostSubscribed.join(
    top_ids,
    mostSubscribed.youtuber_id == topChannels.youtuber_id,
    'left_anti',
)
display(subscribed_not_top.orderBy('rank'))

## Top channels com quantidade de subscribers maior ou igual à média de subscribers da tabela

In [0]:
# Bring the table-wide average of `subscribers` to the driver as a plain Python value,
# then keep only the channels at or above that average.
mean_subscribers = topChannels.agg(F.mean(F.col('subscribers'))).first()[0]
at_or_above_mean = F.col('subscribers') >= mean_subscribers
display(topChannels.filter(at_or_above_mean))

## Média de views por vídeo do canal

In [0]:
# Add a 'views/count' column: video_views divided by video_count, rounded to an integer value.
views_per_video = F.round(F.col('video_views') / F.col('video_count'))
display(topChannels.withColumn('views/count', views_per_video))

## Nome do canal e categoria dos youtubers mais antigos

In [0]:
# Ten channels with the earliest `started` value.
# Spark's default ascending sort places NULLs first, so rows with an unknown
# start would otherwise be picked as the "oldest"; asc_nulls_last pushes them
# to the end instead.
oldestesChannels = topChannels.orderBy(F.col('started').asc_nulls_last()).limit(10)

# Attach the channel name. A join does not guarantee that the pre-join ordering
# survives, so the result is sorted again before being displayed.
oldest_named = (
    oldestesChannels
    .join(youtuberName, oldestesChannels.youtuber_id == youtuberName.youtuber_id, 'inner')
    .select('started', 'youtuber', 'category')
    .orderBy(F.col('started').asc_nulls_last())
)
display(oldest_named)

## Nome do canal e categoria dos youtubers mais recentes

In [0]:
# Ten channels with the latest `started` value (descending sort keeps NULLs last).
newchannels = topChannels.orderBy(F.col('started').desc()).limit(10)

# Attach the channel name. A join does not guarantee that the pre-join ordering
# survives, so the result is sorted again before being displayed.
newest_named = (
    newchannels
    .join(youtuberName, newchannels.youtuber_id == youtuberName.youtuber_id, 'inner')
    .select('started', 'youtuber', 'category')
    .orderBy(F.col('started').desc())
)
display(newest_named)

## Quantidade de canais por categoria

In [0]:
# Number of mostSubscribed rows in each category.
channels_per_category = mostSubscribed.groupBy('category').count()
display(channels_per_category)

## Categoria com maior quantidade de views

In [0]:
# Total video_views per category in mostSubscribed, largest total first.
# The aggregate keeps Spark's default column name, 'sum(video_views)'.
views_per_category = mostSubscribed.groupBy('category').agg(F.sum('video_views'))
display(views_per_category.orderBy(F.desc('sum(video_views)')))

## Média de subscribers por ano de cada canal

In [0]:
# Year used as "now" when computing each channel's age.
REFERENCE_YEAR = 2023

# age = REFERENCE_YEAR - started; subscribers/age = subscribers divided by age, rounded.
channel_age = F.lit(REFERENCE_YEAR) - F.col('started')
subsPerYear = mostSubscribed.select(
    'subscribers',
    channel_age.alias('age'),
    F.round(F.col('subscribers') / channel_age).alias('subscribers/age'),
    'youtuber_id',
)

# Attach the channel name through youtuber_id and show the per-year figures.
subs_per_year_named = subsPerYear.join(
    youtuberName,
    subsPerYear.youtuber_id == youtuberName.youtuber_id,
    'inner',
)
display(subs_per_year_named.select('youtuber', 'age', 'subscribers', 'subscribers/age'))

## Categorias com mais canais no Top Canais

In [0]:
# Ten categories with the most rows in topChannels, largest count first.
mostviewed_category = (
    topChannels
    .groupBy('category')
    .count()
    .orderBy(F.desc('count'))
    .limit(10)
)
display(mostviewed_category)

## Categorias que mais aparecem nas tabelas Top Canais, Canais Mais Populares e Mais Inscritos

In [0]:
# How often each category appears across topChannels, mostSubscribed and
# mostPopularChannels combined.
# Joining the tables on `category` (a non-unique key) pairs every row of one
# table with every same-category row of the others, so the resulting count is
# the PRODUCT of the per-table counts and the intermediate result grows
# multiplicatively. Stacking the three `category` columns and counting gives
# the actual number of appearances.
# NOTE(review): this also lists categories missing from one or two of the
# tables, which the inner joins used to drop — confirm that is the intent.
category_appearances = (
    topChannels.select('category')
    .union(mostSubscribed.select('category'))
    .union(mostPopularChannels.select('category'))
)
top_inscritos_pop = (
    category_appearances
    .groupBy('category')
    .count()
    .orderBy(F.col('count').desc())
)
display(top_inscritos_pop)