## Overview

1. ConsoleGames.csv
a historic list of all console games released between 1980 and 2015

2. ConsoleDates.csv
a historic list of all console platforms (such as Wii, PlayStation, Xbox) and information about them


### Queries

1. Calculate what % of Global Sales were made in North America
2. Extract a view of the console game titles ordered by platform name in ascending order and year of release in descending order
3. For each game title extract the first four letters of the publisher's name
4. Display all console platforms which were released either just before Black Friday or just before Christmas (in any year)
5. Order the platforms by their longevity in ascending order (i.e. the platform which was available for the longest at the bottom)

In [2]:
from pyspark.sql.functions import count, desc , col, max, struct, substring, to_date, asc, sum

In [3]:
# Input paths for the two datasets, plus the format shared by both reads
file_location_consolegames = "/FileStore/tables/P9_ConsoleGames.csv"
file_location_consoledates = "/FileStore/tables/P9_ConsoleDates.csv"
file_type = "csv"

# Reader settings for CSV input
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Both files are read with the same settings, so collect them once.
# These options apply to CSV files; other file types ignore them.
csv_options = {
    "inferSchema": infer_schema,
    "header": first_row_is_header,
    "sep": delimiter,
}

# Console games dataset
games_df = (
    spark.read.format(file_type)
    .options(**csv_options)
    .load(file_location_consolegames)
)

# Console platforms dataset
dates_df = (
    spark.read.format(file_type)
    .options(**csv_options)
    .load(file_location_consoledates)
)

display(dates_df)

In [4]:
# Show the console games DataFrame in the notebook output
display(games_df)

In [5]:
# Query 1: calculate what % of global sales were made in North America.
#
# Global sales are computed here as the sum of the four regional totals.
# `sum` is pyspark.sql.functions.sum (see the notebook's import cell), i.e. a
# column aggregate, not Python's builtin sum.
q1_0 = games_df.select(
    (sum('NA_Sales') + sum('EU_Sales') + sum('JP_Sales') + sum('Other_Sales')).alias('GlobalSales'),
    # The numerator has to be the North American total to answer the question.
    sum('NA_Sales').alias('Total_NA_Sales'),
)

# Share of global sales made in North America, as a percentage.
q1_1 = q1_0.select(((q1_0.Total_NA_Sales / q1_0.GlobalSales) * 100).alias('GlobalSalesUSPercentage'))
display(q1_0)
display(q1_1)

In [6]:
# Query 2: extract a view of the console game titles ordered by platform name
# in ascending order and year of release in descending order.
#
# Both sort keys are passed to a single orderBy. Chaining two orderBy calls
# does not compose them: the later call re-sorts the whole result by its own
# key, so the year ordering within a platform would not be guaranteed.
q2 = games_df.select('Name', 'Platform', 'Year').orderBy(asc('Platform'), desc('Year'))

display(q2)

In [7]:
# Query 3: for each game title, extract the first four letters of the
# publisher's name.
#
# substring() positions are 1-based: start at character 1 and take 4 characters.
# Grouping by (Name, Publisher) yields one row per distinct title/prefix pair,
# with a count of how many source rows share that pair.
q3 = (
    games_df
    .select('Name', substring(games_df.Publisher, 1, 4).alias('Publisher'))
    .groupby('Name', 'Publisher')
    .agg(count('name'))
)
display(q3)

In [8]:
# Order the platforms by their longevity in ascending order (i.e. the platform which was available for the longest at the bottom)
# NOTE(review): the code below only converts FirstRetailAvailability to a date and
# sorts by it, newest release first. It does not compute how long each platform
# was available. Computing longevity presumably needs an end-of-availability
# column from dates_df — TODO confirm the schema and implement.
q4 = dates_df.select('Platform', to_date(dates_df.FirstRetailAvailability).alias('Release_Date')).orderBy(desc('Release_Date'))

display(q4)