## Importing

In [0]:
import pyspark

from pyspark.sql import SparkSession

from pyspark.sql.functions import col, countDistinct

# Reuse the active Spark session if one exists, otherwise start one named "imputer".
spark = (
    SparkSession.builder
    .appName("imputer")
    .getOrCreate()
)

## Loading Data

In [0]:
# Read the comma-separated stats file, using the first line as the header
# and letting Spark infer each column's type.
raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load("/FileStore/tables/gc_stats.csv")
)

## Dataset Description

#### Taking a Look at the first rows

In [0]:
# Preview the first five rows of the raw dataset.
display(raw_df.take(5))

idLobbyGame,idPlayer,idRoom,qtKill,qtAssist,qtDeath,qtHs,qtBombeDefuse,qtBombePlant,qtTk,qtTkAssist,qt1Kill,qt2Kill,qt3Kill,qt4Kill,qt5Kill,qtPlusKill,qtFirstKill,vlDamage,qtHits,qtShots,qtLastAlive,qtClutchWon,qtRoundsPlayed,descMapName,vlLevel,qtSurvived,qtTrade,qtFlashAssist,qtHitHeadshot,qtHitChest,qtHitStomach,qtHitLeftAtm,qtHitRightArm,qtHitLeftLeg,qtHitRightLeg,flWinner,dtCreatedAt
1,1,1,5,1,16,2,0,0,0.0,0.0,3,1,0,0,0,0,1,911,25.0,212,0.0,0,17,de_mirage,10,1.0,2.0,0.0,3.0,13.0,4.0,2.0,2.0,1.0,0.0,0,2022-01-21T19:45:44.000+0000
2,1,2,24,3,18,6,0,4,0.0,1.0,9,4,1,1,0,0,3,2933,54.0,597,1.0,1,27,de_nuke,9,8.0,1.0,0.0,7.0,26.0,14.0,2.0,1.0,1.0,3.0,1,2022-02-04T02:09:47.000+0000
3,2,3,6,4,23,2,0,1,0.0,1.0,4,1,0,0,0,0,2,1117,31.0,207,0.0,0,25,de_mirage,2,2.0,3.0,0.0,3.0,15.0,8.0,1.0,2.0,0.0,2.0,0,2021-09-18T18:07:43.000+0000
3,391,27508,10,5,20,4,1,0,0.0,0.0,6,2,0,0,0,0,1,1740,63.0,411,1.0,1,25,de_mirage,15,3.0,4.0,0.0,6.0,27.0,10.0,1.0,7.0,6.0,6.0,1,2021-09-18T18:07:43.000+0000
4,2,4,8,4,26,6,0,2,0.0,0.0,4,2,0,0,0,0,1,1696,51.0,324,0.0,0,30,de_nuke,2,4.0,4.0,2.0,8.0,19.0,12.0,2.0,3.0,2.0,5.0,0,2021-09-27T00:17:45.000+0000


In [0]:
# List every column name on one line. The names are joined directly: the
# previous pass-through comprehension was redundant and its loop variable
# reused the name of the imported `col` function.
print('Dataset cols: \n' + ', '.join(raw_df.schema.names))

Dataset cols: 
idLobbyGame, idPlayer, idRoom, qtKill, qtAssist, qtDeath, qtHs, qtBombeDefuse, qtBombePlant, qtTk, qtTkAssist, qt1Kill, qt2Kill, qt3Kill, qt4Kill, qt5Kill, qtPlusKill, qtFirstKill, vlDamage, qtHits, qtShots, qtLastAlive, qtClutchWon, qtRoundsPlayed, descMapName, vlLevel, qtSurvived, qtTrade, qtFlashAssist, qtHitHeadshot, qtHitChest, qtHitStomach, qtHitLeftAtm, qtHitRightArm, qtHitLeftLeg, qtHitRightLeg, flWinner, dtCreatedAt


#### Getting the dataset shape

In [0]:
# Report the dataset shape: row count (a Spark action) followed by column count.
print(
    f"Number of rows: {raw_df.count()}",
    f"Number of columns: {len(raw_df.columns)}",
    sep="\n",
)

Number of rows: 184152
Number of columns: 38


#### Getting the columns' schema

Further investigation will be needed to check the data type consistency of each column.

In [0]:
# Print each column's name, data type and nullability as a tree.
# The types shown were inferred when the CSV was loaded (inferSchema=True).
raw_df.printSchema()

root
 |-- idLobbyGame: integer (nullable = true)
 |-- idPlayer: integer (nullable = true)
 |-- idRoom: integer (nullable = true)
 |-- qtKill: integer (nullable = true)
 |-- qtAssist: integer (nullable = true)
 |-- qtDeath: integer (nullable = true)
 |-- qtHs: integer (nullable = true)
 |-- qtBombeDefuse: integer (nullable = true)
 |-- qtBombePlant: integer (nullable = true)
 |-- qtTk: double (nullable = true)
 |-- qtTkAssist: double (nullable = true)
 |-- qt1Kill: integer (nullable = true)
 |-- qt2Kill: integer (nullable = true)
 |-- qt3Kill: integer (nullable = true)
 |-- qt4Kill: integer (nullable = true)
 |-- qt5Kill: integer (nullable = true)
 |-- qtPlusKill: integer (nullable = true)
 |-- qtFirstKill: integer (nullable = true)
 |-- vlDamage: integer (nullable = true)
 |-- qtHits: double (nullable = true)
 |-- qtShots: integer (nullable = true)
 |-- qtLastAlive: double (nullable = true)
 |-- qtClutchWon: integer (nullable = true)
 |-- qtRoundsPlayed: integer (nullable = true)
 |-- 

#### Counting the number of distinct games

In [0]:
# Global aggregate: number of distinct idLobbyGame values in the dataset.
raw_df.agg(countDistinct("idLobbyGame")).show()

+---------------------------+
|count(DISTINCT idLobbyGame)|
+---------------------------+
|                     172911|
+---------------------------+



#### Counting the number of unique players

In [0]:
# Global aggregate: number of distinct idPlayer values in the dataset.
raw_df.agg(countDistinct("idPlayer")).show()

+------------------------+
|count(DISTINCT idPlayer)|
+------------------------+
|                    2469|
+------------------------+



#### Selecting the top 10 players with the most games

In [0]:
# Count rows per player, rename the count column to 'counting', and show the
# ten players with the highest counts.
display(
    (
        raw_df
        .groupBy("idPlayer")
        .count()
        .withColumnRenamed('count', 'counting')
        .orderBy(col('counting').desc())
        .take(10)
    )
)

idPlayer,counting
1922,873
1663,746
2289,677
20,666
1335,625
10,615
2264,610
2077,603
321,600
65,586
