# Ex2 - Filtering and Sorting Data

This time we are going to pull data directly from the internet.

### Step 1. Import the necessary libraries

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=1658327267f5ae19ab175caf422a085d4f54fefbc24208ff9d2c4b570eef4b08
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import expr, col
# Start (or reuse) a Spark session with the master set to "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
from pyspark.sql.functions import expr, col, mean

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/02_Filtering_%26_Sorting/Euro12/Euro_2012_stats_TEAM.csv).

In [None]:
!wget -O Euro_2012_stats_TEAM.csv https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/02_Filtering_%26_Sorting/Euro12/Euro_2012_stats_TEAM.csv

--2024-04-09 14:57:33--  https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/02_Filtering_%26_Sorting/Euro12/Euro_2012_stats_TEAM.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2319 (2.3K) [text/plain]
Saving to: ‘Euro_2012_stats_TEAM.csv’


2024-04-09 14:57:33 (28.7 MB/s) - ‘Euro_2012_stats_TEAM.csv’ saved [2319/2319]



### Step 3. Assign it to a variable called euro12.

In [None]:
# Read the downloaded CSV: the first row is the header and column types are inferred.
euro12 = (
    spark.read
    .options(sep=',', header=True, inferSchema=True)
    .csv("Euro_2012_stats_TEAM.csv")
)

In [None]:
# Inspect the column names and the data types Spark assigned to them.
euro12.schema

StructType([StructField('Team', StringType(), True), StructField('Goals', IntegerType(), True), StructField('Shots on target', IntegerType(), True), StructField('Shots off target', IntegerType(), True), StructField('Shooting Accuracy', StringType(), True), StructField('% Goals-to-shots', StringType(), True), StructField('Total shots (inc. Blocked)', IntegerType(), True), StructField('Hit Woodwork', IntegerType(), True), StructField('Penalty goals', IntegerType(), True), StructField('Penalties not scored', IntegerType(), True), StructField('Headed goals', IntegerType(), True), StructField('Passes', IntegerType(), True), StructField('Passes completed', IntegerType(), True), StructField('Passing Accuracy', StringType(), True), StructField('Touches', IntegerType(), True), StructField('Crosses', IntegerType(), True), StructField('Dribbles', IntegerType(), True), StructField('Corners Taken', IntegerType(), True), StructField('Tackles', IntegerType(), True), StructField('Clearances', IntegerT

In [None]:
# Preview the first ten rows of the dataset.
euro12.show(n=10)

+--------------+-----+---------------+----------------+-----------------+----------------+--------------------------+------------+-------------+--------------------+------------+------+----------------+----------------+-------+-------+--------+-------------+-------+----------+-------------+-------------------+------------+------+--------------+----------+--------------------+---------+--------------+--------+------------+---------+-------+--------+------------+
|          Team|Goals|Shots on target|Shots off target|Shooting Accuracy|% Goals-to-shots|Total shots (inc. Blocked)|Hit Woodwork|Penalty goals|Penalties not scored|Headed goals|Passes|Passes completed|Passing Accuracy|Touches|Crosses|Dribbles|Corners Taken|Tackles|Clearances|Interceptions|Clearances off line|Clean Sheets|Blocks|Goals conceded|Saves made|Saves-to-shots ratio|Fouls Won|Fouls Conceded|Offsides|Yellow Cards|Red Cards|Subs on|Subs off|Players Used|
+--------------+-----+---------------+----------------+-------------

### Step 4. Select only the Goals column.

In [None]:
# Project the single 'Goals' column and print it.
euro12.select('Goals').show()

+-----+
|Goals|
+-----+
|    4|
|    4|
|    4|
|    5|
|    3|
|   10|
|    5|
|    6|
|    2|
|    2|
|    6|
|    1|
|    5|
|   12|
|    5|
|    2|
+-----+



### Step 5. How many teams participated in the Euro2012?

In [None]:
# Count the distinct values in the 'Team' column.
(
    euro12
    .select('Team')
    .distinct()
    .count()
)

16

### Step 6. What is the number of columns in the dataset?

In [None]:
# len() of the column-name list answers "how many columns" with a single number.
len(euro12.columns)

['Team',
 'Goals',
 'Shots on target',
 'Shots off target',
 'Shooting Accuracy',
 '% Goals-to-shots',
 'Total shots (inc. Blocked)',
 'Hit Woodwork',
 'Penalty goals',
 'Penalties not scored',
 'Headed goals',
 'Passes',
 'Passes completed',
 'Passing Accuracy',
 'Touches',
 'Crosses',
 'Dribbles',
 'Corners Taken',
 'Tackles',
 'Clearances',
 'Interceptions',
 'Clearances off line',
 'Clean Sheets',
 'Blocks',
 'Goals conceded',
 'Saves made',
 'Saves-to-shots ratio',
 'Fouls Won',
 'Fouls Conceded',
 'Offsides',
 'Yellow Cards',
 'Red Cards',
 'Subs on',
 'Subs off',
 'Players Used']

### Step 7. View only the columns Team, Yellow Cards and Red Cards and assign them to a dataframe called discipline

In [None]:
# Keep only the team name and its card counts.
discipline = euro12.select(
    ['Team', 'Yellow Cards', 'Red Cards']
)

In [None]:
# Print the discipline DataFrame (show() displays up to 20 rows by default).
discipline.show()

+-------------------+------------+---------+
|               Team|Yellow Cards|Red Cards|
+-------------------+------------+---------+
|            Croatia|           9|        0|
|     Czech Republic|           7|        0|
|            Denmark|           4|        0|
|            England|           5|        0|
|             France|           6|        0|
|            Germany|           4|        0|
|             Greece|           9|        1|
|              Italy|          16|        0|
|        Netherlands|           5|        0|
|             Poland|           7|        1|
|           Portugal|          12|        0|
|Republic of Ireland|           6|        1|
|             Russia|           6|        0|
|              Spain|          11|        0|
|             Sweden|           7|        0|
|            Ukraine|           5|        0|
+-------------------+------------+---------+



### Step 8. Sort the teams by Red Cards, then by Yellow Cards

In [None]:
# Descending on both keys: red cards first, yellow cards break ties.
discipline.orderBy(
    col('Red Cards').desc(),
    col('Yellow Cards').desc(),
).show()

+-------------------+------------+---------+
|               Team|Yellow Cards|Red Cards|
+-------------------+------------+---------+
|             Greece|           9|        1|
|             Poland|           7|        1|
|Republic of Ireland|           6|        1|
|              Italy|          16|        0|
|           Portugal|          12|        0|
|              Spain|          11|        0|
|            Croatia|           9|        0|
|     Czech Republic|           7|        0|
|             Sweden|           7|        0|
|             France|           6|        0|
|             Russia|           6|        0|
|            England|           5|        0|
|        Netherlands|           5|        0|
|            Ukraine|           5|        0|
|            Denmark|           4|        0|
|            Germany|           4|        0|
+-------------------+------------+---------+



### Step 9. Calculate the mean Yellow Cards given per Team

In [None]:
# Average of the 'Yellow Cards' column over all rows (one row per team).
discipline.select(mean('Yellow Cards')).show()

+-----------------+
|avg(Yellow Cards)|
+-----------------+
|           7.4375|
+-----------------+



### Step 10. Filter teams that scored more than 6 goals

In [None]:
# Keep only the rows whose 'Goals' value is strictly greater than 6.
euro12.where(col('Goals') > 6).show()

+-------+-----+---------------+----------------+-----------------+----------------+--------------------------+------------+-------------+--------------------+------------+------+----------------+----------------+-------+-------+--------+-------------+-------+----------+-------------+-------------------+------------+------+--------------+----------+--------------------+---------+--------------+--------+------------+---------+-------+--------+------------+
|   Team|Goals|Shots on target|Shots off target|Shooting Accuracy|% Goals-to-shots|Total shots (inc. Blocked)|Hit Woodwork|Penalty goals|Penalties not scored|Headed goals|Passes|Passes completed|Passing Accuracy|Touches|Crosses|Dribbles|Corners Taken|Tackles|Clearances|Interceptions|Clearances off line|Clean Sheets|Blocks|Goals conceded|Saves made|Saves-to-shots ratio|Fouls Won|Fouls Conceded|Offsides|Yellow Cards|Red Cards|Subs on|Subs off|Players Used|
+-------+-----+---------------+----------------+-----------------+----------------

### Step 11. Select the teams that start with G

In [None]:
# Keep only the rows whose 'Team' value begins with the letter G.
euro12.filter(euro12['Team'].startswith('G')).show()

+-------+-----+---------------+----------------+-----------------+----------------+--------------------------+------------+-------------+--------------------+------------+------+----------------+----------------+-------+-------+--------+-------------+-------+----------+-------------+-------------------+------------+------+--------------+----------+--------------------+---------+--------------+--------+------------+---------+-------+--------+------------+
|   Team|Goals|Shots on target|Shots off target|Shooting Accuracy|% Goals-to-shots|Total shots (inc. Blocked)|Hit Woodwork|Penalty goals|Penalties not scored|Headed goals|Passes|Passes completed|Passing Accuracy|Touches|Crosses|Dribbles|Corners Taken|Tackles|Clearances|Interceptions|Clearances off line|Clean Sheets|Blocks|Goals conceded|Saves made|Saves-to-shots ratio|Fouls Won|Fouls Conceded|Offsides|Yellow Cards|Red Cards|Subs on|Subs off|Players Used|
+-------+-----+---------------+----------------+-----------------+----------------

### Step 12. Select the first 7 columns

In [None]:
# Store the full list of column names in a variable and display it.
columns = euro12.columns
columns

['Team',
 'Goals',
 'Shots on target',
 'Shots off target',
 'Shooting Accuracy',
 '% Goals-to-shots',
 'Total shots (inc. Blocked)',
 'Hit Woodwork',
 'Penalty goals',
 'Penalties not scored',
 'Headed goals',
 'Passes',
 'Passes completed',
 'Passing Accuracy',
 'Touches',
 'Crosses',
 'Dribbles',
 'Corners Taken',
 'Tackles',
 'Clearances',
 'Interceptions',
 'Clearances off line',
 'Clean Sheets',
 'Blocks',
 'Goals conceded',
 'Saves made',
 'Saves-to-shots ratio',
 'Fouls Won',
 'Fouls Conceded',
 'Offsides',
 'Yellow Cards',
 'Red Cards',
 'Subs on',
 'Subs off',
 'Players Used']

In [None]:
# Take the first seven column names; each is wrapped in backticks before
# being passed to col().
euro12.select(*[col(f'`{name}`') for name in euro12.columns[:7]]).show()

+-------------------+-----+---------------+----------------+-----------------+----------------+--------------------------+
|               Team|Goals|Shots on target|Shots off target|Shooting Accuracy|% Goals-to-shots|Total shots (inc. Blocked)|
+-------------------+-----+---------------+----------------+-----------------+----------------+--------------------------+
|            Croatia|    4|             13|              12|            51.9%|           16.0%|                        32|
|     Czech Republic|    4|             13|              18|            41.9%|           12.9%|                        39|
|            Denmark|    4|             10|              10|            50.0%|           20.0%|                        27|
|            England|    5|             11|              18|            50.0%|           17.2%|                        40|
|             France|    3|             22|              24|            37.9%|            6.5%|                        65|
|            Ger

### Step 13. Select all columns except the last 3.

In [None]:
# The slice [:-3] keeps every column name except the final three; the
# backticks quote names containing dots, e.g. 'Total shots (inc. Blocked)'.
euro12.select([col('`' + c + '`') for c in euro12.columns[:-3]]).show()

+-------+--------+------------+
|Subs on|Subs off|Players Used|
+-------+--------+------------+
|      9|       9|          16|
|     11|      11|          19|
|      7|       7|          15|
|     11|      11|          16|
|     11|      11|          19|
|     15|      15|          17|
|     12|      12|          20|
|     18|      18|          19|
|      7|       7|          15|
|      7|       7|          17|
|     14|      14|          16|
|     10|      10|          17|
|      7|       7|          16|
|     17|      17|          18|
|      9|       9|          18|
|      9|       9|          18|
+-------+--------+------------+



In [None]:
# Names of the last three columns in the dataset.
euro12.columns[-3:]


['Subs on', 'Subs off', 'Players Used']

### Step 14. Present only the Shooting Accuracy from England, Italy and Russia

In [None]:
# Show shooting accuracy (with the team name for context) for three teams.
(
    euro12
    .select('Shooting Accuracy', 'Team')
    .where(col('Team').isin(['England', 'Italy', 'Russia']))
    .show()
)

+-----------------+-------+
|Shooting Accuracy|   Team|
+-----------------+-------+
|            50.0%|England|
|            43.0%|  Italy|
|            22.5%| Russia|
+-----------------+-------+

