In [1]:
#initializing pyspark
import findspark
findspark.init()

In [2]:
#building sparksession
import pyspark
import pyspark.sql.functions as sqlfunc
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [3]:
#reading CSV
df = spark.read.csv("challenge.csv", header = True)
df.show(5)

+--------------+---------+----------------+----------+
|    ip_address|  Country|     Domain Name|Bytes_used|
+--------------+---------+----------------+----------+
| 52.81.192.172|    China|odnoklassniki.ru|       463|
|119.239.207.13|    China|        youtu.be|        51|
| 68.69.217.210|    China|       adobe.com|        10|
|  7.191.21.223| Bulgaria|    linkedin.com|       853|
|  211.13.10.68|Indonesia|         hud.gov|        29|
+--------------+---------+----------------+----------+
only showing top 5 rows



In [4]:
#creating a new column 'is_country_mexico?'
from pyspark.sql.functions import *
df_1 = df.withColumn("is_country_mexico?", when(df.Country == "Mexico", "Yes").otherwise("No"))
df_1.show(50)

+---------------+--------------------+--------------------+----------+------------------+
|     ip_address|             Country|         Domain Name|Bytes_used|is_country_mexico?|
+---------------+--------------------+--------------------+----------+------------------+
|  52.81.192.172|               China|    odnoklassniki.ru|       463|                No|
| 119.239.207.13|               China|            youtu.be|        51|                No|
|  68.69.217.210|               China|           adobe.com|        10|                No|
|   7.191.21.223|            Bulgaria|        linkedin.com|       853|                No|
|   211.13.10.68|           Indonesia|             hud.gov|        29|                No|
|   239.80.21.97|            Suriname|          smh.com.au|       218|                No|
|106.214.106.233|             Jamaica|       amazonaws.com|        95|                No|
| 127.242.24.138|               China|    surveymonkey.com|       123|                No|
|     99.2

In [5]:
#applying 'groupby' condition on 'is_country_mexico' to get the total_Bytes_used
df_2 = df_1.groupby("is_country_mexico?").agg(sqlfunc.sum(df_1.Bytes_used).alias("total_used"))
df_2.show()

+------------------+----------+
|is_country_mexico?|total_used|
+------------------+----------+
|                No|  508076.0|
|               Yes|    6293.0|
+------------------+----------+



In [6]:
#applying 'groupby' condition on 'Country' to get the distinct_count_of_ip_addresses in descending order
df_3 = df_1.groupby('Country').agg(sqlfunc.countDistinct(df_1.ip_address).alias('distinct_number_of_ip_address'))
df_3.sort(col("distinct_number_of_ip_address").desc()).show()

+--------------+-----------------------------+
|       Country|distinct_number_of_ip_address|
+--------------+-----------------------------+
|         China|                          172|
|     Indonesia|                          114|
|   Philippines|                           65|
|        Russia|                           56|
|        Brazil|                           35|
|        Poland|                           31|
|        Sweden|                           28|
|         Japan|                           25|
|Czech Republic|                           23|
|      Portugal|                           23|
|        France|                           21|
|          Peru|                           19|
|      Colombia|                           17|
| United States|                           15|
|     Argentina|                           14|
|       Ukraine|                           14|
|        Mexico|                           13|
|      Thailand|                           12|
|       Niger