### Configuration

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Build (or reuse) the notebook's SparkSession. The default MongoDB input and
# output URIs both point at spark.practice, and the MongoDB Spark connector is
# requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("aggregation")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/spark.practice")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1:27017/spark.practice")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [2]:
import os

# Directory scanned for CSV files (relative to the notebook's working directory).
path = "../../data/"

# Names of the DataFrames created below, in the order os.listdir returns the files.
df_list = []
for filename in os.listdir(path):
    if not filename.endswith(".csv"):
        continue
    # The text before the first "." becomes the variable name,
    # e.g. "supermarket_sales.csv" -> supermarket_sales.
    df_name = filename.split(".")[0]
    df = spark.read.csv(os.path.join(path, filename), inferSchema=True, header=True)
    # Bind the DataFrame to a notebook-level name without exec(): no source code
    # is assembled from the file name, so an unusual file name cannot execute
    # arbitrary statements or raise a SyntaxError.
    globals()[df_name] = df
    df_list.append(df_name)


In [3]:
# Show the names of the DataFrames created by the CSV-loading loop.
df_list

['fifa19',
 'googleplaystore',
 'nyc_air_bnb',
 'pga_tour_historical',
 'rec_crime_pfa',
 'Rep_vs_Dem_tweets',
 'students',
 'supermarket_sales',
 'Weather',
 'youtubevideos',
 'zomato']

In [4]:
# Print the column names and the types inferred when the CSV was read.
supermarket_sales.printSchema()

root
 |-- Invoice ID: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Unit price: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Tax 5%: double (nullable = true)
 |-- Total: double (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross margin percentage: double (nullable = true)
 |-- gross income: double (nullable = true)
 |-- Rating: double (nullable = true)



In [5]:
# Preview the first three rows as a pandas DataFrame.
supermarket_sales.limit(3).toPandas()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4


### Write this dataframe TO Mongo

In [6]:
# Write the supermarket data to the `supermarket` collection of the `spark`
# database. "overwrite" (rather than "append") keeps this cell idempotent:
# re-running the notebook replaces the collection contents instead of adding
# another copy of every row, so the read-back below sees the same dataset.
(
    supermarket_sales.write.format("mongo")
    .mode("overwrite")
    .option("database", "spark")
    .option("collection", "supermarket")
    .save()
)


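# Example (not executed): write another DataFrame, e.g. `students`, to the
# `practice` collection:
# students.write.format("mongo").mode("append").option("database",
# "spark").option("collection", "practice").save()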


### Load the same dataset FROM Mongo

In [8]:
# Read from MongoDB, passing the connection URI (database `spark`,
# collection `supermarket`) as a per-read option.
df = (
    spark.read.format("mongo")
    .option("spark.mongodb.input.uri", "mongodb://127.0.0.1/spark.supermarket")
    .load()
)

In [9]:
# Convert the DataFrame read from MongoDB to pandas for display.
# NOTE(review): this converts every row; an earlier preview cell uses
# .limit(3) before .toPandas() — consider doing the same here if the
# collection grows.
df.toPandas()

Unnamed: 0,Branch,City,Customer type,Date,Gender,Invoice ID,Payment,Product line,Quantity,Rating,Tax 5%,Time,Total,Unit price,_id,cogs,gross income,gross margin percentage
0,A,Yangon,Member,1/5/2019,Female,750-67-8428,Ewallet,Health and beauty,7,9.1,26.1415,13:08,548.9715,74.69,"(6144311c7a60c3187fbc1986,)",522.83,26.1415,4.761905
1,C,Naypyitaw,Normal,3/8/2019,Female,226-31-3081,Cash,Electronic accessories,5,9.6,3.8200,10:29,80.2200,15.28,"(6144311c7a60c3187fbc1987,)",76.40,3.8200,4.761905
2,A,Yangon,Normal,3/3/2019,Male,631-41-3108,Credit card,Home and lifestyle,7,7.4,16.2155,13:23,340.5255,46.33,"(6144311c7a60c3187fbc1988,)",324.31,16.2155,4.761905
3,A,Yangon,Member,1/27/2019,Male,123-19-1176,Ewallet,Health and beauty,8,8.4,23.2880,20:33,489.0480,58.22,"(6144311c7a60c3187fbc1989,)",465.76,23.2880,4.761905
4,A,Yangon,Normal,2/8/2019,Male,373-73-7910,Ewallet,Sports and travel,7,5.3,30.2085,10:37,634.3785,86.31,"(6144311c7a60c3187fbc198a,)",604.17,30.2085,4.761905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,C,Naypyitaw,Normal,1/29/2019,Male,233-67-5758,Ewallet,Health and beauty,1,6.2,2.0175,13:46,42.3675,40.35,"(6144311c7a60c3187fbc1d69,)",40.35,2.0175,4.761905
996,B,Mandalay,Normal,3/2/2019,Female,303-96-2227,Ewallet,Home and lifestyle,10,4.4,48.6900,17:16,1022.4900,97.38,"(6144311c7a60c3187fbc1d6a,)",973.80,48.6900,4.761905
997,A,Yangon,Member,2/9/2019,Male,727-02-1313,Cash,Food and beverages,1,7.7,1.5920,13:22,33.4320,31.84,"(6144311c7a60c3187fbc1d6b,)",31.84,1.5920,4.761905
998,A,Yangon,Normal,2/22/2019,Male,347-56-2442,Cash,Home and lifestyle,1,4.1,3.2910,15:33,69.1110,65.82,"(6144311c7a60c3187fbc1d6c,)",65.82,3.2910,4.761905


### Load the dataset using an aggregation pipeline


In [22]:
import json

# Aggregation stages to apply when reading the collection.
# Stage 1: keep only documents where Gender is "Male" and Payment is "Ewallet".
pipeline1 = {"$match": {"Gender": "Male", "Payment": "Ewallet"}}
# Stage 2: keep Branch, City and Gender; exclude _id.
pipeline2 = {"$project": {"Branch": 1, "City": 1, "_id": 0, "Gender": 1}}

# Serialise the stages as real JSON instead of relying on the Python repr of a
# list, which is not valid JSON once a stage contains True/False/None.
df1 = (
    spark.read.format("mongo")
    .option("spark.mongodb.input.uri", "mongodb://127.0.0.1/spark.supermarket")
    .option("pipeline", json.dumps([pipeline1, pipeline2]))
    .load()
)

In [23]:
# Convert the result of the aggregation read to pandas for display.
df1.toPandas()

Unnamed: 0,Branch,City,Gender
0,A,Yangon,Male
1,A,Yangon,Male
2,C,Naypyitaw,Male
3,A,Yangon,Male
4,C,Naypyitaw,Male
...,...,...,...
180,B,Mandalay,Male
181,C,Naypyitaw,Male
182,A,Yangon,Male
183,B,Mandalay,Male
