In [1]:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder \
    .appName("FinalProject") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

spark

Con la siguiente clase se busca unificar los archivos respectivos a un estado del dataset de Google. En el caso de la metadata, se encontró con un problema de compatibilidad de las columnas de los archivos, por lo que, como solución temporal, se agregó un script específico (comentado) para ese caso; buscando filtrar sólo las columnas que aporten información.

In [7]:
class UnifyFiles:
    def __init__(self, estado:str, directorio:str, num_archivos:int) -> None:
        self.path = directorio
        self.n = num_archivos
        self.files = []
        self.state = estado
        self.df = None

    def ReadFiles(self):
        for i in range(1, self.n + 1):
            fl = spark.read.json(self.path + f"/{i}.json")

            fl = fl.select('gmap_id','avg_rating','category')         ## Este código se usa sólo para la metadata

            self.files.append(fl)
        return "Done!"
    
    def PrintFileN(self, Archivo:int):
        fl = self.files[Archivo - 1]
        fl.show()
        return "Done!"
    
    def Unify(self):
        unified = self.files[0].union(self.files[1])

        for i in self.files[3:]:
            unified = unified.union(i)
        
        unified.dropDuplicates()

        self.df = unified

        return "Done!"
    
    def DropColumns(self):
        self.df = self.df.drop('pics').drop('resp')  # Se eliminan las columnas ya que no aportan información.
        return "Done!"
    
    def PrintUnified(self):
        self.df.show()
        return "Done!"
    
    def CountUnified(self):
        return self.df.count()
    
    def WriteJson(self):
        self.df.toPandas().to_json(f"{self.state}.json")

        return "Done!"

### **`Massachussets`**

In [3]:
ma = UnifyFiles("masachussets", r"Datasets\review-Massachusetts", 16)

ma.ReadFiles()

ma.Unify()

ma.DropColumns()

ma.WriteJson()

'Done!'

In [4]:
ma.PrintFileN(16)

+--------------------+---------------+----+------+----+--------------------+-------------+--------------------+
|             gmap_id|           name|pics|rating|resp|                text|         time|             user_id|
+--------------------+---------------+----+------+----+--------------------+-------------+--------------------+
|0x89e40887d6d5823...|Carmen De Jesus|NULL|     5|NULL|           Good wine|1532303239396|11188308459885990...|
|0x89e40887d6d5823...|kathie Chartier|NULL|     5|NULL|   Awesome selection|1534592447635|10334923867856989...|
|0x89e40887d6d5823...|  Francis Smith|NULL|     5|NULL|Liquor selection ...|1575371884502|10938987617230154...|
|0x89e40887d6d5823...|    Kerrie Topi|NULL|     4|NULL|Great wine selection|1526776318490|10763045646989684...|
|0x89e40887d6d5823...|  Jason Schmidt|NULL|     5|NULL|             Not bad|1581980567039|10254819721669894...|
|0x89e40887d6d5823...|   Ralph Taylor|NULL|     4|NULL|Always has what y...|1569525575470|10782868373440

'Done!'

In [5]:
ma.CountUnified()

2250000

### **`Metadata`**

In [8]:
metadata = UnifyFiles('metadata',r'Datasets\metadata-sitios', 11)

metadata.ReadFiles()

metadata.Unify()

metadata.WriteJson()

'Done!'

In [10]:
metadata.PrintFileN(4)

+--------------------+----------+--------------------+
|             gmap_id|avg_rating|            category|
+--------------------+----------+--------------------+
|0x88f16e41928ff68...|       4.9|          [Pharmacy]|
|0x8834f5bb828394c...|       3.9|[Gymnastics cente...|
|0x8850fa407ebe7c7...|       4.5|[Electrician, Aud...|
|0x8834f503bec584f...|       4.3|[Florist, Gift ba...|
|0x88508c6d4a58520...|       4.6|        [Auditorium]|
|0x8834f1669a8a4f8...|       5.0|[Dance company, C...|
|0x89003f45d6bb196...|       4.4|[Home builder, Re...|
|0x88345fbbfbe695a...|       3.0|[Industrial equip...|
|0x89003126fbe8bc9...|       4.8|[Civil engineerin...|
|0x88d9c4182c6fb0d...|       4.9|[Auto insurance a...|
|0x89003f5a27eb3ec...|       4.2|[Real estate agency]|
|0x880e32b4dd53737...|       4.9|[HVAC contractor,...|
|0x88d9b7a4d735609...|       4.0|  [Auto repair shop]|
|0x88d9c4182f60f42...|       5.0|[High school, Pri...|
|0x88c2bd9f29a0e93...|       4.2|[Home builder, Re...|
|0x865c5e1

'Done!'