Para este caso retomamos los datos guardados en la tabla "apartamentRental"

# Consultamos los datos

Importamos librerias y configuramos la conexión con postgres

In [1]:
import findspark
findspark.init('/opt/spark')

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf 
from pyspark.sql.types import DateType
from pyspark.sql import Window
import pyspark.sql.functions as F
import numpy as np

#path del archivo postgresql-42.1.4.jar
dir = "/Users/jasonsolano/Documents/BigData/BigDataTEC/Clase1/DB/"
spark = SparkSession \
    .builder \
    .appName("Basic JDBC pipeline") \
    .config("spark.driver.extraClassPath", dir+"postgresql-42.1.4.jar") \
    .config("spark.executor.extraClassPath", dir+"postgresql-42.1.4.jar") \
    .getOrCreate()

Consultamos los datos

In [2]:
df = spark \
    .read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost/ImmoDataBase") \
    .option("user", "postgres") \
    .option("password", "102800") \
    .option("dbtable", "apartmentrental") \
    .load()

df.limit(5).toPandas().head()

Unnamed: 0,regio1,serviceCharge,heatingType,telekomTvOffer,telekomHybridUploadSpeed,newlyConst,balcony,electricityBasePrice,picturecount,pricetrend,...,numberOfFloors,noRoomsRange,garden,livingSpaceRange,regio2,regio3,heatingCosts,energyEfficiencyClass,lastRefurbish,date
0,Baden_Württemberg,350.0,floor_heating,ONE_YEAR_FREE,,True,True,,12,5.07,...,,3,False,6,Reutlingen_Kreis,Pfullingen,,A,,May19
1,Nordrhein_Westfalen,100.0,,ONE_YEAR_FREE,10.0,False,False,,29,3.52,...,,2,False,2,Essen,Karnap,,,,May19
2,Sachsen,90.0,,ONE_YEAR_FREE,10.0,False,False,,11,0.59,...,,2,False,2,Zwickau_Kreis,Werdau,,,,May19
3,Nordrhein_Westfalen,84.0,central_heating,ONE_YEAR_FREE,10.0,False,True,90.76,4,4.92,...,2.0,3,False,3,Steinfurt_Kreis,Emsdetten,,,,Sep18
4,Nordrhein_Westfalen,195.0,central_heating,ONE_YEAR_FREE,,False,True,,3,3.58,...,,2,False,3,Bielefeld,Schildesche,,,,Oct19


Observamos datos importantes

In [3]:
df.describe().toPandas().head()

Unnamed: 0,summary,regio1,serviceCharge,heatingType,telekomTvOffer,telekomHybridUploadSpeed,electricityBasePrice,picturecount,pricetrend,telekomUploadSpeed,...,floor,numberOfFloors,noRoomsRange,livingSpaceRange,regio2,regio3,heatingCosts,energyEfficiencyClass,lastRefurbish,date
0,count,198379,193267.0,165769,174578,36865.0,47185.0,198379.0,197158.0,174011.0,...,160750.0,126549.0,198361.0,198361.0,198361,198361,63195.0,55008,59275.0,198361
1,mean,,151.43279711487244,,,10.0,89.10167977102157,9.718145569843582,3.435082573367232,27.785562981649317,...,2.1272970451010886,3.571162158531478,2.580330810996113,3.0788713507191434,,,78.0102742305564,,2013.593757908056,
2,stddev,,352.8943402904954,,,0.0,5.413494724155707,6.348194348133843,2.055461804742117,16.655154583281252,...,4.014081150928973,6.349898372180542,0.9341394698673624,1.4058025434166923,,,159.5179245725537,,10.11258258432115,
3,min,Baden_Württemberg,0.0,central_heating,NONE,10.0,71.43,0.0,-9.17,1.0,...,-1.0,0.0,1.0,1.0,Aachen,Aach,0.0,A,1015.0,May19
4,max,Thüringen,146118.0,wood_pellet_heating,ON_DEMAND,10.0,90.76,109.0,14.92,100.0,...,999.0,999.0,5.0,7.0,Zwickau_Kreis,Ürzig,12613.0,NO_INFORMATION,2919.0,Sep18


In [4]:
df.printSchema()

root
 |-- regio1: string (nullable = true)
 |-- serviceCharge: double (nullable = true)
 |-- heatingType: string (nullable = true)
 |-- telekomTvOffer: string (nullable = true)
 |-- telekomHybridUploadSpeed: double (nullable = true)
 |-- newlyConst: boolean (nullable = true)
 |-- balcony: boolean (nullable = true)
 |-- electricityBasePrice: double (nullable = true)
 |-- picturecount: integer (nullable = true)
 |-- pricetrend: double (nullable = true)
 |-- telekomUploadSpeed: double (nullable = true)
 |-- totalRent: double (nullable = true)
 |-- yearConstructed: double (nullable = true)
 |-- electricityKwhPrice: double (nullable = true)
 |-- scoutId: integer (nullable = true)
 |-- noParkSpaces: double (nullable = true)
 |-- firingTypes: string (nullable = true)
 |-- hasKitchen: boolean (nullable = true)
 |-- geo_bln: string (nullable = true)
 |-- cellar: boolean (nullable = true)
 |-- yearConstructedRange: double (nullable = true)
 |-- baseRent: double (nullable = true)
 |-- houseNumber

# Iniciamos con la Limpieza de datos

In [5]:
datasetSize = df.count()
print(datasetSize)

198397


funcion que permite detectar el porcentaje nulo de cada columna

In [6]:
def detectNullPercentajeByColumn(df):
    columns = df.schema.names
    datasetSize = df.count()
    values = list()
    for i in columns:
        nullAmount = df.where(col(i).isNull()).count()
        nullPercentage = nullAmount/datasetSize
        values.append((i,nullPercentage,nullAmount))
    return values

In [7]:
detectNullPercentajeByColumn(df)

[('regio1', 9.072717833434981e-05, 18),
 ('serviceCharge', 0.0258572458252897, 5130),
 ('heatingType', 0.16445813192739808, 32628),
 ('telekomTvOffer', 0.12005725893032657, 23819),
 ('telekomHybridUploadSpeed', 0.8141856983724552, 161532),
 ('newlyConst', 9.072717833434981e-05, 18),
 ('balcony', 9.072717833434981e-05, 18),
 ('electricityBasePrice', 0.7621687827940947, 151212),
 ('picturecount', 9.072717833434981e-05, 18),
 ('pricetrend', 0.006245054108681079, 1239),
 ('telekomUploadSpeed', 0.12291516504785858, 24386),
 ('totalRent', 0.15014339934575624, 29788),
 ('yearConstructed', 0.2133096770616491, 42320),
 ('electricityKwhPrice', 0.7621687827940947, 151212),
 ('scoutId', 9.072717833434981e-05, 18),
 ('noParkSpaces', 0.6575603461745894, 130458),
 ('firingTypes', 0.21103645720449402, 41869),
 ('hasKitchen', 9.072717833434981e-05, 18),
 ('geo_bln', 9.072717833434981e-05, 18),
 ('cellar', 9.072717833434981e-05, 18),
 ('yearConstructedRange', 0.2133096770616491, 42320),
 ('baseRent', 9.

### Conclusion de datos nulos

Como se puede observar existen columnas con muchos datos nulos por lo cual seran eliminadas del dataset:

'telekomHybridUploadSpeed': 0.8141856983724552

'electricityBasePrice': 0.7621687827940947

'noParkSpaces': 0.6575603461745894

'interiorQual': 0.41864544322746816

'petsAllowed': 0.42766271667414324

'thermalChar': 0.3897639581243668

'numberOfFloors': 0.36214257272035366

'heatingCosts': 0.6814719980644869

'energyEfficiencyClass': 0.722737743010227

'lastRefurbish': 0.7012303613461897

'electricityKwhPrice', 0.7641407042328702


### Se eliminan las columnas presentadas

In [8]:
df = df.drop('telekomHybridUploadSpeed','electricityKwhPrice','electricityBasePrice','noParkSpaces','interiorQual','petsAllowed',\
            'thermalChar','numberOfFloors','heatingCosts','energyEfficiencyClass','lastRefurbish')


### Eliminación del filas en base a totalRent

TotalRent es el atributo que usaremos para aplicar la regression, debido a que presenta valores nulos se eliminaran todas las filas que presenten esos valores

El porcentaje de valores nulos es 'totalRent', 0.15014339934575624

In [9]:
df = df.na.drop(subset=["totalRent"])

Realizamos el conteo de todos los registros, para verificar la disminucion de datos los cuales eran 198397

In [10]:
datasetSize = df.count()
print(df.count())

168609


Volvemos a imprimir el procentaje para los valores nulos

In [11]:
detectNullPercentajeByColumn(df)

[('regio1', 0.0, 0),
 ('serviceCharge', 0.01779264452075512, 3000),
 ('heatingType', 0.15905438025253693, 26818),
 ('telekomTvOffer', 0.11810757432877249, 19914),
 ('newlyConst', 0.0, 0),
 ('balcony', 0.0, 0),
 ('picturecount', 0.0, 0),
 ('pricetrend', 0.006411282908978761, 1081),
 ('telekomUploadSpeed', 0.12100777538565557, 20403),
 ('totalRent', 0.0, 0),
 ('yearConstructed', 0.21638228089840994, 36484),
 ('scoutId', 0.0, 0),
 ('firingTypes', 0.21119275957985637, 35609),
 ('hasKitchen', 0.0, 0),
 ('geo_bln', 0.0, 0),
 ('cellar', 0.0, 0),
 ('yearConstructedRange', 0.21638228089840994, 36484),
 ('baseRent', 0.0, 0),
 ('houseNumber', 0.23566357667740157, 39735),
 ('livingSpace', 0.00010675586712453072, 18),
 ('geo_krs', 0.00010675586712453072, 18),
 ('condition', 0.24770326613644586, 41765),
 ('streetPlain', 0.23574660901849842, 39749),
 ('lift', 0.00010675586712453072, 18),
 ('baseRentRange', 0.00010675586712453072, 18),
 ('typeOfFlat', 0.13713977308447353, 23123),
 ('geo_plz', 0.000106

# Analisis de cada variable

Todavía existen variables que debemos evaluar y curar sus valores nulos, sin embargo devemos de analizar su conposición

Variable: regio1

In [12]:
df.groupBy('regio1').count().show()

+--------------------+-----+
|              regio1|count|
+--------------------+-----+
|       Niedersachsen| 8977|
|   Baden_Württemberg| 9233|
|  Schleswig_Holstein| 4332|
| Nordrhein_Westfalen|37473|
|         Brandenburg| 4896|
|              Berlin| 7261|
|              Bayern|13100|
|             Sachsen|39794|
|             Hamburg| 2445|
|              Bremen| 1812|
|     Rheinland_Pfalz| 4690|
|           Thüringen| 5625|
|              Hessen|10381|
|      Sachsen_Anhalt|13241|
|            Saarland|  751|
|Mecklenburg_Vorpo...| 4598|
+--------------------+-----+



## Variable: serviceCharge

('serviceCharge', 0.01779264452075512)

Observamos que es un valor numerico, por lo cual 

In [13]:
df.select('serviceCharge').show()

+-------------+
|serviceCharge|
+-------------+
|        100.0|
|         84.0|
|        115.0|
|         33.0|
|       169.75|
|        138.0|
|        68.63|
|         85.5|
|        118.0|
|        225.0|
|         90.0|
|        200.0|
|         80.0|
|        150.0|
|         78.8|
|        210.0|
|         77.0|
|        140.0|
|        219.1|
|        120.0|
+-------------+
only showing top 20 rows



In [14]:
df.select('serviceCharge').describe().show()

+-------+-----------------+
|summary|    serviceCharge|
+-------+-----------------+
|  count|           165609|
|   mean|151.5554388952293|
| stddev|373.7195537729053|
|    min|              0.0|
|    max|         146118.0|
+-------+-----------------+



In [15]:
df.groupBy('serviceCharge').count().show()

+-------------+-----+
|serviceCharge|count|
+-------------+-----+
|        305.0|   79|
|        46.15|    1|
|         69.8|    3|
|        86.44|    2|
|       300.96|    1|
|        60.17|    1|
|        299.0|   26|
|       134.97|    1|
|       125.64|    3|
|        79.89|    2|
|        73.73|    1|
|        168.5|    3|
|         74.5|    7|
|        98.09|    2|
|        128.8|    7|
|        330.4|    1|
|       179.26|    1|
|       189.83|    1|
|       146.41|    3|
|       295.39|    1|
+-------------+-----+
only showing top 20 rows



Observando los datos vemos que la moda "305" pero se repite muy pocas veces, por lo cual usaremos la media para llenar los valores nulos la cual es 151.55

## Variable: heatingType

('heatingType', 0.15905438025253693, 26818),    
    
Observamos los valores de esta columna, como se puede observar son datos categoricos, por lo cual es dificil sustituirlos por otro valor

In [16]:
df.select('heatingType').show()

+--------------------+
|         heatingType|
+--------------------+
|                null|
|     central_heating|
|night_storage_heater|
|                null|
|         gas_heating|
|     central_heating|
|                null|
|         gas_heating|
|     central_heating|
|     central_heating|
|                null|
|           heat_pump|
|     central_heating|
|     central_heating|
|         gas_heating|
|     central_heating|
|     central_heating|
|                null|
|     central_heating|
|    district_heating|
+--------------------+
only showing top 20 rows



Si observamos bien, se puede ver que los valores nulos representan la segunda cantidad mas grande de datos en la columna por lo que se decide borrar la columna

In [17]:
df.groupBy('heatingType').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,heatingType,count,perc_of_count_total
0,central_heating,82141,0.487169
1,,26818,0.159054
2,district_heating,15721,0.093239
3,gas_heating,13357,0.079219
4,floor_heating,11074,0.065679
5,self_contained_central_heating,10943,0.064902
6,oil_heating,3351,0.019874
7,heat_pump,1530,0.009074
8,combined_heat_and_power_plant,1382,0.008196
9,night_storage_heater,837,0.004964


Como podemos ver, existe un gran porcentaje para valores nulos, lo que realizaremos es realizar un función que permite crear una asignación de forman random, siguiendo la probabilidades mostradas en la anterior tabla

In [18]:
# np.random.choice(numpy.arange(1, 7), p=[0.1, 0.05, 0.05, 0.2, 0.4, 0.2])

def generateHeatingType():
    p =[0.5,
        0.15,
        0.10,
        0.09,
        0.07,
        0.02,
        0.01,
        0.01,
        0.01,
        0.01,
        0.01,
        0.01,
        0.01]
    categories =["central_heating",
                "district_heating",
                "gas_heating",
                "floor_heating",
                "self_contained_central_heating",
                "oil_heating",
                "heat_pump",
                "combined_heat_and_power_plant",
                "night_storage_heater",
                "wood_pellet_heating",
                "electric_heating",
                "stove_heating",
                "solar_heating"]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generateHeatingType())

    

central_heating


### Creamos una nueva columna con la correción de datos

In [19]:
from pyspark.sql.types import StringType
generate_heating_type = udf(generateHeatingType,'string')

df = df.withColumn('heatingTypeN', F.when(F.isnull('heatingType'),generate_heating_type()).otherwise(df['heatingType']))

In [20]:
df.groupBy('heatingTypeN').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,heatingTypeN,count,perc_of_count_total
0,central_heating,95449,0.566097
1,district_heating,19695,0.116809
2,gas_heating,16117,0.095588
3,floor_heating,13473,0.079907
4,self_contained_central_heating,12829,0.076087
5,oil_heating,3931,0.023314
6,heat_pump,1807,0.010717
7,combined_heat_and_power_plant,1641,0.009733
8,night_storage_heater,1062,0.006299
9,wood_pellet_heating,875,0.00519


## Variable: telekomTvOffer

('telekomTvOffer', 0.11810757432877249, 19914)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

Observamos el tipo de datos para la columna

In [21]:
df.groupBy('telekomTvOffer').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,telekomTvOffer,count,perc_of_count_total
0,ONE_YEAR_FREE,143224,0.849445
1,,19914,0.118108
2,NONE,3162,0.018753
3,ON_DEMAND,2309,0.013694


Como podemos ver, existe un gran porcentaje para valores nulos, lo que realizaremos es realizar un función que permite crear una asignación de forman random, siguiendo la probabilidades mostradas en la anterior tabla

In [22]:
def generateTelekomTvOffer():
    p =[0.95,
        0.025,
        0.025
       ]
    categories =["ONE_YEAR_FREE",
                "NONE",
                "ON_DEMAND",
                ]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generateTelekomTvOffer())



ONE_YEAR_FREE


In [23]:
generate_telekom_tv_offer = udf(generateTelekomTvOffer,'string')

df = df.withColumn('telekomTvOfferN', F.when(F.isnull('telekomTvOffer'),generate_telekom_tv_offer()).otherwise(df['telekomTvOffer']))

In [24]:
df.groupBy('telekomTvOfferN').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,telekomTvOfferN,count,perc_of_count_total
0,ONE_YEAR_FREE,162203,0.962007
1,NONE,3630,0.021529
2,ON_DEMAND,2776,0.016464


## Variable: newlyConst
 ('newlyConst', 0.0, 0)
 
Observamos los valores de esta columna, como se puede observar son datos categoricos, por lo cual es dificil sustituirlos por otro valor

In [25]:
df.groupBy('newlyConst').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,newlyConst,count,perc_of_count_total
0,False,155519,0.922365
1,True,13090,0.077635


## Variable: balcony
 ('balcony', 0.0, 0)
 
Observamos los valores de esta columna, como se puede observar son datos categoricos, por lo cual es dificil sustituirlos por otro valor

In [26]:
df.groupBy('balcony').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,balcony,count,perc_of_count_total
0,True,103118,0.611581
1,False,65491,0.388419


## Variable: picturecount
 ('picturecount', 0.0, 0)
 
Observamos los valores de esta columna, como se puede observar son datos categoricos, por lo cual es dificil sustituirlos por otro valor

In [27]:
df.groupBy('picturecount').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,picturecount,count,perc_of_count_total
0,8,13839,0.082077
1,7,13561,0.080429
2,9,13408,0.079521
3,10,12501,0.074142
4,6,11569,0.068614
5,11,10397,0.061663
6,5,9537,0.056563
7,12,9084,0.053876
8,4,7560,0.044837
9,13,7501,0.044488


## Variable: pricetrend
('pricetrend', 0.006411282908978761, 1081)
 
Observamos los valores de esta columna, como se puede observar son datos categoricos, por lo cual es dificil sustituirlos por otro valor

In [28]:
df.select('pricetrend').show()

+----------+
|pricetrend|
+----------+
|      3.52|
|      4.92|
|      4.69|
|      0.93|
|     -1.89|
|      1.92|
|      1.32|
|      5.51|
|      3.49|
|      6.93|
|      5.08|
|      3.78|
|      2.81|
|      1.95|
|      1.79|
|      5.06|
|      1.89|
|      0.19|
|      1.61|
|      1.11|
+----------+
only showing top 20 rows



In [29]:
df.groupBy('pricetrend').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,pricetrend,count,perc_of_count_total
0,0.0,9482,0.056237
1,3.33,1595,0.00946
2,3.23,1542,0.009145
3,3.57,1239,0.007348
4,1.92,1235,0.007325
5,3.85,1183,0.007016
6,3.17,1182,0.00701
7,0.19,1152,0.006832
8,3.7,1144,0.006785
9,1.75,1125,0.006672


In [30]:
df.select('pricetrend').describe().show()

+-------+------------------+
|summary|        pricetrend|
+-------+------------------+
|  count|            167528|
|   mean| 3.437725096700156|
| stddev|2.0961914504814594|
|    min|             -9.17|
|    max|             14.92|
+-------+------------------+



Observando los datos vemos que la moda "0.00" pero se repite muy pocas veces, por lo cual usaremos la media para llenar los valores nulos la cual es 3.44

In [31]:
df = df.fillna({'pricetrend':3.44})

## Variable: telekomUploadSpeed
('telekomUploadSpeed', 0.12100777538565557, 20403)

observamos los valores de esta columna, como se puede observar son datos de tipo entero, por lo cual es dificil sustituirlos por otro valor

In [32]:
df.select('telekomUploadSpeed').show()

+------------------+
|telekomUploadSpeed|
+------------------+
|               2.4|
|               2.4|
|              40.0|
|              40.0|
|               2.4|
|              40.0|
|              40.0|
|              40.0|
|              null|
|              40.0|
|               2.4|
|              40.0|
|               2.4|
|              40.0|
|              40.0|
|              null|
|              null|
|              40.0|
|              null|
|              40.0|
+------------------+
only showing top 20 rows



In [33]:
df.groupBy('telekomUploadSpeed').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,telekomUploadSpeed,count,perc_of_count_total
0,40.0,95583,0.566891
1,2.4,28277,0.167708
2,10.0,23225,0.137745
3,,20403,0.121008
4,5.0,822,0.004875
5,1.0,159,0.000943
6,100.0,111,0.000658
7,4.0,29,0.000172


Pare este caso, usaremos el algoritmo que se ha venido utilizando para los datos categorios para asignar de forma random pero con una distribución de probabilidad 

In [34]:
def generateTelekomUploadSpeed():
    p =[0.6,
        0.2,
        0.2
       ]
    categories =[40,
                 2.4,
                 10
                ]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generateTelekomUploadSpeed())

40


In [35]:
from pyspark.sql.types import StringType
generate_Telekom_up_load_speed = udf(generateTelekomUploadSpeed,'int')
df = df.withColumn('telekomUploadSpeedN', F.when(F.isnull('telekomUploadSpeed'),generate_Telekom_up_load_speed()).otherwise(df['telekomUploadSpeed']))
# df = df.withColumn('telekomUploadSpeedN', F.when(F.isnan('telekomUploadSpeedN'),generate_Telekom_up_load_speed()).otherwise(df['telekomUploadSpeedN']))

In [36]:
df.groupBy('telekomUploadSpeedN').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,telekomUploadSpeedN,count,perc_of_count_total
0,40.0,107795,0.639319
1,2.4,28277,0.167708
2,10.0,27316,0.162008
3,,4100,0.024317
4,5.0,822,0.004875
5,1.0,159,0.000943
6,100.0,111,0.000658
7,4.0,29,0.000172


Por un error de la libreria o valores especiales, por lo que usamos la moda y el siguiente comando 

In [37]:
df = df.fillna({'telekomUploadSpeedN':40})
df.groupBy('telekomUploadSpeedN').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,telekomUploadSpeedN,count,perc_of_count_total
0,40.0,111917,0.663766
1,2.4,28277,0.167708
2,10.0,27294,0.161877
3,5.0,822,0.004875
4,1.0,159,0.000943
5,100.0,111,0.000658
6,4.0,29,0.000172


## Variable: totalRent

('totalRent', 0.0, 0)

Observamos que es un valor numerico, por lo cual 

In [38]:
df.select('totalRent').show()

+---------+
|totalRent|
+---------+
|    429.0|
|    578.0|
|    365.0|
|    358.0|
|   449.49|
|    493.0|
|   434.82|
|    332.5|
|   503.95|
|   1025.0|
|    474.0|
|   1060.0|
|    530.0|
|    590.0|
|    425.0|
|   1135.0|
|    400.0|
|    485.0|
|  1287.21|
|    420.0|
+---------+
only showing top 20 rows



## Variable: yearConstructed

 ('yearConstructed', 0.21638228089840994, 36484),

Observamos que es un valor numerico, por lo cual 

In [39]:
df.select('yearConstructed').show()

+---------------+
|yearConstructed|
+---------------+
|         1918.0|
|         1997.0|
|           null|
|         1965.0|
|         1930.0|
|         1963.0|
|         1985.0|
|         1900.0|
|         1953.0|
|         1996.0|
|         1994.0|
|         2011.0|
|         1965.0|
|         1977.0|
|         1969.0|
|         1993.0|
|         1976.0|
|           null|
|         2016.0|
|         1898.0|
+---------------+
only showing top 20 rows



In [40]:
df.groupBy('yearConstructed').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(30)

Unnamed: 0,yearConstructed,count,perc_of_count_total
0,,36484,0.216382
1,1900.0,6888,0.040852
2,2019.0,6091,0.036125
3,2018.0,5954,0.035312
4,1910.0,2900,0.0172
5,1996.0,2861,0.016968
6,1995.0,2822,0.016737
7,2017.0,2514,0.01491
8,1960.0,2311,0.013706
9,1972.0,2244,0.013309


In [41]:
df.groupBy('yearConstructed').count().count()

397

Debido a la gran cantidad diversa de diferentes años, y ademas al ser el procentaje nulo tan alto, se decide borrar dicha columna

In [42]:
df = df.drop('yearConstructed')

## Variable: scoutId

('scoutId', 0.0, 0)

Observamos que es un valor numerico, por lo cual 

In [43]:
df.select('scoutId').show()

+---------+
|  scoutId|
+---------+
|111154933|
| 61181725|
|106786484|
|110965684|
|110949429|
|112052968|
|112903533|
|111292104|
|111257072|
|106618713|
|106718415|
|113746891|
| 48884608|
|113643275|
| 99068828|
| 90647536|
| 46817402|
|110126567|
|106735471|
|113862914|
+---------+
only showing top 20 rows



In [44]:
df.groupBy('scoutId').count().count()

168609

Como se puede observar, es un id unico el cual no nos sirve para algoritmos de predicción

In [45]:
df = df.drop('scoutId')

## Variable: firingTypes

('firingTypes', 0.21119275957985637, 35609)

Observamos que es un valor numerico, por lo cual 

In [46]:
df.select('firingTypes').show()

+-----------------+
|      firingTypes|
+-----------------+
|      electricity|
|              gas|
|      electricity|
|              gas|
|             null|
|              gas|
| district_heating|
|              gas|
|              gas|
|             null|
|              gas|
|      heat_supply|
| district_heating|
|              gas|
|              gas|
|              gas|
|natural_gas_light|
|             null|
| district_heating|
| district_heating|
+-----------------+
only showing top 20 rows



In [47]:
df.groupBy('firingTypes').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,firingTypes,count,perc_of_count_total
0,gas,69520,0.412315
1,,35609,0.211193
2,district_heating,31680,0.18789
3,oil,11095,0.065803
4,natural_gas_light,6568,0.038954
5,electricity,2784,0.016512
6,natural_gas_heavy,2501,0.014833
7,pellet_heating,1529,0.009068
8,geothermal,1489,0.008831
9,gas:electricity,920,0.005456


In [48]:
def generatefiringTypes():
    p =[0.5,
        0.2,
        0.10,
        0.08,
        0.04,
        0.02,
        0.02,
        0.02,
        0.02]
    categories =["Gas",
                "district_heating",
                "oil",
                "natural_gas_light",
                "electricity",
                "natural_gas_heavy",
                "pellet_heating",
                "geothermal",
                "gas:electricity"
                ]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generatefiringTypes())

district_heating


In [49]:
from pyspark.sql.types import StringType
generate_heating_type = udf(generatefiringTypes,'string')
df = df.withColumn('firingTypes', F.when(F.isnull('firingTypes'),generate_heating_type()).otherwise(df['firingTypes']))

In [50]:
df.groupBy('firingTypes').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,firingTypes,count,perc_of_count_total
0,gas,69520,0.412315
1,district_heating,38813,0.230195
2,Gas,17826,0.105724
3,oil,14672,0.087018
4,natural_gas_light,9410,0.05581
5,electricity,4141,0.02456
6,natural_gas_heavy,3195,0.018949
7,geothermal,2229,0.01322
8,pellet_heating,2213,0.013125
9,gas:electricity,1676,0.00994


## Variable: hasKitchen

('hasKitchen', 0, 0)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [51]:
df.groupBy('hasKitchen').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,hasKitchen,count,perc_of_count_total
0,False,111061,0.65869
1,True,57548,0.34131


## Variable: geo_bln

('geo_bln', 0, 0)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [52]:
df.groupBy('geo_bln').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,geo_bln,count,perc_of_count_total
0,Sachsen,39794,0.236013
1,Nordrhein_Westfalen,37473,0.222248
2,Sachsen_Anhalt,13241,0.078531
3,Bayern,13100,0.077695
4,Hessen,10381,0.061568
5,Baden_Württemberg,9233,0.05476
6,Niedersachsen,8977,0.053242
7,Berlin,7261,0.043064
8,Thüringen,5625,0.033361
9,Brandenburg,4896,0.029038


## Variable: cellar

('cellar', 0, 0)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [53]:
df.groupBy('cellar').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,cellar,count,perc_of_count_total
0,True,109224,0.647795
1,False,59385,0.352205


## Variable: yearConstructedRange

('yearConstructedRange', 0.21638228089840994, 36484)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [54]:
df.groupBy('yearConstructedRange').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,yearConstructedRange,count,perc_of_count_total
0,,36484,0.216382
1,1.0,35585,0.21105
2,2.0,27660,0.164048
3,5.0,16885,0.100143
4,9.0,16670,0.098868
5,3.0,14600,0.086591
6,4.0,11390,0.067553
7,8.0,4842,0.028717
8,6.0,2252,0.013356
9,7.0,2241,0.013291


In [56]:
def generateYearConstructedRange():
    p =[0.27,
        0.18,
        0.10,
        0.08,
        0.12,
        0.08,
        0.03,
        0.03,
        0.11]
    categories =[1,2,3,4,5,6,7,8,9]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generateYearConstructedRange())

1


In [57]:
generate_year_constructed_range = udf(generateYearConstructedRange,'int')
df = df.withColumn('yearConstructedRange', F.when(F.isnull('yearConstructedRange'),generate_year_constructed_range()).otherwise(df['yearConstructedRange']))

In [58]:
df.groupBy('yearConstructedRange').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,yearConstructedRange,count,perc_of_count_total
0,1.0,45442,0.269511
1,2.0,34196,0.202812
2,5.0,21426,0.127075
3,9.0,20681,0.122657
4,3.0,18195,0.107912
5,4.0,14353,0.085126
6,8.0,5956,0.035324
7,6.0,5095,0.030218
8,7.0,3265,0.019364


## Variable: baseRent

 ('baseRent', 0.0, 0)


In [59]:
df.groupBy('baseRent').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,baseRent,count,perc_of_count_total
0,350.0,2396,0.01421
1,450.0,2160,0.012811
2,300.0,2093,0.012413
3,400.0,1915,0.011358
4,650.0,1757,0.010421
5,550.0,1641,0.009733
6,320.0,1620,0.009608
7,750.0,1598,0.009478
8,500.0,1577,0.009353
9,250.0,1526,0.009051


## Variable: houseNumber

 ('houseNumber', 0.23566357667740157, 39735)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [60]:
df.groupBy('houseNumber').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,houseNumber,count,perc_of_count_total
0,,39735,0.235664
1,1.0,4656,0.027614
2,2.0,4535,0.026897
3,4.0,3834,0.022739
4,3.0,3725,0.022093
5,5.0,3586,0.021268
6,6.0,3300,0.019572
7,8.0,3151,0.018688
8,7.0,3135,0.018593
9,10.0,2892,0.017152


In [61]:
df.select('houseNumber').describe().show()

+-------+------------------+
|summary|       houseNumber|
+-------+------------------+
|  count|            128874|
|   mean| 51.55301151469267|
| stddev|3046.6609783010163|
|    min|                !=|
|    max|                yy|
+-------+------------------+



Al observar la media y moda, incluso los datos min y max, aparte de la gran cantidad de valores nulos, se decide eliminar la columna 

In [62]:
df = df.drop('houseNumber')

## Variable: livingSpace

('livingSpace', 0.00010675586712453072, 18)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [64]:
df.groupBy('livingSpace').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,livingSpace,count,perc_of_count_total
0,60.0,3257,0.019317
1,70.0,2562,0.015195
2,80.0,2422,0.014365
3,65.0,2315,0.01373
4,75.0,2241,0.013291
5,50.0,2122,0.012585
6,55.0,1866,0.011067
7,90.0,1859,0.011026
8,100.0,1767,0.01048
9,85.0,1607,0.009531


Llenamos los 18 campos nulos con la mediana 

In [65]:
df = df.fillna({'livingSpace':60})

In [66]:
df.groupBy('livingSpace').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,livingSpace,count,perc_of_count_total
0,60.0,3275,0.019424
1,70.0,2562,0.015195
2,80.0,2422,0.014365
3,65.0,2315,0.01373
4,75.0,2241,0.013291
5,50.0,2122,0.012585
6,55.0,1866,0.011067
7,90.0,1859,0.011026
8,100.0,1767,0.01048
9,85.0,1607,0.009531


## Variable: geo_krs

('geo_krs', 0.00010675586712453072, 18)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [68]:
df.groupBy('geo_krs').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,geo_krs,count,perc_of_count_total
0,Leipzig,9278,0.055027
1,Chemnitz,8845,0.052459
2,Berlin,7243,0.042957
3,Dresden,4743,0.02813
4,Magdeburg,3260,0.019335
5,Halle_Saale,2937,0.017419
6,München,2785,0.016518
7,Frankfurt_am_Main,2704,0.016037
8,Essen,2652,0.015729
9,Hamburg,2445,0.014501


In [69]:
df = df.fillna({'geo_krs':'Leipzig'})

In [70]:
df.groupBy('geo_krs').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,geo_krs,count,perc_of_count_total
0,Leipzig,9296,0.055133
1,Chemnitz,8845,0.052459
2,Berlin,7243,0.042957
3,Dresden,4743,0.02813
4,Magdeburg,3260,0.019335
5,Halle_Saale,2937,0.017419
6,München,2785,0.016518
7,Frankfurt_am_Main,2704,0.016037
8,Essen,2652,0.015729
9,Hamburg,2445,0.014501


## Variable: condition

('condition', 0.24770326613644586, 41765)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [71]:
df.groupBy('condition').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,condition,count,perc_of_count_total
0,,41765,0.247703
1,well_kept,41366,0.245337
2,refurbished,18623,0.110451
3,fully_renovated,16562,0.098227
4,mint_condition,13675,0.081105
5,first_time_use,13167,0.078092
6,modernized,10935,0.064854
7,first_time_use_after_refurbishment,10230,0.060673
8,negotiable,1402,0.008315
9,need_of_renovation,881,0.005225


In [77]:
def generateCondition():
    p =[0.30,
        0.14,
        0.12,
        0.11,
        0.10,
        0.09,
        0.09,
        0.02,
        0.02,
        0.01
       ]
    categories =["well_kept",
                "refurbished",
                "fully_renovated",
                "mint_condition",
                "first_time_use",
                "modernized",
                "first_time_use_after_refurbishment",
                "negotiable",
                "need_of_renovation",
                "ripe_for_demolition"
                ]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generateCondition())

modernized


In [78]:
generate_condition = udf(generateCondition,'string')
df = df.withColumn('condition', F.when(F.isnull('condition'),generate_telekom_tv_offer()).otherwise(df['condition']))

In [79]:
df.groupBy('condition').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,condition,count,perc_of_count_total
0,well_kept,41366,0.245337
1,ONE_YEAR_FREE,39617,0.234964
2,refurbished,18623,0.110451
3,fully_renovated,16562,0.098227
4,mint_condition,13675,0.081105
5,first_time_use,13167,0.078092
6,modernized,10935,0.064854
7,first_time_use_after_refurbishment,10230,0.060673
8,negotiable,1402,0.008315
9,ON_DEMAND,1098,0.006512


## Variable: streetPlain

('streetPlain', 0.23574660901849842, 39749)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [80]:
df.groupBy('streetPlain').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,streetPlain,count,perc_of_count_total
0,,39749,0.235747
1,Hauptstraße,588,0.003487
2,Bahnhofstraße,544,0.003226
3,Leipziger_Straße,323,0.001916
4,Goethestraße,276,0.001637
5,Berliner_Straße,264,0.001566
6,Schillerstraße,245,0.001453
7,Gartenstraße,243,0.001441
8,Bahnhofstr.,241,0.001429
9,Hauptstr.,235,0.001394


Al observar los datos tan dispersos y la gran cantidad de nulos

In [81]:
df = df.drop('streetPlain')

## Variable: lift

('lift', 0.00010675586712453072, 18)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [82]:
df.groupBy('lift').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,lift,count,perc_of_count_total
0,False,129190,0.766211
1,True,39401,0.233683
2,,18,0.000107


Utilizamos para llenar esos 18 registros nulos el valor false

In [84]:
df = df.fillna({'lift':False})

## Variable: typeOfFlat

('typeOfFlat', 0.13713977308447353, 23123)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [85]:
df.groupBy('typeOfFlat').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,typeOfFlat,count,perc_of_count_total
0,apartment,82019,0.486445
1,,23123,0.13714
2,roof_storey,22151,0.131375
3,ground_floor,19642,0.116494
4,other,6121,0.036303
5,maisonette,5798,0.034387
6,raised_ground_floor,3701,0.02195
7,terraced_flat,2132,0.012645
8,penthouse,2052,0.01217
9,half_basement,1254,0.007437


In [86]:
def generateTypeOfFlat():
    p =[0.53,
        0.15,
        0.15,
        0.04,
        0.04,
        0.03,
        0.02,
        0.02,
        0.01,
        0.01
       ]
    categories =["apartment",
                "roof_storey",
                "ground_floor",
                "other",
                "maisonette",
                "raised_ground_floor",
                "terraced_flat",
                "penthouse",
                "half_basement",
                "loft"
                ]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generateTypeOfFlat())

ground_floor


In [92]:
generate_type_of_flat = udf(generateTypeOfFlat,'string')
df = df.withColumn('typeOfFlat', F.when(F.isnull('typeOfFlat'),generate_type_of_flat()).otherwise(df['typeOfFlat']))

In [90]:
df.groupBy('typeOfFlat').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,typeOfFlat,count,perc_of_count_total
0,apartment,82019,0.486445
1,ground_floor,24205,0.143557
2,roof_storey,22151,0.131375
3,40,13806,0.081882
4,other,6121,0.036303
5,maisonette,5798,0.034387
6,10,4754,0.028195
7,raised_ground_floor,3701,0.02195
8,terraced_flat,2132,0.012645
9,penthouse,2052,0.01217


## Variable: geo_plz

('geo_plz', 0.00010675586712453072, 18)

Como podemos observar al igual que heatingType, existe gran cantidad de valores nulos

In [93]:
df.groupBy('geo_plz').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,geo_plz,count,perc_of_count_total
0,9130.0,1406,0.008339
1,9126.0,1388,0.008232
2,9131.0,1212,0.007188
3,9112.0,1203,0.007135
4,9113.0,1048,0.006216
5,8056.0,757,0.00449
6,6217.0,698,0.00414
7,39112.0,626,0.003713
8,4157.0,622,0.003689
9,8523.0,582,0.003452


In [94]:
df = df.fillna({'geo_plz':9130})

## Variable: noRooms

('noRooms', 0.00010675586712453072, 18)


In [95]:
df.groupBy('noRooms').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,noRooms,count,perc_of_count_total
0,3.0,57387,0.340355
1,2.0,57081,0.338541
2,4.0,17209,0.102065
3,1.0,17078,0.101288
4,2.5,6001,0.035591
5,3.5,5458,0.032371
6,5.0,3499,0.020752
7,1.5,2161,0.012817
8,4.5,1217,0.007218
9,6.0,822,0.004875


In [96]:
df = df.fillna({'noRooms':3})

## Variable: floor

('floor', 0.00010675586712453072, 18)

In [97]:
df.groupBy('floor').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,floor,count,perc_of_count_total
0,1.0,40323,0.239151
1,2.0,36548,0.216762
2,,29617,0.175655
3,3.0,24508,0.145354
4,0.0,15545,0.092196
5,4.0,13135,0.077902
6,5.0,5131,0.030431
7,6.0,1531,0.00908
8,7.0,654,0.003879
9,8.0,359,0.002129


In [98]:
def generateTelekomUploadSpeed():
    p =[0.29
        0.24
        0.19
        0.15
        0.09
        0.04
       ]
    categories =[1,2,3,0,4,5]
    index = np.random.choice(np.arange(0, len(categories)), p=p)
    return categories[index]

print(generateTelekomUploadSpeed())

## Variable: noRoomsRange

('noRoomsRange', 0.00010675586712453072, 18)

In [99]:
df.groupBy('noRoomsRange').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,noRoomsRange,count,perc_of_count_total
0,2.0,63160,0.374594
1,3.0,62849,0.37275
2,1.0,19242,0.114122
3,4.0,18427,0.109288
4,5.0,4913,0.029138
5,,18,0.000107


In [100]:
df = df.fillna({'noRoomsRange':2})

## Variable: garden

('garden', 0.00010675586712453072, 18)

In [101]:
df.groupBy('garden').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,garden,count,perc_of_count_total
0,False,134390,0.797051
1,True,34201,0.202842
2,,18,0.000107


In [102]:
df = df.fillna({'garden':False})

## Variable: livingSpaceRange

('livingSpaceRange', 0.00010675586712453072, 18)

In [104]:
df.groupBy('livingSpaceRange').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,livingSpaceRange,count,perc_of_count_total
0,3.0,52370,0.3106
1,2.0,49104,0.29123
2,4.0,26215,0.155478
3,1.0,16668,0.098856
4,5.0,12422,0.073673
5,6.0,7767,0.046065
6,7.0,4045,0.02399
7,,18,0.000107


In [105]:
df = df.fillna({'livingSpaceRange':3})

## Variable: regio2

('regio2', 0.00010675586712453072, 18)

In [106]:
df.groupBy('regio2').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,regio2,count,perc_of_count_total
0,Leipzig,9278,0.055027
1,Chemnitz,8845,0.052459
2,Berlin,7243,0.042957
3,Dresden,4743,0.02813
4,Magdeburg,3260,0.019335
5,Halle_Saale,2937,0.017419
6,München,2785,0.016518
7,Frankfurt_am_Main,2704,0.016037
8,Essen,2652,0.015729
9,Hamburg,2445,0.014501


In [108]:
df = df.fillna({'regio2':'Leipzig'})

## Variable: regio3

('regio3', 0.00010675586712453072, 18)

In [109]:
df.groupBy('regio3').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,regio3,count,perc_of_count_total
0,Innenstadt,2760,0.016369
1,Stadtmitte,1757,0.010421
2,Altstadt,1554,0.009217
3,Sonnenberg,1349,0.008001
4,Kaßberg,1267,0.007514
5,Hilbersdorf,975,0.005783
6,Schloßchemnitz,943,0.005593
7,Mitte,924,0.00548
8,Zentrum,774,0.004591
9,Südstadt,727,0.004312


In [110]:
df = df.fillna({'regio3':'Innenstadt'})

## Variable: date

('date', 0.00010675586712453072, 18)

In [111]:
df.groupBy('date').count().sort(col("count").desc()).withColumn('perc_of_count_total', (F.col('count') / datasetSize)) \
  .toPandas().head(20)

Unnamed: 0,date,count,perc_of_count_total
0,May19,66187,0.392547
1,Oct19,62641,0.371516
2,Sep18,39763,0.23583
3,,18,0.000107


In [112]:
df = df.fillna({'date':'May19'})