In [93]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, when

In [94]:
from ydata_profiling import ProfileReport

In [95]:
# Show the installed PySpark version as this cell's output.
pyspark.__version__

'3.3.2'

In [96]:
# Configure a local-mode session named 'profiling'; "local[*]" runs Spark
# in-process with one worker thread per available core.
session_builder = SparkSession.builder.master("local[*]").appName('profiling')

# Reuse the active session when one already exists, otherwise start a new one.
spark = session_builder.getOrCreate()

23/06/20 20:48:14 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [97]:
# Location of the scraped offers CSV. The default is the original local path;
# set the OFFERS_CSV_PATH environment variable to run the notebook against a
# copy of the data stored elsewhere.
OFFERS_CSV_PATH = os.environ.get(
    "OFFERS_CSV_PATH",
    "/home/konradballegro/scripts/scraper/outputs/data/offers.csv",
)

# Read the CSV, treating the first line as column names and letting Spark
# infer each column's type from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "True")
    .csv(OFFERS_CSV_PATH)
)

                                                                                

In [98]:
# Shape of the loaded DataFrame: number of rows and number of columns.
num_rows, num_cols = df.count(), len(df.columns)

# Print both dimensions, one per line.
for label, value in (
    ("Number of rows: ", num_rows),
    ("Number of columns: ", num_cols),
):
    print(label, value)

Number of rows:  129589
Number of columns:  230




In [99]:
# List every column name next to its positional index in the DataFrame.
headers = df.columns
for position, column_name in enumerate(headers):
    print(f"{position}: {column_name}")

0: Offer from
1: Category
2: Show offers with VIN number
3: Has registration number
4: Vehicle brand
5: Vehicle model
6: Version
7: Generation
8: Year of production
9: Mileage
10: Engine capacity
11: Fuel type
12: Power
13: Gearbox
14: Range
15: Drive
16: Battery capacity
17: Battery ownership type
18: CO2 emissions
19: Particulate filter
20: City fuel consumption
21: Body type
22: Number of doors
23: Number of seats
24: Color
25: Metallic
26: Color type
27: Right-hand drive (Anglik)
28: Country of origin
29: Leasing
30: VAT margin
31: VAT invoice
32: Manufacturer warranty period
33: Financing possibility
34: First registration
35: Registered in Poland
36: First owner
37: Accident-free
38: Serviced at authorized service center
39: Condition
40: ABS
41: Apple CarPlay
42: Android Auto
43: Rear side airbags43
44: Driver side airbag
45: CD
46: Central locking
47: Electric front windows
48: Electrically adjustable passenger seat
49: Electrically adjustable mirrors
50: Immobilizer
51: Driver

In [100]:
# # Sample 20% of the rows without replacement
# sampled_df = df.sample(withReplacement=False, fraction=0.2)

In [101]:
# # Count the number of rows in the DataFrame
# num_rows = sampled_df.count()

# # Count the number of columns in the DataFrame
# num_cols = len(sampled_df.columns)

# # Print the shape of the DataFrame
# print("Number of rows: ", num_rows)
# print("Number of columns: ", num_cols)

In [102]:
# profile_sample = ProfileReport(sampled_df.toPandas(), tsmode=False, title="Pandas Profiling Report Sample Data")

In [103]:
# profile_sample.to_file("/home/konradballegro/notebooks/outputs/reports/profiling_sample.html")

In [104]:
# Columns that must be non-null for an offer to be kept, listed in the order
# their null checks are AND-ed onto the row filter.
required_columns = [
    "Accident-free",
    "Price",
    "Offer from",
    "Condition",
    "Vehicle brand",
    "Vehicle model",
    "Year of production",
    "Mileage",
    "Fuel type",
    "Power",
    "Gearbox",
    "Body type",
    "Number of doors",
]

# Start from the two equality conditions, then require every column above.
row_filter = (df["Currency"] == "PLN") & (df["Country of origin"] == "Polska")
for required in required_columns:
    row_filter = row_filter & df[required].isNotNull()


def _without_unit(column_name, unit):
    """Return the named column with spaces removed first, then `unit` removed."""
    spaces_removed = regexp_replace(col(column_name), " ", "")
    return regexp_replace(spaces_removed, unit, "")


# Keep the matching rows and project the columns used for profiling, casting
# Price/Mileage to float, Power to integer and Year of production to string.
profile_filtered = df.filter(row_filter).select(
    col("Price").cast("float").alias("Price"),
    "Offer from",
    "Condition",
    "Vehicle brand",
    "Vehicle model",
    col("Year of production").cast("string").alias("Year of production"),
    _without_unit("Mileage", "km").cast("float").alias("Mileage"),
    "Fuel type",
    _without_unit("Power", "KM").cast("integer").alias("Power"),
    "Gearbox",
    "Body type",
    "Number of doors",
    "URL path",
    "ID",
    "Epoch",
)

In [105]:
# Drop rows whose Price is null in the projected frame
# (presumably values that could not be cast to float -- TODO confirm).
price_is_present = col("Price").isNotNull()
profile_filtered = profile_filtered.filter(price_is_present)

In [106]:
# Convert the filtered Spark DataFrame to pandas, which is what the report is built from.
filtered_pandas = profile_filtered.toPandas()

# NOTE: this rebinds `profile_filtered` from a Spark DataFrame to a
# ProfileReport, so re-running this cell requires re-running the cells that
# build the filtered DataFrame first.
profile_filtered = ProfileReport(
    filtered_pandas,
    tsmode=False,
    title="Pandas Profiling Report filtered data",
)

                                                                                

In [107]:
# Destination of the HTML report. The default is the original local path;
# set the PROFILING_REPORT_PATH environment variable to write it elsewhere.
PROFILING_REPORT_PATH = os.environ.get(
    "PROFILING_REPORT_PATH",
    "/home/konradballegro/notebooks/outputs/reports/profiling_filtered.html",
)

# Make sure the target directory exists before writing the report into it.
# The guard skips a bare filename, whose dirname is "" and which
# os.makedirs would reject.
report_dir = os.path.dirname(PROFILING_REPORT_PATH)
if report_dir:
    os.makedirs(report_dir, exist_ok=True)

# Write the profiling report to the HTML file.
profile_filtered.to_file(PROFILING_REPORT_PATH)

Summarize dataset: 100%|█| 40/40 [00:12<00:00, 
Generate report structure: 100%|█| 1/1 [00:09<0
Render HTML: 100%|█| 1/1 [00:01<00:00,  1.14s/i
Export report to file: 100%|█| 1/1 [00:00<00:00
