In [3]:
# Instalar PySpark via pip
!pip install pyspark



In [2]:
import kagglehub

# Download the latest version of each dataset. dataset_download() returns the
# local directory holding the extracted files; each location is kept under its
# own name so the first one is not lost when the second download runs.
cars_dataset_path = kagglehub.dataset_download("satvshr/top-4-used-car-sales-datasets-combined")

print("Path to dataset files:", cars_dataset_path)

imdb_dataset_path = kagglehub.dataset_download("octopusteam/imdb-top-rated-titles-movies-and-tv-series")

print("Path to dataset files:", imdb_dataset_path)

# Backward compatibility: `path` used to end up pointing at the last download.
path = imdb_dataset_path

Downloading from https://www.kaggle.com/api/v1/datasets/download/satvshr/top-4-used-car-sales-datasets-combined?dataset_version_number=2...


100%|██████████| 446k/446k [00:00<00:00, 61.5MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/satvshr/top-4-used-car-sales-datasets-combined/versions/2





Downloading from https://www.kaggle.com/api/v1/datasets/download/octopusteam/imdb-top-rated-titles-movies-and-tv-series?dataset_version_number=13...


100%|██████████| 140k/140k [00:00<00:00, 43.5MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/octopusteam/imdb-top-rated-titles-movies-and-tv-series/versions/13





In [29]:
# Start a Spark session for the notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ExemploPySpark")
    .getOrCreate()
)

In [30]:
import pandas as pd
# Load the used-car sales CSV from the kagglehub cache into a pandas DataFrame.
df_pandas = pd.read_csv(
    filepath_or_buffer="/root/.cache/kagglehub/datasets/satvshr/top-4-used-car-sales-datasets-combined/versions/2/output.csv",
)

In [14]:
# Preview the first four rows of the pandas DataFrame.
df_pandas.head(n=4)

Unnamed: 0,brand,model,transmission,age,fuel,engine,km,owner,price,location,mileage,power,seats,type
0,mahindra,thar,manual,4.0,diesel,2184.0,11003.0,1.0,1231000.0,,,,,
1,hyundai,verna,manual,6.0,petrol,1591.0,66936.0,1.0,786000.0,,,,,
2,tata,harrier,manual,2.0,diesel,1956.0,27990.0,1.0,1489000.0,,,,,
3,honda,city,automatic,1.0,petrol,1498.0,5061.0,1.0,1227000.0,,,,,


In [15]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-active session when there is one.
spark = (
    SparkSession.builder
    .appName("ExemploLeitura")
    .getOrCreate()
)

# Read the CSV file with Spark: the first row is the header and column types
# are inferred from the data.
df_pyspark = spark.read.csv(
    "/root/.cache/kagglehub/datasets/satvshr/top-4-used-car-sales-datasets-combined/versions/2/output.csv",
    header=True,
    inferSchema=True,
)

In [31]:
# Column names of the pandas DataFrame, one per line
for column_name in df_pandas.columns:
    print(column_name)

brand
model
transmission
age
fuel
engine
km
owner
price
location
mileage
power
seats
type


In [17]:
# Select specific columns - pandas
df_selecionado_pandas = df_pandas[
    ['brand', 'model', 'transmission']
]

In [18]:
# Select specific columns - PySpark (select() also accepts a list of names)
df_selecionado_spark = df_pyspark.select(['brand', 'model', 'transmission'])

In [19]:
# Keep only the rows whose `price` is greater than 2 - pandas
# NOTE(review): the sample rows shown by head() have prices in the hundreds of
# thousands and above, so confirm that 2 is the intended threshold.
df_filtrado_pandas = df_pandas[df_pandas['price'].gt(2)]

In [20]:
# Keep only the rows whose `price` is greater than 2 - PySpark
# (where() is an alias of filter()).
df_filtrado_spark = df_pyspark.where(df_pyspark["price"] > 2)

In [23]:
# Mean of `km` for each `age` group - pandas
df_agregado_pandas = (
    df_pandas
    .groupby('age')['km']
    .mean()
    .reset_index()
)

In [24]:
# Mean of `km` for each `age` group - PySpark
from pyspark.sql.functions import avg

# The aggregated column is exposed under the name "media_age_km".
df_agregado_spark = (
    df_pyspark
    .groupBy("age")
    .agg(avg("km").alias("media_age_km"))
)

In [25]:
# Print the first five rows of each Spark result, in order:
# column selection, row filter, aggregation.
for spark_result in (df_selecionado_spark, df_filtrado_spark, df_agregado_spark):
    spark_result.show(5)

+--------+--------+------------+
|   brand|   model|transmission|
+--------+--------+------------+
|mahindra|    thar|      manual|
| hyundai|   verna|      manual|
|    tata| harrier|      manual|
|   honda|    city|   automatic|
|    ford|ecosport|      manual|
+--------+--------+------------+
only showing top 5 rows

+--------+--------+------------+---+------+------+-------+-----+---------+--------+-------+-----+-----+----+
|   brand|   model|transmission|age|  fuel|engine|     km|owner|    price|location|mileage|power|seats|type|
+--------+--------+------------+---+------+------+-------+-----+---------+--------+-------+-----+-----+----+
|mahindra|    thar|      manual|4.0|diesel|2184.0|11003.0|  1.0|1231000.0|    NULL|   NULL| NULL| NULL|NULL|
| hyundai|   verna|      manual|6.0|petrol|1591.0|66936.0|  1.0| 786000.0|    NULL|   NULL| NULL| NULL|NULL|
|    tata| harrier|      manual|2.0|diesel|1956.0|27990.0|  1.0|1489000.0|    NULL|   NULL| NULL| NULL|NULL|
|   honda|    city|   au

In [35]:
# Side-by-side combination of two pandas DataFrames.
# Concatenating along the column axis lines rows up by index label; it does
# not match rows on a key column.

df1 = pd.read_csv(
    filepath_or_buffer="/root/.cache/kagglehub/datasets/satvshr/top-4-used-car-sales-datasets-combined/versions/2/output.csv",
)
df2 = pd.read_csv(
    filepath_or_buffer="/root/.cache/kagglehub/datasets/octopusteam/imdb-top-rated-titles-movies-and-tv-series/versions/13/data.csv",
)

df_concat_horizontal = pd.concat(objs=[df1, df2], axis="columns")

In [36]:
# Left join of two Spark DataFrames on the `type` column.
# Note: this rebinds df1/df2, which the pandas cell used for pandas DataFrames.
df1 = spark.read.csv(
    "/root/.cache/kagglehub/datasets/satvshr/top-4-used-car-sales-datasets-combined/versions/2/output.csv",
    inferSchema=True,
    header=True,
)
df2 = spark.read.csv(
    "/root/.cache/kagglehub/datasets/octopusteam/imdb-top-rated-titles-movies-and-tv-series/versions/13/data.csv",
    inferSchema=True,
    header=True,
)

# Every row of df1 is kept; df2 columns are filled in where `type` matches.
df_juncao = df1.join(df2, "type", "left")

df_juncao.show()

+----+--------+--------+------------+----+------+------+-------+-----+---------+--------+-------+-----+-----+----+-----+------+-------------+--------+-----------+
|type|   brand|   model|transmission| age|  fuel|engine|     km|owner|    price|location|mileage|power|seats|  id|title|genres|averageRating|numVotes|releaseYear|
+----+--------+--------+------------+----+------+------+-------+-----+---------+--------+-------+-----+-----+----+-----+------+-------------+--------+-----------+
|NULL|mahindra|    thar|      manual| 4.0|diesel|2184.0|11003.0|  1.0|1231000.0|    NULL|   NULL| NULL| NULL|NULL| NULL|  NULL|         NULL|    NULL|       NULL|
|NULL| hyundai|   verna|      manual| 6.0|petrol|1591.0|66936.0|  1.0| 786000.0|    NULL|   NULL| NULL| NULL|NULL| NULL|  NULL|         NULL|    NULL|       NULL|
|NULL|    tata| harrier|      manual| 2.0|diesel|1956.0|27990.0|  1.0|1489000.0|    NULL|   NULL| NULL| NULL|NULL| NULL|  NULL|         NULL|    NULL|       NULL|
|NULL|   honda|    cit