# 📊 Análise Exploratória - Insights Iniciais

### ✅ Objetivo
Analisar a qualidade dos dados do dataset e identificar possíveis problemas para tratamento.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, when, trim

In [2]:
# 1. Build the SparkSession for this notebook; getOrCreate() reuses an
#    already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("ExemploShapePySpark")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/22 12:40:05 WARN Utils: Your hostname, codespaces-ef9f50, resolves to a loopback address: 127.0.0.1; using 10.0.11.209 instead (on interface eth0)
25/09/22 12:40:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/22 12:40:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# 2. Load the CSV into a Spark DataFrame (the counterpart of pandas.read_csv):
#    the first line supplies the column names and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("ncr_ride_bookings.csv")
)

                                                                                

In [4]:
# 3. Measure the DataFrame: number of columns (from the schema) and
#    number of rows (count() runs a Spark job over the data).
num_colunas = len(df.columns)
num_linhas = df.count()

In [15]:
# 4. Print the dataset "shape" (rows, columns), then its schema and the
#    summary statistics produced by describe().
print(f"Dimensão do dataset: ({num_linhas}, {num_colunas})")

df.printSchema()

df.describe().show()

# 5. Missing-value report per column.

# Total number of rows: reuse the count computed in the previous cell rather
# than triggering another full pass over the CSV with df.count().
total = num_linhas

# Textual placeholders that are treated as "missing" after trimming whitespace.
MISSING_TOKENS = ["", "null", "NA", "NaN"]


def _missing_count(column_name):
    """Return an aggregate Column that counts the missing cells of one column.

    A cell is considered missing when it is a real NULL or when its trimmed
    string representation is one of MISSING_TOKENS. The explicit isNull()
    test is required: comparing a NULL cell with a string yields NULL, which
    `when` treats as "no match", so real NULLs would otherwise be counted as 0.

    Parameters:
        column_name: name of the DataFrame column to inspect.

    Returns:
        A Column aliased with `column_name` holding the missing-cell count.
    """
    value = col(column_name)
    # Cast explicitly so non-string columns (date, timestamp, ...) are
    # compared through their string form.
    as_text = trim(value.cast("string"))
    is_missing = value.isNull() | as_text.isin(*MISSING_TOKENS)
    # NOTE: `sum` is pyspark.sql.functions.sum (imported at the top of the
    # notebook), not the Python builtin.
    return sum(when(is_missing, 1).otherwise(0)).alias(column_name)


# Build the one-row aggregate once and cache it, so that displaying it and
# pulling it to the driver do not each rescan the source data.
missing_df = df.select([_missing_count(c) for c in df.columns]).cache()
missing_df.show()
null_counts = missing_df.first().asDict()
missing_df.unpersist()

# Build (column, count, percentage) tuples. On an empty DataFrame the
# aggregate returns NULL (None) and total is 0, so fall back to 0 / 0.0
# instead of raising.
cols_nulls = [
    (col_name, count or 0, (100 * (count or 0) / total) if total else 0.0)
    for col_name, count in null_counts.items()
]

# Sort by number of missing values, largest first.
cols_nulls_sorted = sorted(cols_nulls, key=lambda x: x[1], reverse=True)

# Show the result.
print("Coluna | # Nulos | % Nulos")
for col_name, count, pct in cols_nulls_sorted:
    print(f"{col_name} | {count} | {pct:.2f}%")

Dimensão do dataset: (150000, 21)
root
 |-- Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Booking ID: string (nullable = true)
 |-- Booking Status: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Vehicle Type: string (nullable = true)
 |-- Pickup Location: string (nullable = true)
 |-- Drop Location: string (nullable = true)
 |-- Avg VTAT: string (nullable = true)
 |-- Avg CTAT: string (nullable = true)
 |-- Cancelled Rides by Customer: string (nullable = true)
 |-- Reason for cancelling by Customer: string (nullable = true)
 |-- Cancelled Rides by Driver: string (nullable = true)
 |-- Driver Cancellation Reason: string (nullable = true)
 |-- Incomplete Rides: string (nullable = true)
 |-- Incomplete Rides Reason: string (nullable = true)
 |-- Booking Value: string (nullable = true)
 |-- Ride Distance: string (nullable = true)
 |-- Driver Ratings: string (nullable = true)
 |-- Customer Rating: string (nullable = true)
 |-- Payment Met

                                                                                

+-------+----------------+--------------------+----------------+------------+---------------+-------------+-----------------+-----------------+---------------------------+---------------------------------+-------------------------+--------------------------+----------------+-----------------------+------------------+------------------+-------------------+-------------------+--------------+
|summary|      Booking ID|      Booking Status|     Customer ID|Vehicle Type|Pickup Location|Drop Location|         Avg VTAT|         Avg CTAT|Cancelled Rides by Customer|Reason for cancelling by Customer|Cancelled Rides by Driver|Driver Cancellation Reason|Incomplete Rides|Incomplete Rides Reason|     Booking Value|     Ride Distance|     Driver Ratings|    Customer Rating|Payment Method|
+-------+----------------+--------------------+----------------+------------+---------------+-------------+-----------------+-----------------+---------------------------+---------------------------------+---------

                                                                                

+----+----+----------+--------------+-----------+------------+---------------+-------------+--------+--------+---------------------------+---------------------------------+-------------------------+--------------------------+----------------+-----------------------+-------------+-------------+--------------+---------------+--------------+
|Date|Time|Booking ID|Booking Status|Customer ID|Vehicle Type|Pickup Location|Drop Location|Avg VTAT|Avg CTAT|Cancelled Rides by Customer|Reason for cancelling by Customer|Cancelled Rides by Driver|Driver Cancellation Reason|Incomplete Rides|Incomplete Rides Reason|Booking Value|Ride Distance|Driver Ratings|Customer Rating|Payment Method|
+----+----+----------+--------------+-----------+------------+---------------+-------------+--------+--------+---------------------------+---------------------------------+-------------------------+--------------------------+----------------+-----------------------+-------------+-------------+--------------+---------

                                                                                