In [1]:
# Importing required functions using %run
%run src/data_cleaning.py
%run src/trip_analysis.py
%run src/tip_analysis.py
%run src/fare_analysis.py
%run src/traffic_analysis.py


In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession


In [3]:
# Function to set environment variables
def set_environment_variables(
    java_home=r'C:\Program Files\Java\jdk-11',
    spark_home=r'C:\spark',
    pyspark_python=r'C:\Users\yazid\.conda\envs\spark_project\python.exe',
    pyspark_driver_python=None,
):
    """Point this process at the Java, Spark and Python installations to use.

    Writes JAVA_HOME, SPARK_HOME, PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON
    into ``os.environ``. Calling with no arguments sets the same values the
    notebook originally hard-coded; pass explicit paths to run it on a
    different machine without editing the function body.

    Args:
        java_home: Value for JAVA_HOME.
        spark_home: Value for SPARK_HOME.
        pyspark_python: Value for PYSPARK_PYTHON.
        pyspark_driver_python: Value for PYSPARK_DRIVER_PYTHON. When None
            (the default) the ``pyspark_python`` value is reused.

    Returns:
        None. The only effects are the environment update and a printed
        confirmation message.
    """
    if pyspark_driver_python is None:
        # Default to the same interpreter for the driver as for PYSPARK_PYTHON.
        pyspark_driver_python = pyspark_python

    os.environ.update({
        'JAVA_HOME': java_home,
        'SPARK_HOME': spark_home,
        'PYSPARK_PYTHON': pyspark_python,
        'PYSPARK_DRIVER_PYTHON': pyspark_driver_python,
    })
    print("Environment variables set successfully")

# Function to delete environment variables
def delete_environment_variables():
    """Remove the Java/Spark/PySpark variables from this process's environment.

    Variables that are not currently set are skipped without error, so the
    function can be called repeatedly. Prints a confirmation message when done.
    """
    spark_variables = (
        'JAVA_HOME',
        'SPARK_HOME',
        'PYSPARK_PYTHON',
        'PYSPARK_DRIVER_PYTHON',
    )
    for variable in spark_variables:
        # pop() with a default never raises when the variable is absent.
        os.environ.pop(variable, None)
    print("Environment variables deleted successfully")

# Set environment variables for this kernel process.
# This runs before the cell that builds the SparkSession further down.
# NOTE(review): presumably Spark reads these variables at session start-up — confirm.
set_environment_variables()


Environment variables set successfully


In [4]:
# Initialize Spark session with increased memory settings
try:
    spark = (
        SparkSession.builder
        .appName('NYC_taxi_analysis')
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "8g")
        .config("spark.memory.offHeap.enabled", "true")
        .config("spark.memory.offHeap.size", "8g")
        .getOrCreate()
    )
except Exception as e:
    # Only session start-up failures are reported with this message. Re-raise so
    # the cell visibly fails instead of leaving `spark` undefined for later cells.
    print(f"Error creating Spark Session: {e}")
    raise
print("Spark Session created successfully")

# Load every Parquet file in the input folder and combine them into one DataFrame.
folder_path = r'C:\Users\yazid\Desktop\spark_assigment\NYC'
output_path = r"C:\Users\yazid\Desktop\spark_assigment\combined_nyc_taxi_2021.parquet"
dataframes = []

# sorted() makes the read (and therefore union) order deterministic across runs.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.parquet'):
        file_path = os.path.join(folder_path, filename)
        print(f"Reading {file_path}")
        try:
            df = spark.read.parquet(file_path)
            dataframes.append(df)
        except Exception as e:
            # Best effort: a single unreadable file is reported and skipped.
            print(f"Error reading {file_path}: {e}")

if dataframes:
    combined_df = dataframes[0]
    for df in dataframes[1:]:
        # Union by column name; columns missing from one side are filled with nulls.
        combined_df = combined_df.unionByName(df, allowMissingColumns=True)

    combined_df = combined_df.dropDuplicates()
    combined_df.show(5)
    combined_df.printSchema()
    print(f"Total number of rows after removing duplicates: {combined_df.count()}")

    # The default save mode raises if the target already exists, which broke
    # re-running this cell; overwrite the previous export instead.
    combined_df.write.mode("overwrite").parquet(output_path)
else:
    # NOTE: combined_df is not defined in this case, so later cells cannot run.
    print("No Parquet files found.")



Spark Session created successfully
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\1.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\10.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\11.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\12.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\2.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\3.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\4.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\5.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\6.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\7.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\8.parquet
Reading C:\Users\yazid\Desktop\spark_assigment\NYC\9.parquet
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------

In [5]:
# Handle missing values
# The first call is made only for its displayed output; its return value is discarded.
# The second call rebinds combined_df to whatever handle_missing_values returns.
# NOTE(review): both helpers are defined outside this notebook, presumably in
# src/data_cleaning.py loaded by the %run cell — confirm.
# NOTE(review): because combined_df is rebound, re-running this cell feeds the
# already-processed frame back into handle_missing_values.
calculate_nan_percentage(combined_df)
combined_df = handle_missing_values(combined_df)


-RECORD 0-------------------------------------
 VendorID              | 0.0                  
 tpep_pickup_datetime  | 0.0                  
 tpep_dropoff_datetime | 0.0                  
 passenger_count       | 0.047847536337005184 
 trip_distance         | 0.0                  
 RatecodeID            | 0.047847536337005184 
 store_and_fwd_flag    | 0.047847536337005184 
 PULocationID          | 0.0                  
 DOLocationID          | 0.0                  
 payment_type          | 0.0                  
 fare_amount           | 0.0                  
 extra                 | 0.0                  
 mta_tax               | 0.0                  
 tip_amount            | 0.0                  
 tolls_amount          | 0.0                  
 improvement_surcharge | 0.0                  
 total_amount          | 0.0                  
 congestion_surcharge  | 0.047847536337005184 
 airport_fee           | 0.18254471188935859  



In [6]:
# Preview the first 5 rows of combined_df after the missing-value step above.
combined_df.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2021-10-01 00:50:24|  2021-10-01 01:02:11|            1.0|         4.05|       1.0|                 N|          24|          50|           4|      -14.5| -0.5|   -0.5|       0.

In [7]:
# Trip analysis job
# NOTE(review): analyze_trip is defined outside this notebook, presumably in
# src/trip_analysis.py loaded by the %run cell — confirm. Its return value is not used here.
analyze_trip(combined_df)


+----+------------------+------------------+
|hour|avg(trip_distance)|avg(trip_duration)|
+----+------------------+------------------+
|0   |5.697302468454763 |15.730267774016255|
|1   |5.339389276880548 |15.850162623066096|
|2   |5.281638347063696 |14.82795115797    |
|3   |8.981665757846745 |15.549170610945112|
|4   |45.895842964455724|16.76969686715944 |
|5   |43.51459739253192 |16.754208948062846|
|6   |28.825264044061754|16.470007003437665|
|7   |14.787098091862468|16.430668381762306|
|8   |12.428777984690617|15.92411902626665 |
|9   |8.48698598152477  |15.937159524944068|
|10  |6.890357112173898 |15.968707707679075|
|11  |7.591890247088315 |15.990086114978126|
|12  |5.918490297216475 |16.550210286584242|
|13  |6.369747729357942 |16.92855739941454 |
|14  |5.343977336571386 |18.008710551099977|
|15  |5.5549453596293095|18.798311988582775|
|16  |5.726637253255657 |18.979807123045916|
|17  |4.738831453140698 |17.867833479002556|
|18  |3.8434327806086697|16.217714356261908|
|19  |4.47

In [8]:
# Perform tip analysis
# NOTE(review): analyze_tips is defined outside this notebook, presumably in
# src/tip_analysis.py loaded by the %run cell — confirm. Its return value is not used here.
analyze_tips(combined_df)


+------------+-------------------+
|PULocationID|avg(tip_percentage)|
+------------+-------------------+
|252         |1989.087674627638  |
|1           |859.4629608102979  |
|130         |122.02896363647231 |
|62          |99.15102255958752  |
|176         |97.12963522296104  |
|265         |83.61822178057028  |
|251         |80.10595263395518  |
|214         |77.50651795756703  |
|243         |52.63534475327919  |
|92          |47.99232259803468  |
+------------+-------------------+
only showing top 10 rows

+-------------+----------+
|trip_distance|tip_amount|
+-------------+----------+
|         4.05|       0.0|
|         1.45|      2.45|
|         6.12|       0.0|
|         1.33|       0.0|
|         1.47|      2.16|
|         4.52|       2.5|
|         1.68|       0.0|
|          2.0|      2.65|
|          1.0|       0.0|
|          3.0|      4.65|
+-------------+----------+
only showing top 10 rows

+----+------------------+
|hour|avg(tip_amount)   |
+----+------------------+
|0

In [9]:
# Perform fare analysis
# NOTE(review): analyze_fares is defined outside this notebook, presumably in
# src/fare_analysis.py loaded by the %run cell — confirm. Its return value is not used here.
analyze_fares(combined_df)


+------------+------------+------------------+
|PULocationID|DOLocationID|  avg(fare_amount)|
+------------+------------+------------------+
|         234|         144| 9.145437997724684|
|          90|         142|11.901476576398503|
|          90|         231|10.009889675736114|
|         246|         249| 9.161221325233793|
|         114|         100|10.699848141432456|
|          79|         116|29.635694996028597|
|         129|          95| 18.88580357142857|
|         192|          44|           111.575|
|         231|         261| 6.431756088951642|
|         142|         144|19.875815966655978|
|         264|          36|26.884409090909088|
|         236|           1| 77.26832713754646|
|         148|         262|18.432332572298332|
|          48|         232|19.839425971298564|
|          10|          87| 51.37811111111111|
|          87|          33|10.446895986895989|
|         170|         179|18.258574766355146|
|         249|         225| 23.98541734860883|
|          25

In [1]:
# Perform traffic analysis
# Requires the `%run src/traffic_analysis.py` cell at the top of the notebook to
# have been executed in this kernel; otherwise analyze_traffic is undefined.
# The call unpacks three results: hourly, weekly and monthly speed frames
# (named *_pandas — presumably pandas DataFrames; confirm against analyze_traffic).
# Pass the combined taxi DataFrame rather than the leftover `your_dataframe` placeholder.
df_speed_hour_pandas, df_speed_week_pandas, df_speed_month_pandas = analyze_traffic(combined_df)



NameError: name 'analyze_traffic' is not defined

In [None]:
# Convert a 1% sample of the Spark DataFrame to a Pandas DataFrame for plotting.
# A fixed seed keeps the sample (and therefore the plots) reproducible across runs.
sampled_df = combined_df.sample(fraction=0.01, seed=42).toPandas()

# Box plots are only meaningful for numeric data, so timestamp and string
# columns are left out instead of being passed to seaborn.
numeric_columns = sampled_df.select_dtypes(include='number').columns

# Generate one box plot per numeric column
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=sampled_df[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    ax.set_xlabel(column)
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)
