<font color = 'green'>***Analyzing NYC Taxi Trip Records: A Comparative Study of Passenger Behaviors and Fare Patterns between March and May***</font>

*Installing Required packages*

In [None]:
!pip install tensorflow-data-validation

In [None]:
!pip install apache-beam

*Importing all the necessary libraries*

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_data_validation as tfdv

In [None]:
# Mount Google Drive at /content/drive so the Parquet files stored there can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

***Datasets downloaded from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page***

In [None]:
# Read the March and May 2020 yellow-taxi Parquet files into pandas DataFrames.
march_data, may_data = map(
    pd.read_parquet,
    (
        '/content/drive/MyDrive/yellow_tripdata_2020-03.parquet',
        '/content/drive/MyDrive/yellow_tripdata_2020-05.parquet',
    ),
)

1) Prep the data in order to be ready to be fed to a model.

Look for missing, null, NaN records.

Find outliers.

Transform data – all entries should be numeric.

In [None]:
# Preview the first rows of the March data.
march_data.head()

In [None]:
# (rows, columns) for March, then May, one tuple per line.
print(march_data.shape, may_data.shape, sep='\n')

In [None]:
# Column names, dtypes and non-null counts for the March data.
march_data.info()

In [None]:
# Column names, dtypes and non-null counts for the May data.
may_data.info()

In [None]:
# Count missing values per column in the March data.
march_data.isna().sum()

In [None]:
# Count missing values per column in the May data
# (the previous cell already covers March).
may_data.isna().sum()

In [None]:
# Remove the airport_fee column from both months.
# errors='ignore' makes the cell safe to re-run after the column is already gone.
march_data = march_data.drop(columns=['airport_fee'], errors='ignore')
may_data = may_data.drop(columns=['airport_fee'], errors='ignore')

In [None]:
# Forward-fill missing values (each gap takes the value from the previous row).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE(review): a missing value in the very first row has nothing to copy from and
# stays NaN; the null counts in the following cells show whether any remain.
march_data = march_data.ffill()
may_data = may_data.ffill()

In [None]:
# Re-check the per-column missing-value counts for March after filling.
march_data.isna().sum()

In [None]:
# Re-check the per-column missing-value counts for May after filling.
# .sum() counts the True (missing) entries; .count() would only report the row count.
may_data.isna().sum()

In [None]:
# Summary statistics for the March data.
march_data.describe()

In [None]:
# Summary statistics for the May data.
may_data.describe()

In [None]:
# Handle outliers (assuming fare_amount and trip_distance are relevant columns)
def remove_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``column`` and ``IQR = Q3 - Q1``.
    Rows whose value is NaN fail both comparisons and are dropped.

    Args:
        df: Input frame; it is not modified.
        column: Name of the numeric column used to detect outliers.
        factor: Fence multiplier applied to the IQR (1.5 is the conventional default).

    Returns:
        An independent copy of the retained rows, so later column assignments on the
        result do not trigger pandas' SettingWithCopyWarning.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    inside_fences = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    return df[inside_fences].copy()

# For each month: filter on fare_amount first, then on trip_distance of what remains.
march_data = remove_outliers(remove_outliers(march_data, 'fare_amount'), 'trip_distance')
may_data = remove_outliers(remove_outliers(may_data, 'fare_amount'), 'trip_distance')

In [None]:
# Column dtypes before the numeric conversion below.
march_data.dtypes

In [None]:
# Encode store_and_fwd_flag as an integer: 1 where the value is 'Y', 0 for anything else.
# The comparison is done on the whole column at once instead of row by row.
march_data['store_and_fwd_flag'] = (march_data['store_and_fwd_flag'] == 'Y').astype('int64')
may_data['store_and_fwd_flag'] = (may_data['store_and_fwd_flag'] == 'Y').astype('int64')

In [None]:
# Column dtypes after the numeric conversion above.
march_data.dtypes

2) List all types of data, numeric, categorical

In [None]:
# Task 2: List all types of data
# Numeric columns are selected via np.number; object-dtype (string-like) columns are
# selected with the 'object' dtype name because the np.object alias no longer exists
# in current NumPy releases and raises AttributeError.
march_numeric_columns = march_data.select_dtypes(include=[np.number]).columns.tolist()
march_categorical_columns = march_data.select_dtypes(include=['object']).columns.tolist()
may_numeric_columns = may_data.select_dtypes(include=[np.number]).columns.tolist()
may_categorical_columns = may_data.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Names of the numeric columns in the March data.
march_numeric_columns

In [None]:
# Expected to be empty: airport_fee was dropped and store_and_fwd_flag was encoded
# as 0/1 in earlier cells, so no object-dtype columns should remain.
march_categorical_columns

In [None]:
# Names of the object-dtype columns left in the May data.
may_categorical_columns

3) Perform EDA on data
Utilize both:

Classic approach in EDA (Pandas, Numpy libraries)

The TFDV (TensorFlow Data Validation) module with the powerful graphical statistics
generated (apache beam library…)

Present dependencies and correlations among the various features in the data.

List the most important variables (Feature Importance) that will affect the target label.

In [None]:
# Basic information about the data
print("Basic Info:")
# DataFrame.info() writes its report itself and returns None, so it is called bare;
# wrapping it in print() would append a stray "None" line.
march_data.info()

In [None]:
# Summary statistics of the cleaned March data, kept in a variable and displayed.
march_summary_stats = march_data.describe()
march_summary_stats

In [None]:
# Correlation matrix
# Restrict to numeric columns first: recent pandas no longer drops non-numeric
# columns inside corr(), so any non-numeric column left in the frame would raise.
march_correlation_matrix = march_data.select_dtypes(include=[np.number]).corr()
march_correlation_matrix

In [None]:
# Create subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Histogram for 'fare_amount' in march_data
# (titles and axis labels name the column that is actually plotted)
sns.histplot(march_data['fare_amount'], bins=30, kde=True, color='blue', ax=axes[0])
axes[0].set_title('Distribution of fare_amount (March data)')
axes[0].set_xlabel('Fare Amount')
axes[0].set_ylabel('Frequency')

# Histogram for 'fare_amount' in may_data
sns.histplot(may_data['fare_amount'], bins=30, kde=True, color='olive', ax=axes[1])
axes[1].set_title('Distribution of fare_amount (May data)')
axes[1].set_xlabel('Fare Amount')
axes[1].set_ylabel('Frequency')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Side-by-side trip_distance histograms, one panel per month.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# (frame, bar colour, panel title) for the left and right panel respectively
hist_panels = [
    (march_data, 'blue', 'Distribution of trip_distance (march data)'),
    (may_data, 'olive', 'Distribution of trip_distance (may data)'),
]
for panel_ax, (trips, bar_color, heading) in zip(axes, hist_panels):
    sns.histplot(trips['trip_distance'], bins=30, kde=True, color=bar_color, ax=panel_ax)
    panel_ax.set_title(heading)
    panel_ax.set_xlabel('Distance')
    panel_ax.set_ylabel('Frequency')

# Tighten spacing, then render
plt.tight_layout()
plt.show()


In [None]:
# Create subplots with 1 row and 2 columns
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Box plot for 'trip_distance' (March) in the first subplot
sns.boxplot(y=march_data['trip_distance'], ax=axes[0])
axes[0].set_title('Box Plot of Trip Distance - march data')
axes[0].set_ylabel('Trip Distance')

# Box plot for 'trip_distance' (May) in the second subplot; the title names May
# because this panel draws may_data
sns.boxplot(y=may_data['trip_distance'], ax=axes[1], color = 'orange')
axes[1].set_title('Box Plot of Trip Distance - may data')
axes[1].set_ylabel('Trip Distance')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Violin plots of each feature by passenger_count, one row per feature and one
# column per month. plt and sns are already imported in the notebook's import cell,
# so they are not re-imported here.
features = ['fare_amount', 'trip_distance']
data_sets = [march_data, may_data]
titles = ['March', 'May']

# squeeze=False keeps `axes` two-dimensional even when one of the lists above has a
# single entry, so axes[i][j] indexing below always works.
fig, axes = plt.subplots(
    nrows=len(features), ncols=len(data_sets), figsize=(16, 8), squeeze=False
)

# Plot violin plots for each feature and dataset
for i, feature in enumerate(features):
    for j, data in enumerate(data_sets):
        sns.violinplot(x='passenger_count', y=feature, data=data, palette='coolwarm', ax=axes[i][j])
        axes[i][j].set_title(f'Violin Plot of {feature} by Passenger Count ({titles[j]})')
        axes[i][j].set_xlabel('Passenger Count')
        axes[i][j].set_ylabel(feature)

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Correlation heatmap for the March data.
# Restrict to numeric columns first: recent pandas no longer drops non-numeric
# columns inside corr(), so any non-numeric column left in the frame would raise.
correlation_matrix = march_data.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Correlation heatmap for the May data.
# Restrict to numeric columns first: recent pandas no longer drops non-numeric
# columns inside corr(), so any non-numeric column left in the frame would raise.
may_correlation_matrix = may_data.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12, 8))
sns.heatmap(may_correlation_matrix, annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Use the March data for training and the May data for evaluation.
# These are plain aliases of the same DataFrame objects, not copies.
training_data = march_data
testing_data = may_data

In [None]:
# Compute TFDV statistics from the March DataFrame
stats = tfdv.generate_statistics_from_dataframe(march_data)

# Infer a schema from those March statistics
schema = tfdv.infer_schema(stats)

# Print a heading, then render the inferred schema
print("\nSchema:")
tfdv.display_schema(schema)

In [None]:
# Render TFDV's interactive view of the March statistics.
tfdv.visualize_statistics(stats)

In [None]:
# Validate the May statistics against the schema inferred from March.
# Checking the March statistics against the schema derived from those same statistics
# is not expected to surface anything, so the evaluation month is validated instead.
may_stats = tfdv.generate_statistics_from_dataframe(may_data)
anomalies = tfdv.validate_statistics(statistics=may_stats, schema=schema)
tfdv.display_anomalies(anomalies)

In [None]:
# Rank features by the absolute value of their correlation with trip_distance,
# using the March correlation matrix computed for the heatmap.
# The trip_distance row itself is dropped: its self-correlation is always 1.0 and
# would otherwise sit at the top of the ranking.
feature_importance = (
    correlation_matrix['trip_distance']
    .drop(labels='trip_distance')
    .abs()
    .sort_values(ascending=False)
)
print("Feature Importance:")
print(feature_importance)

4) Be aware of the time-window selection for the data.
March 2020 was when the COVID-19 pandemic broke out in the US.
Needless to say, every industry and business initiative was impacted drastically.

Starting March 2020, the NYC Taxi industry has established a ‘new normal’.

<< Extra Credit >>:

January 2020 data presents the ‘baseline’ of what the NYC Taxi business used to be.

Compare the data of Jan-2020 vs Mar-2020.

Present your findings.


In [None]:
# Load the January 2020 baseline and apply the same IQR outlier filter that
# march_data went through, so the January-vs-March comparisons below are made on
# data prepared the same way.
data_january = pd.read_parquet('/content/drive/MyDrive/yellow_tripdata_2020-01.parquet')
data_january = remove_outliers(remove_outliers(data_january, 'fare_amount'), 'trip_distance')

In [None]:
# Preview the first rows of the January data.
data_january.head()

In [None]:
# Summary statistics for January 2020 and March 2020, kept for the printout below.
january_stats = data_january.describe()
march_stats = march_data.describe()

In [None]:
# Mean fare_amount for each month.
average_fare_january = data_january['fare_amount'].mean()
average_fare_march = march_data['fare_amount'].mean()

In [None]:
# Mean trip_distance for each month.
average_distance_january = data_january['trip_distance'].mean()
average_distance_march = march_data['trip_distance'].mean()

In [None]:
# Row count of each frame, used as the number of rides.
# NOTE(review): march_data was reduced by remove_outliers() earlier in the notebook,
# so its row count is the number of retained trips rather than every March trip.
total_rides_january = data_january.shape[0]
total_rides_march = march_data.shape[0]

In [None]:
# Print the January and March summary tables under their headings, one item per line.
print(
    "Comparison between January 2020 and March 2020:",
    "Summary Statistics:",
    "January 2020:",
    january_stats,
    "March 2020:",
    march_stats,
    sep="\n",
)

In [None]:
# Heading followed by one "label value" line per month.
print(
    "Average Fare Amount:",
    " ".join(["January 2020: $", str(average_fare_january)]),
    " ".join(["March 2020: $", str(average_fare_march)]),
    sep="\n",
)

In [None]:
# Heading followed by one "label value miles" line per month.
print(
    "Average Trip Distance:",
    " ".join(["January 2020:", str(average_distance_january), "miles"]),
    " ".join(["March 2020:", str(average_distance_march), "miles"]),
    sep="\n",
)

In [None]:
# Heading followed by one "label count" line per month.
print(
    "Total Number of Rides:",
    " ".join(["January 2020:", str(total_rides_january)]),
    " ".join(["March 2020:", str(total_rides_march)]),
    sep="\n",
)

In [None]:
# Mean fare_amount per month, drawn as a two-bar chart.
january_avg_fare = data_january['fare_amount'].mean()
march_avg_fare = march_data['fare_amount'].mean()

# x categories and their bar heights, in matching order
months = ['January', 'March']
avg_fares = [january_avg_fare, march_avg_fare]

sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
# barplot draws on the current figure's axes and returns them, so the title and
# axis labels are set on that object directly
bar_ax = sns.barplot(x=months, y=avg_fares, palette=['skyblue', 'olive'])
bar_ax.set_title('Average Trip Fare by Month')
bar_ax.set_xlabel('Month')
bar_ax.set_ylabel('Average Fare Amount')
plt.show()