PROCESS

In [None]:
# Install needed packages, but only when they are not already available.
# Reinstalling on every run is slow, needs network access, and can silently
# change package versions between runs.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE)
}
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
if (!requireNamespace("geosphere", quietly = TRUE)) {
  install.packages("geosphere")
}


In [None]:
# Load the packages
library(readr)
library(dplyr)
library(purrr)
library(tidyr)
library(lubridate)
library(ggplot2)

In [None]:
# Point the session at the folder that holds the downloaded trip data;
# the relative file listing that follows is resolved against this folder.
data_directory <- "/home/ladipo/Desktop/Data Analytic/DataSets/Cyclic_Data"
setwd(data_directory)

In [None]:
# List all CSV files in the working directory set above.
# `pattern` is a regular expression, not a shell glob, so the extension is
# anchored explicitly: "\\.csv$" matches only names that end in ".csv".
files <- list.files(pattern = "\\.csv$")

# Stop early with a clear message instead of failing later on an empty
# combined data frame.
if (length(files) == 0) {
  stop("No CSV files found in ", getwd(), call. = FALSE)
}

In [None]:
# Parse every listed CSV with readr; the result is a list holding one
# data frame per input file, in the same order as `files`.
data_list <- map(files, read_csv)

In [None]:
# Stack the per-file data frames on top of each other into one data frame
combined_data <- data_list %>%
  bind_rows()

In [None]:
# Preview the first six rows of the combined data
head(combined_data, n = 6L)

In [None]:
# Open the combined data in the interactive data viewer for manual inspection.
# View() needs an interactive front end that provides a data viewer.
View(combined_data)

In [None]:
# Structure of the data
str(combined_data)

# Print the data frame as loaded, before any cleaning
print("Original Data Frame:")
print(combined_data)

# Flag duplicate rows once and reuse the mask below; duplicated() scans the
# whole data frame, so computing it twice doubles the cost for no benefit.
is_duplicate <- duplicated(combined_data)

# Show the rows that repeat an earlier row
duplicate_rows <- combined_data[is_duplicate, ]
print("Duplicate Rows:")
print(duplicate_rows)

# Keep only the first occurrence of each row
combined_data <- combined_data[!is_duplicate, ]

print("Data Frame after Removing Duplicates:")
print(combined_data)


# Print the data frame before missing values are filled
print("Original Data Frame:")
print(combined_data)

# Identify missing values (logical matrix: TRUE where a cell is NA)
missing_values <- is.na(combined_data)
print("Missing Values in Data Frame:")
# Report the number of missing cells per column; the full logical matrix has
# one row per ride and is not readable when printed.
print(colSums(missing_values))

# Fill missing station names and ids with the placeholder "Unknown".
# No mean imputation is done here: station ids are identifiers, not
# quantities, and a mean must never be written into `started_at`, because
# that overwrites valid ride start times for rows with a missing station id.
station_columns <- c(
  "start_station_name",
  "start_station_id",
  "end_station_name",
  "end_station_id"
)
for (station_column in station_columns) {
  column_is_missing <- is.na(combined_data[[station_column]])
  combined_data[[station_column]][column_is_missing] <- "Unknown"
}

print("Data Frame after Filling Missing Values:")
print(combined_data)

In [None]:
# Make sure both ride timestamps are date-times, using lubridate for parsing.
# Columns that are already POSIXct are left untouched: ymd_hms() parses text,
# so re-parsing a date-time goes through a text round trip that is redundant
# and may lose values whose text form omits the time part (exact midnight).
for (time_column in c("started_at", "ended_at")) {
  if (!inherits(combined_data[[time_column]], "POSIXct")) {
    combined_data[[time_column]] <- ymd_hms(combined_data[[time_column]])
  }
}

In [None]:
# Ride length as a lubridate duration: the span from start to end of each ride
combined_data[["time_diff"]] <- with(
  combined_data,
  as.duration(interval(started_at, ended_at))
)

In [None]:
# Express the ride duration as a plain number of minutes
combined_data[["ride_length"]] <- as.numeric(
  combined_data[["time_diff"]],
  units = "mins"
)

In [None]:
# Numeric day of the week of the ride start
# (with lubridate's default week start: 1 = Sunday, 2 = Monday, ...)
combined_data <- combined_data %>%
  mutate(day_of_week_num = wday(started_at))

In [None]:
# Labelled day of the week of the ride start (an ordered factor)
combined_data <- combined_data %>%
  mutate(day_of_week_label = wday(started_at, label = TRUE))

In [None]:
# Month of the ride start, both as a number and as a full (unabbreviated) name
combined_data <- combined_data %>%
  mutate(
    month_number = month(started_at),
    month_name = month(started_at, label = TRUE, abbr = FALSE)
  )

In [None]:
library(geosphere)

# Haversine (great-circle) distance between the start and end coordinates of
# each ride, in meters. Points are passed as (longitude, latitude) pairs.
combined_data[["distance"]] <- with(
  combined_data,
  distHaversine(
    p1 = cbind(start_lng, start_lat),
    p2 = cbind(end_lng, end_lat)
  )
)

head(combined_data)

In [None]:
# Keep only rides that have both a start and an end timestamp
combined_data_clean <- filter(
  combined_data,
  !is.na(started_at),
  !is.na(ended_at)
)

head(combined_data_clean)

In [None]:
# Save the cleaned data frame as a CSV file, without a row-name column
output_csv_path <- "/home/ladipo/Desktop/Data Analytic/Coursera Training/combined_data_clean_bike_vscode.csv"
write.csv(combined_data_clean, file = output_csv_path, row.names = FALSE)

ANALYSIS

In [None]:
# Per-month summary over all rides:
#   average_ride_length - mean ride length in minutes, rounded up
#   count               - number of rides in units of 10,000, rounded up
combined_data_clean_month <- summarise(
  group_by(combined_data_clean, month_name),
  average_ride_length = ceiling(mean(ride_length)),
  count = ceiling(n() / 10000)
)

View(combined_data_clean_month)

In [None]:
# Bar chart of the average ride length per month, with value labels,
# a single fill color and a white plot background
p1 <- ggplot(
  data = combined_data_clean_month,
  mapping = aes(x = month_name, y = average_ride_length)
) +
  geom_col(fill = "#5F9EA0") +
  # Data labels drawn next to the top of each bar
  geom_text(
    mapping = aes(label = average_ride_length),
    size = 4,
    hjust = 1,
    vjust = -0.5
  ) +
  ggtitle("Average Ride Length by Month of the Year") +
  xlab("Month of the Year") +
  ylab("Average Ride Length (minutes)") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    plot.background = element_rect(fill = "white")
  )

# Render the plot
print(p1)



In [None]:
# Bar chart of the ride count per month (in units of 10,000 rides), with
# value labels, a single fill color and a white plot background
p2 <- ggplot(
  data = combined_data_clean_month,
  mapping = aes(x = month_name, y = count)
) +
  geom_col(fill = "#FFA07A") +
  # Data labels drawn next to the top of each bar
  geom_text(
    mapping = aes(label = count),
    size = 4,
    hjust = 1,
    vjust = -0.5
  ) +
  ggtitle("Number of Rides by Month of the Year") +
  xlab("Month of the Year") +
  ylab("Number of Rides(10000)") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    plot.background = element_rect(fill = "white")
  )

# Render the plot
print(p2)

In [None]:
# Per-weekday summary over all rides:
#   mean_ride_length - mean ride length in minutes, rounded up
#   count            - number of rides in units of 10,000, rounded up
combined_data_clean_day <- summarise(
  group_by(combined_data_clean, day_of_week_label),
  mean_ride_length = ceiling(mean(ride_length)),
  count = ceiling(n() / 10000)
)

View(combined_data_clean_day)

In [None]:
# ggplot2 provides the plotting functions used below
library(ggplot2)

# Bar chart of the average ride length per day of the week, with value
# labels, a single fill color and a white plot background
p1 <- ggplot(
  data = combined_data_clean_day,
  mapping = aes(x = day_of_week_label, y = mean_ride_length)
) +
  geom_col(fill = "#5F9EA0") +
  # Data labels drawn next to the top of each bar
  geom_text(
    mapping = aes(label = mean_ride_length),
    size = 4,
    hjust = 1,
    vjust = -0.5
  ) +
  ggtitle("Average Ride Length by Day of the Week") +
  xlab("Day of the Week") +
  ylab("Average Ride Length (minutes)") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    plot.background = element_rect(fill = "white")
  )

# Render the plot
print(p1)


In [None]:
# Bar chart of the ride count per day of the week, with value labels,
# a single fill color and a white plot background.
# The plotted `count` column is expressed in units of 10,000 rides, so the
# y-axis label states that unit (same label as the per-month count plot).
p2 <- ggplot(combined_data_clean_day, aes(x = day_of_week_label, y = count)) +
  geom_col(fill = "#FFA07A") +
  geom_text(aes(label = count), vjust = -0.5) +  # Data labels above the bars
  labs(title = "Number of Rides by Day of the Week",
       x = "Day of the Week",
       y = "Number of Rides(10000)") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    plot.background = element_rect(fill = "white")
  )

# Render the plot
print(p2)

In [None]:
# Count member and casual rides for each day of the week and express each
# group as a percentage of that day's rides.
# Inside summarize(), `member_casual` refers to the rider-type column of
# combined_data_clean, not to the result object being assigned.
# Percentages are rounded to the nearest whole number; rounding both shares
# up with ceiling() can make a single day add up to 101%.
member_casual <- combined_data_clean %>%
  group_by(day_of_week_label) %>%
  summarize(
    total_member = sum(member_casual == "member"),
    total_casual = sum(member_casual == "casual"),
    percent_member = round(total_member / (total_member + total_casual) * 100),
    percent_casual = round(total_casual / (total_member + total_casual) * 100)
  )
View(member_casual)

In [None]:
# Reshape to long form for plotting: one row per day of the week and user
# type, with the share of rides in `percentage`
member_casual_long <- member_casual %>%
  pivot_longer(cols = c(percent_member, percent_casual),
               names_to = "type",
               values_to = "percentage")

# Stacked bar chart of the member/casual share per day of the week
ggplot(member_casual_long, aes(x = day_of_week_label, y = percentage, fill = type)) +
  geom_col() +
  # Stack the labels the same way as the bars and center each label inside
  # its own segment; unstacked labels sit at the raw percentage value and do
  # not line up with the segment they describe.
  geom_text(aes(label = percentage), size = 4,
            position = position_stack(vjust = 0.5)) +
  labs(title = "Percentage of Member and Casual Users by Day of the Week",
       x = "Day of the Week",
       y = "Percentage (%)",
       fill = "User Type") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    plot.background = element_rect(fill = "white")
  )

In [None]:
# Number of member rides and casual rides for each rideable type
rideable_type_number <- dplyr::summarise(
  dplyr::group_by(combined_data_clean, rideable_type),
  member = sum(member_casual == "member"),
  casual = sum(member_casual == "casual")
)

View(rideable_type_number)