# Dataset Scraping and Cleaning from the source
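The dataset file list is collected by requesting the S3 bucket's XML listing with `requests` and parsing it with `xml.etree.ElementTree` (BeautifulSoup is installed and imported, but the listing is XML rather than HTML, so the cells below do not use it).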
---
---
Dataset source: [Divvy trip data (S3 bucket index)](https://divvy-tripdata.s3.amazonaws.com/index.html)


# Import Library
---

In [2]:
!pip install requests beautifulsoup4 pandas --quiet
print("Instalasi library yang dibutuhkan telah selesai.")

Instalasi library yang dibutuhkan telah selesai.


In [3]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import zipfile
import pandas as pd
import glob

# Configuration
---

In [4]:
# Name of the CSV file the export cell writes the final DataFrame to.
# (Plain string literal: the previous f-string prefix had no placeholders.)
CSV_FILE_NAME = "Fixed_Dataset.csv"

# Retrieve the List of Zip File URLs from the S3 Bucket
---

In [5]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Root URL of the public S3 bucket. A plain GET on it returns the bucket's
# object listing as XML, and object keys are appended to it to build links.
url = "https://divvy-tripdata.s3.amazonaws.com/"

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block the notebook indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

print(f"Requesting data from S3 API at: {url}")

# Namespace is required to find elements in S3's XML
namespace = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'}

file_links = []
marker = None  # key to resume the listing after; None requests the first page

while True:
    # Make the same request the JavaScript does (plus `marker` on later pages)
    response = requests.get(
        url,
        params={'marker': marker} if marker else None,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()  # Ensure the request was successful

    # The response is XML, not HTML
    xml_data = response.content
    root = ET.fromstring(xml_data)

    # Each 'Contents' element describes one object (file) in the bucket.
    # findtext() with a default avoids an AttributeError on a missing/empty Key.
    keys = [
        content.findtext('s3:Key', default='', namespaces=namespace)
        for content in root.findall('s3:Contents', namespace)
    ]

    # We only want links to .zip files
    file_links.extend(url + key for key in keys if key.endswith('.zip'))

    # S3 returns the listing in pages and sets IsTruncated to "true" while more
    # keys remain; the last key of a page is the marker for the next request.
    is_truncated = root.findtext('s3:IsTruncated', default='false', namespaces=namespace)
    if is_truncated.strip().lower() != 'true' or not keys or not keys[-1]:
        break
    marker = keys[-1]

print(f"\nSuccessfully found {len(file_links)} zip file links.")

# Display the first 10 links
for link in file_links[:10]:
    print(link)

# You can easily put this into a pandas DataFrame
df = pd.DataFrame(file_links, columns=['download_url'])
print("\n--- DataFrame Head ---")
print(df.head())

Requesting data from S3 API at: https://divvy-tripdata.s3.amazonaws.com/

Successfully found 81 zip file links.
https://divvy-tripdata.s3.amazonaws.com/202004-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202005-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202006-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202007-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202008-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202009-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202010-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202011-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202012-divvy-tripdata.zip
https://divvy-tripdata.s3.amazonaws.com/202101-divvy-tripdata.zip

--- DataFrame Head ---
                                        download_url
0  https://divvy-tripdata.s3.amazonaws.com/202004...
1  https://divvy-tripdata.s3.amazonaws.com/202005...
2  https://divvy-tripdata.s3.amazonaws.com/20

# Downloading and Extracting the Last Zip File in the Download URL List
---

In [6]:
import os
import requests
import zipfile
import pandas as pd
import glob
import shutil

# --- Setup ---
# Scratch directory for the downloaded archive and the CSV extracted from it;
# it is removed again at the end of this cell.
temp_dir = 'divvy_data_temp'
os.makedirs(temp_dir, exist_ok=True)

# Seconds to wait for the server before giving up, so a stalled download
# cannot hang the notebook indefinitely.
DOWNLOAD_TIMEOUT = 60

# List to hold each individual DataFrame
all_dataframes = []

# Only the last entry of 'file_links' (built by the listing cell) is processed.
# Slicing instead of indexing yields an empty list, not an IndexError, when no
# links were found.
# NOTE(review): "last" means last in listing order; the recorded run below shows
# that this is Divvy_Trips_2020_Q1.zip, not the most recent monthly file.
download_urls = file_links[-1:]

print(f"✅ Preparing to download and process {len(download_urls)} zip files.")

# --- Main Processing Loop ---
# The loop variable is deliberately not called 'url', so the bucket base URL
# defined by the listing cell is not overwritten.
for download_url in download_urls:
    zip_filename = os.path.basename(download_url)
    zip_filepath = os.path.join(temp_dir, zip_filename)
    # Reset on every iteration so the cleanup in 'finally' never acts on a path
    # left over from an earlier archive or an earlier run of this cell.
    csv_filepath = None

    try:
        # 1. DOWNLOAD the single zip file (streamed to disk in 8 KiB chunks).
        #    The context manager releases the connection even if writing fails.
        print(f"Downloading: {zip_filename}...")
        with requests.get(download_url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            response.raise_for_status()
            with open(zip_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # 2. EXTRACT the specific CSV from the zip
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            # Take the first CSV member, ignoring macOS resource-fork entries
            csv_filename = next(
                (
                    name
                    for name in zip_ref.namelist()
                    if name.endswith('.csv') and not name.startswith('__MACOSX')
                ),
                None,
            )

            if not csv_filename:
                print(f"⚠️ Warning: No CSV file found in {zip_filename}. Skipping.")
                continue

            # Extract only that one CSV file; extract() returns the path it
            # actually wrote, which is what gets read and later deleted.
            csv_filepath = zip_ref.extract(csv_filename, path=temp_dir)

        # 3. READ the single CSV and add its DataFrame to our list
        print(f"   -- Reading {csv_filename}...")
        df = pd.read_csv(csv_filepath)
        all_dataframes.append(df)
        print(f"   -- ✅ Appended data from {csv_filename}")

    except Exception as e:
        # Best effort: report the failure and move on to the next archive
        print(f"❌ Error processing {download_url}: {e}")

    finally:
        # 4. CLEAN UP the used files inside the loop to ensure a clean state
        if csv_filepath and os.path.exists(csv_filepath):
            os.remove(csv_filepath)
        if os.path.exists(zip_filepath):
            os.remove(zip_filepath)

# --- Final Combination ---
if all_dataframes:
    print("\n⏳ Combining all DataFrames into a final result...")
    final_combined_df = pd.concat(all_dataframes, ignore_index=True)

    print("\n--- ✅ Final Combined DataFrame ---")
    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() (that would append a stray "None" line).
    final_combined_df.info()
    print("\n--- Head ---")
    print(final_combined_df.head())
    print("\n--- Tail ---")
    print(final_combined_df.tail())
else:
    print("❌ No dataframes were successfully processed.")

# --- Final Cleanup ---
# Remove the temporary directory
if os.path.exists(temp_dir):
    shutil.rmtree(temp_dir)
print(f"\n🗑️ Cleaned up temporary directory: {temp_dir}")

✅ Preparing to download and process 1 zip files.
Downloading: Divvy_Trips_2020_Q1.zip...
   -- Reading Divvy_Trips_2020_Q1.csv...
   -- ✅ Appended data from Divvy_Trips_2020_Q1.csv

⏳ Combining all DataFrames into a final result...

--- ✅ Final Combined DataFrame ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426887 entries, 0 to 426886
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             426887 non-null  object 
 1   rideable_type       426887 non-null  object 
 2   started_at          426887 non-null  object 
 3   ended_at            426887 non-null  object 
 4   start_station_name  426887 non-null  object 
 5   start_station_id    426887 non-null  int64  
 6   end_station_name    426886 non-null  object 
 7   end_station_id      426886 non-null  float64
 8   start_lat           426887 non-null  float64
 9   start_lng           426887 non-null  float64
 10  end_lat         

## Testing Extracted Data
---

In [7]:
# Preview the first five rows of the combined trips table (head() defaults to 5)
final_combined_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239,Clark St & Leland Ave,326.0,41.9665,-87.6884,41.9671,-87.6674,member
1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234,Southport Ave & Irving Park Rd,318.0,41.9616,-87.666,41.9542,-87.6644,member
2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,296,Wilton Ave & Belmont Ave,117.0,41.9401,-87.6455,41.9402,-87.653,member
3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,51,Fairbanks Ct & Grand Ave,24.0,41.8846,-87.6319,41.8918,-87.6206,member
4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,66,Wells St & Hubbard St,212.0,41.8856,-87.6418,41.8899,-87.6343,member


## Checking the Shape (Rows and Columns) of the Extracted Dataset
---

In [8]:
# (number of rows, number of columns) of the combined trips table
(len(final_combined_df.index), len(final_combined_df.columns))

(426887, 13)

# Exporting and Downloading the Extracted Data
---

In [9]:
# google.colab only exists inside a Colab runtime. Importing it defensively
# lets this cell still write the CSV when the notebook runs somewhere else
# (e.g. a local Jupyter server) instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    files = None

# Check if the dataframe exists and is not empty before exporting
if 'final_combined_df' in globals() and not final_combined_df.empty:
    # Define the name for the downloadable CSV file
    output_filename = CSV_FILE_NAME  # Using the predefined filename

    # Export the DataFrame to a CSV file in the current working directory
    final_combined_df.to_csv(output_filename, index=False)

    if files is not None:
        # Running in Colab: make the file available for download in the browser
        print(f"\n✅ Exported DataFrame to '{output_filename}'. Downloading...")
        files.download(output_filename)
    else:
        print(
            f"\n✅ Exported DataFrame to '{output_filename}' "
            "(not running in Colab, so no browser download was started)."
        )

else:
    print("\n⚠️ final_combined_df does not exist or is empty. Cannot export.")



✅ Exported DataFrame to 'Fixed_Dataset.csv'. Downloading...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# prompt: count the number of rows in the CSV

# len() of a DataFrame is its row count, i.e. the number of data points
num_data_points = len(final_combined_df)

print(f"Total number of data points (rows) in the final combined DataFrame: {num_data_points}")

Total number of data points (rows) in the final combined DataFrame: 426887
