In [None]:
# Location of the saved index page that the parsing cell below reads.
path = "F:\\Culture Data Scraping\\The Art Of Blowjob\\index.html"
# Read inside a context manager so the file handle is closed as soon as the
# content is in memory instead of lingering until garbage collection.
with open(path, 'r', encoding='utf-8') as file:
    html_content = file.read()


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Build one record per update card found in the saved index page.
# `html_content` must already hold the page source (loaded in the first cell).
soup = BeautifulSoup(html_content, 'html.parser')

data = []

for card in soup.find_all('div', class_=['update-side', 'update-middle']):
    href = card.find('a')['href']
    thumb_src = card.find('img')['src']
    heading = card.find('h3').text.strip()
    published = card.find('time')['datetime']

    # The duration is whatever follows the last '~' in the p.datetime text.
    runtime_text = card.find('p', class_='datetime').text.split('~')[-1].strip()

    # Treat the digit groups as base-60 places, least significant last,
    # so text shaped like "M:S" becomes M * 60 + S seconds.
    duration_seconds = 0
    for power, digits in enumerate(reversed(re.findall(r'\d+', runtime_text))):
        duration_seconds += int(digits) * 60 ** power

    # large_image and description are placeholders filled in by later cells.
    data.append(dict(
        link=href,
        thumbnail=thumb_src,
        title=heading,
        datetime=published,
        duration_seconds=duration_seconds,
        large_image=None,
        description=None,
    ))

df = pd.DataFrame(data)

# Display the first few rows of the DataFrame
print(df.head())

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import random

# Assuming 'df' is your existing DataFrame with 'link' and 'description' columns

# Function to fetch and parse description
def get_description(link, timeout=30):
    """Fetch an archived update page and return its description text.

    Args:
        link: URL or path of the update page; resolved against
            'https://web.archive.org' with urljoin.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The non-empty paragraph texts inside the first div with class
        'article', joined by single spaces, or '' when the page has no
        such div.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response. Raising instead of returning '' matters:
            the caller only fetches rows whose description is still null,
            so an error page stored as '' would never be fetched again.
    """
    full_link = urljoin('https://web.archive.org', link)
    response = requests.get(full_link, timeout=timeout)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.text, 'html.parser')

    article = page_soup.find('div', class_='article')
    if not article:
        return ''

    # Strip each paragraph once and drop the ones that end up empty.
    texts = (p.text.strip() for p in article.find_all('p'))
    return ' '.join(text for text in texts if text)

# Fill in the description of every row that does not have one yet
for idx, record in df.loc[df['description'].isna()].iterrows():
    title = record['title']
    try:
        df.at[idx, 'description'] = get_description(record['link'])
        print(f"Added description for {title}")

        # Add a longer, randomized delay to avoid triggering blocks
        pause = random.uniform(5, 10)
        print(f"Waiting for {pause:.2f} seconds before next request...")
        time.sleep(pause)
    except Exception as err:
        # The row keeps its null description, so running this cell again
        # selects it once more.
        print(f"Error processing {title}: {str(err)}")
        print("Waiting for 60 seconds before retrying...")
        time.sleep(60)

# Display the updated DataFrame
print(df.head())

In [None]:
# Function to convert thumbnail URL to large image URL
def get_large_image_url(thumb_url, thumb_snapshot='20170606190201', large_snapshot='20170709133354'):
    """Derive the large-image URL from a thumbnail URL.

    Args:
        thumb_url: Thumbnail URL. Anything that is not a string (None, NaN
            from a DataFrame, ...) is treated as "no thumbnail".
        thumb_snapshot: Substring to replace in the URL; defaults to the
            value that was previously hard-coded.
        large_snapshot: Replacement for ``thumb_snapshot``.

    Returns:
        The URL with '-th.jpg' replaced by '-lg.jpg' and ``thumb_snapshot``
        replaced by ``large_snapshot``, or None when ``thumb_url`` is not a
        string or does not contain '-th.jpg'.
    """
    # isinstance guards against NaN: it is truthy, so a plain truthiness
    # check would let it reach the `in` test and raise TypeError.
    if not isinstance(thumb_url, str) or '-th.jpg' not in thumb_url:
        return None
    return thumb_url.replace('-th.jpg', '-lg.jpg').replace(thumb_snapshot, large_snapshot)

# Derive the large-image URL only for rows that do not have one yet
mask = df['large_image'].isna()
df.loc[mask, 'large_image'] = df.loc[mask, 'thumbnail'].map(get_large_image_url)


In [None]:
from deltalake import DeltaTable, write_deltalake
import pyarrow as pa

# Target directory of the Delta table on the local disk
local_path = "F:\\Culture Data Scraping\\The Art Of Blowjob\\delta_table"

# Delta Lake takes Arrow data, so convert the pandas frame first
arrow_table = pa.Table.from_pandas(df)

# mode="overwrite" replaces all data; mode="append" would add to what is there
write_deltalake(local_path, arrow_table, mode="overwrite")


In [None]:
import re

# Function to clean description
def clean_description(desc):
    """Strip a leading "Updated on <weekday>, <month> <day>, <year> " prefix.

    Non-string values are returned unchanged; strings are returned with the
    prefix (if present) removed and surrounding whitespace trimmed.
    """
    if not isinstance(desc, str):
        return desc
    prefix = re.compile(r'^Updated on [A-Za-z]+, [A-Za-z]+ \d+, \d+ ')
    return prefix.sub('', desc).strip()

# Store the cleaned text in its own column; 'description' stays untouched
df['clean_description'] = df['description'].map(clean_description)


In [None]:
# Open the Delta table at local_path and print one line per history entry
delta_table = DeltaTable(local_path)
history = delta_table.history()

print("All versions of the Delta table:")
for entry in history:
    version, timestamp, operation = entry['version'], entry['timestamp'], entry['operation']
    print(f"Version: {version}, Timestamp: {timestamp}, Operation: {operation}")


In [None]:
import requests
import os
import time
import re

def download_image(url, folder, filename, timeout=30):
    """Download ``url`` to ``folder/filename`` unless that file already exists.

    Args:
        url: Address of the image.
        folder: Destination directory; created if missing.
        filename: Name of the file inside ``folder``.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        None. Progress and errors are reported with print(); exceptions
        raised while downloading are caught and printed, not re-raised.
    """
    file_path = os.path.join(folder, filename)
    if os.path.exists(file_path):
        print(f"File already exists: {filename}")
        return

    # Stream into a temporary sibling file and move it into place only once
    # the download is complete. Otherwise an interrupted transfer leaves a
    # truncated file that the exists-check above would skip on every rerun.
    partial_path = file_path + '.part'
    try:
        # The context manager releases the connection even on failure.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            os.makedirs(folder, exist_ok=True)

            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        os.replace(partial_path, file_path)
        print(f"Downloaded: {filename}")
    except Exception as e:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Error downloading {filename}: {str(e)}")

def extract_date_from_url(url):
    """Return the first YYYY-MM-DD shaped substring of ``url``.

    Falls back to the literal 'unknown_date' when the URL contains none.
    """
    found = re.compile(r'\d{4}-\d{2}-\d{2}').search(url)
    if found is None:
        return 'unknown_date'
    return found.group(0)

# Get the directory of the HTML file.
# Derive it from `path` (the index.html location), not from `local_path`:
# `local_path` is the Delta table directory and a later cell rebinds it to a
# different site's folder, so after running that cell the images would be
# saved in the wrong place.
html_dir = os.path.dirname(path)

# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    # Extract date from URL; it is used as the file name stem
    date = extract_date_from_url(row['thumbnail'])

    # Download thumbnail
    download_image(f"https://web.archive.org{row['thumbnail']}", os.path.join(html_dir, 'thumbnails'), f"{date}_thumb.jpg")
    time.sleep(1)  # Add delay similar to reading descriptions

    # Rows without a derived large-image URL hold a null; formatting that
    # into the URL would request "https://web.archive.orgNone".
    if pd.isna(row['large_image']):
        print(f"No large image URL for {date}, skipping")
        continue

    # Download large image
    download_image(f"https://web.archive.org{row['large_image']}", os.path.join(html_dir, 'large_images'), f"{date}_large.jpg")
    time.sleep(1)  # Add delay similar to reading descriptions

print("All images processed successfully.")


# Chloe Morgane

In [None]:
# Load and parse the saved updates page
chloe_local_path = "F:\\Culture Data Scraping\\Chloe Morgane\\updates.html"
with open(chloe_local_path, 'r', encoding='utf-8') as file:
    chloe_html_content = file.read()

chloe_soup = BeautifulSoup(chloe_html_content, 'html.parser')

# Every update is an article element with class 'updates-item'
chloe_update_items = chloe_soup.find_all('article', class_='updates-item')

# One list per output column, filled in parallel
chloe_titles, chloe_links, chloe_thumbnail_urls = [], [], []
chloe_categories, chloe_dates = [], []

for item in chloe_update_items:
    # Title text and link both live in the h3.updates-name element
    name_tag = item.find('h3', class_='updates-name')
    chloe_title = name_tag.text.strip()
    chloe_titles.append(chloe_title)

    chloe_link = name_tag.find('a')['href']
    chloe_links.append(chloe_link)

    chloe_thumbnail_url = item.find('img', class_='updates-poster')['src']
    chloe_thumbnail_urls.append(chloe_thumbnail_url)

    # Links containing '/photos/' are photo sets; everything else is a video
    if '/photos/' in chloe_link:
        chloe_category = 'photos'
    else:
        chloe_category = 'video'
    chloe_categories.append(chloe_category)

    # The date is the first YYYY-MM-DD found in the thumbnail URL, else None
    date_match = re.search(r'(\d{4}-\d{2}-\d{2})', chloe_thumbnail_url)
    chloe_date = date_match.group(1) if date_match else None
    chloe_dates.append(chloe_date)

# Assemble the columns into a DataFrame
chloe_df = pd.DataFrame({
    'title': chloe_titles,
    'link': chloe_links,
    'thumbnail_url': chloe_thumbnail_urls,
    'category': chloe_categories,
    'date': chloe_dates,
})


In [None]:
# Video rows get their link with '/video/' swapped for '/video-stream/'; all other rows get None
chloe_df['video_stream_url'] = chloe_df.apply(
    lambda rec: None if rec['category'] != 'video' else rec['link'].replace('/video/', '/video-stream/'),
    axis=1
)



In [None]:
import os
from dotenv import load_dotenv
import requests
from urllib.parse import urlparse
import re

# Pull settings from the .env file into the process environment
load_dotenv()

# Site credentials; either is None when the variable is not set
username, password = os.environ.get('CHLOE_USERNAME'), os.environ.get('CHLOE_PASSWORD')

# Downloads are stored next to the saved updates page
base_dir = os.path.dirname(os.path.abspath(chloe_local_path))

def download_video(row, timeout=60):
    """Download the video stream of one release into ``base_dir``.

    Args:
        row: Mapping with 'category', 'video_stream_url', 'link', 'title'
            and 'date' entries (a chloe_df row).
        timeout: Seconds to wait on the server before giving up. With a
            streamed response, requests applies this to connecting and to
            each read, not to the download as a whole.

    Returns:
        None. Rows that are not videos, have no stream URL, or whose target
        file already exists are skipped. Network errors propagate.
    """
    if row['category'] != 'video' or pd.isna(row['video_stream_url']):
        return

    # The ID is the last path segment of the release link
    video_id = urlparse(row['link']).path.split('/')[-1]

    # Keep only word characters, '-', '_', '.' and spaces, then turn spaces into underscores
    clean_title = re.sub(r'[^\w\-_\. ]', '', row['title']).replace(' ', '_')

    filename = f"{row['date']}-{video_id}-{clean_title}.mp4"
    full_path = os.path.join(base_dir, filename)

    if os.path.exists(full_path):
        print(f"File already exists: {full_path}")
        return

    # Stream into a temporary sibling file and move it into place only when
    # complete. Otherwise an interrupted transfer leaves a truncated .mp4
    # that the exists-check above would skip on every rerun.
    partial_path = full_path + '.part'
    try:
        # The context manager releases the connection on every exit path.
        with requests.get(row['video_stream_url'], auth=(username, password), stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download: {full_path}")
                return
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        os.replace(partial_path, full_path)
        print(f"Downloaded: {full_path}")
    finally:
        # Only present when the transfer did not finish.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Fetch every video release; download_video itself skips files that already exist
for _, row in chloe_df.loc[chloe_df['category'] == 'video'].iterrows():
    download_video(row)


In [None]:
def download_poster(row):
    """Save the poster image of one release as '<date>-<id>.jpg' in ``base_dir``.

    The poster URL is built from the row's date only; the ID (last path
    segment of the link) is used for the local file name. Nothing is
    downloaded when the target file already exists.
    """
    date = row['date']
    video_id = urlparse(row['link']).path.split('/')[-1]

    full_path = os.path.join(base_dir, f"{date}-{video_id}.jpg")
    if os.path.exists(full_path):
        print(f"Poster already exists: {full_path}")
        return

    response = requests.get(f"https://chloemorgane.com/posters/{date}-lg.jpg", auth=(username, password))

    # Guard clause: anything other than HTTP 200 is reported and skipped
    if response.status_code != 200:
        print(f"Failed to download poster: {full_path}")
        return

    with open(full_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded poster: {full_path}")

# Fetch the poster of every release, videos and photo sets alike
for _, row in chloe_df.iterrows():
    download_poster(row)


In [None]:
def download_html(row):
    """Save the release page of one row as '<date>-<id>.html' in ``base_dir``.

    The ID is the last path segment of the row's link. Nothing is
    downloaded when the target file already exists.
    """
    date = row['date']
    video_id = urlparse(row['link']).path.split('/')[-1]

    full_path = os.path.join(base_dir, f"{date}-{video_id}.html")
    if os.path.exists(full_path):
        print(f"HTML file already exists: {full_path}")
        return

    response = requests.get(row['link'], auth=(username, password))

    # Guard clause: anything other than HTTP 200 is reported and skipped
    if response.status_code != 200:
        print(f"Failed to download HTML: {full_path}")
        return

    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(response.text)
    print(f"Downloaded HTML: {full_path}")

# Save the release page of every row
for _, row in chloe_df.iterrows():
    download_html(row)


In [None]:
import os
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

def download_full_res_images(row):
    """Download the linked image of every photo item on a saved release page.

    Reads '<date>-<id>.html' from ``base_dir``, finds each article element
    with class 'photos-item', and saves the target of its first link as
    '<date>-<id>-NNN.jpg', where NNN is the item's 1-based position on the
    page. Existing images are skipped; a missing HTML file is reported and
    nothing is downloaded.
    """
    date = row['date']
    video_id = urlparse(row['link']).path.split('/')[-1]

    html_full_path = os.path.join(base_dir, f"{date}-{video_id}.html")
    if not os.path.exists(html_full_path):
        print(f"HTML file not found: {html_full_path}")
        return

    with open(html_full_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    for index, item in enumerate(soup.find_all('article', class_='photos-item'), start=1):
        link = item.find('a')
        # Items without a usable link are skipped but still consume a number
        if not link or 'href' not in link.attrs:
            continue

        # Relative hrefs are resolved against the release URL
        full_res_url = urljoin(row['link'], link['href'])
        image_full_path = os.path.join(base_dir, f"{date}-{video_id}-{index:03d}.jpg")

        if os.path.exists(image_full_path):
            print(f"Image already exists: {image_full_path}")
            continue

        response = requests.get(full_res_url, auth=(username, password))
        if response.status_code != 200:
            print(f"Failed to download full resolution image: {full_res_url}")
            continue

        with open(image_full_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded full resolution image: {image_full_path}")

# Process the photo sets only, in ascending date order
for _, row in chloe_df.loc[chloe_df['category'] == 'photos'].sort_values('date').iterrows():
    download_full_res_images(row)


In [None]:
def extract_description(html_full_path):
    """Extract the description text from a saved release page.

    Looks for paragraphs inside the first div with class 'card'; if that
    yields nothing, tries the first section with class 'sample'.

    Args:
        html_full_path: Path of the saved HTML file.

    Returns:
        The stripped paragraph texts joined by single spaces, or None when
        the file does not exist or neither container holds a paragraph.
    """
    # A release whose page was never saved must not abort the whole
    # DataFrame.apply with FileNotFoundError; report it and carry on.
    if not os.path.isfile(html_full_path):
        print(f"HTML file not found: {html_full_path}")
        return None

    with open(html_full_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    # The two known page layouts, tried in order of preference
    for tag_name, css_class in (('div', 'card'), ('section', 'sample')):
        container = soup.find(tag_name, class_=css_class)
        if container:
            p_elements = container.find_all('p')
            if p_elements:
                return ' '.join(p.text.strip() for p in p_elements)

    return None

# Function to get the HTML file path
def get_html_file_path(row):
    """Return the path of the saved release page for ``row``.

    The file name is '<date>-<id>.html', where the ID is the last path
    segment of the row's link; the directory is fixed in this function.
    """
    date = row['date']
    video_id = urlparse(row['link']).path.split('/')[-1]
    return os.path.join('F:\\Culture Data Scraping\\Chloe Morgane', f"{date}-{video_id}.html")

# Read each row's saved page and store the extracted text in 'description'
chloe_df['description'] = chloe_df.apply(
    lambda rec: extract_description(get_html_file_path(rec)),
    axis=1,
)


In [None]:

from deltalake import DeltaTable, write_deltalake
import pyarrow as pa

# Convert the Chloe Morgane DataFrame to a PyArrow Table.
# This must be chloe_df: `df` is the frame of the other site scraped at the
# top of the notebook, and writing it here would store the wrong dataset in
# the Chloe Morgane table.
arrow_table = pa.Table.from_pandas(chloe_df)

# Save as Delta table to a local directory
local_path = "F:\\Culture Data Scraping\\Chloe Morgane\\delta_table"

# Write the data to the Delta table
# Use mode="overwrite" to replace all data, or mode="append" to add new data
write_deltalake(local_path, arrow_table, mode="overwrite")


In [None]:
import pandas as pd
import dotenv
import os
from libraries.client_stashapp import get_stashapp_client
from libraries.StashDbClient import StashDbClient

# Pull settings from the .env file, then create the Stash and StashDB clients
dotenv.load_dotenv()

stash = get_stashapp_client()

stashbox_client = StashDbClient(os.environ.get("STASHDB_ENDPOINT"), os.environ.get("STASHDB_API_KEY"))

In [None]:
import re

# Function to generate the video filename based on the row data
def get_video_filename(row):
    """Return '<date>-<id>-<title>.mp4' for video rows and None for other rows.

    The ID is the last path segment of the row's link. The title keeps only
    word characters, '-', '_', '.' and spaces, with spaces then replaced by
    underscores.
    """
    if row['category'] != 'video':
        return None

    date = row['date']
    video_id = urlparse(row['link']).path.split('/')[-1]
    title = re.sub(r'[^\w\-_\. ]', '', row['title']).replace(' ', '_')
    return f"{date}-{video_id}-{title}.mp4"

# Base directory for video files
video_base_dir = 'F:\\Culture Data Scraping\\Chloe Morgane'

# File name, full path, and on-disk presence for each row; rows without a
# file name (non-video rows) get None in all three columns
chloe_df['video_filename'] = chloe_df.apply(get_video_filename, axis=1)
chloe_df['video_filepath'] = chloe_df['video_filename'].apply(
    lambda name: None if not name else os.path.join(video_base_dir, name)
)
chloe_df['video_exists'] = chloe_df['video_filepath'].apply(
    lambda candidate: None if not candidate else os.path.isfile(candidate)
)

# Print summary
video_rows = chloe_df[chloe_df['category'] == 'video']
total_videos = len(video_rows)
existing_videos = video_rows['video_exists'].sum()
print(f"Total videos: {total_videos}")
print(f"Existing videos: {existing_videos}")
print(f"Missing videos: {total_videos - existing_videos}")


In [None]:
# Create the column up front with None. Creating it lazily through `.at`
# leaves NaN in every unmatched row (NaN is truthy, so later
# `if row['scene_id']` checks pass for it) and leaves the column missing
# altogether when no scene matches.
if 'scene_id' not in chloe_df.columns:
    chloe_df['scene_id'] = None

# Look up each video by file name in Stash and record the ID of a unique match
for idx, row in chloe_df.iterrows():
    if row['category'] != 'video':
        continue

    scenes = stash.find_scenes({ "path": { "modifier": "INCLUDES", "value": row['video_filename'] } })
    if len(scenes) == 0:
        print(f"Missing video: {row['video_filepath']}")
    elif len(scenes) > 1:
        print(f"Duplicate video: {row['video_filepath']}")
    else:
        scene = scenes[0]
        print(f"Scene ID: {scene['id']}")
        print(f"Scene Filename: {scene['title']}")
        print(f"Scene Date: {scene['date']}")
        print(f"Scene Rating: {scene['rating100']}")
        print(f"Scene Tags: {scene['tags']}")
        print(f"Scene Galleries: {scene['galleries']}")
        chloe_df.at[idx, 'scene_id'] = scene['id']
            

In [None]:
# Store the release ID (last segment of the link) as the scene code in Stash
for _, row in chloe_df.iterrows():
    if row['category'] != 'video':
        continue

    # Rows without a matched scene may hold None or NaN, or the column may
    # not exist at all. NaN is truthy, so a plain truthiness test would send
    # an update with a NaN id.
    scene_id = row.get('scene_id')
    if pd.isna(scene_id) or not scene_id:
        continue

    video_id = row['link'].split('/')[-1]
    stash.update_scene({
        "id": scene_id,
        "code": video_id
    })


In [None]:
import os
import base64

# Define the directory path
directory = r"F:\Culture Data Scraping\Chloe Morgane"

# Upload each matched video's saved poster to Stash as the scene cover
for _, row in chloe_df.iterrows():
    if row['category'] != 'video':
        continue

    # Rows without a matched scene may hold None or NaN, or the column may
    # not exist at all. NaN is truthy, so a plain truthiness test would send
    # an update with a NaN id.
    scene_id = row.get('scene_id')
    if pd.isna(scene_id) or not scene_id:
        continue

    # The poster file name is '<date>-<id>.jpg', with the ID parsed from the link
    video_id = row['link'].split('/')[-1]
    filename = f"{row['date']}-{video_id}.jpg"

    # Construct the full file path
    file_path = os.path.join(directory, filename)

    if not os.path.exists(file_path):
        print(f"Cover image not found for scene: {row['title']}")
        continue

    # Read the image file and convert to base64
    with open(file_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode('utf-8')

    # Create the data URL
    data_url = f"data:image/jpeg;base64,{encoded_string}"

    # Update the scene with the base64 data URL as cover_image
    stash.update_scene({
        "id": scene_id,
        "cover_image": data_url
    })
    print(f"Updated cover image for scene: {row['title']}")


In [None]:
# Fetch the scenes whose studio filter includes ID "106" and tabulate them
chloe_morgane_scenes = stash.find_scenes({
    "studios": {
        "value": ["106"],
        "modifier": "INCLUDES",
    },
})
df_chloe_morgane_scenes = pd.DataFrame(chloe_morgane_scenes)


In [None]:
# Group the scenes by their 'date' value; scenes without a date are ignored
scenes_by_date = {}
for scene in chloe_morgane_scenes:
    date = scene.get('date')
    if not date:
        continue
    scenes_by_date.setdefault(date, []).append(scene)

# Same groups, with the dates in ascending order
sorted_scenes_by_date = {date: scenes_by_date[date] for date in sorted(scenes_by_date)}

# A date shared by more than one scene counts as a duplicate
duplicates = {date: scenes for date, scenes in sorted_scenes_by_date.items() if len(scenes) > 1}

# Print the results
if not duplicates:
    print("No duplicate scenes found.")
else:
    print("Duplicate scenes found:")
    for date, scenes in duplicates.items():
        print(f"\nDate: {date}")
        for scene in scenes:
            print(f"  - Title: {scene.get('title', 'N/A')}")
            print(f"    ID: {scene.get('id', 'N/A')}")
