In [None]:
# Create Stash app client

import pandas as pd
import dotenv
import os

from libraries.client_stashapp import get_stashapp_client

# Load variables from a local .env file into the process environment before building the client.
dotenv.load_dotenv()

# NOTE(review): "AURAL_" is presumably an environment-variable prefix that selects which
# Stash instance the project helper connects to — confirm against libraries.client_stashapp.
stash = get_stashapp_client("AURAL_")

In [None]:
import os
import pandas as pd

from dotenv import load_dotenv

load_dotenv()

# Directory containing JSON files
json_dir = os.getenv("JSON_SIDECARS_PATH")
if not json_dir:
    # os.listdir(None) silently lists the current working directory, so an unset
    # variable would scan the wrong folder instead of failing.
    raise RuntimeError("JSON_SIDECARS_PATH is not set; cannot locate the JSON files to import")

# List to store DataFrames
dataframes = []

# Number of entries read across all files, before de-duplication
total_entries = 0

# Loop through each JSON file in the directory. sorted() pins the load order, which
# decides which copy of a duplicated 'Post ID' survives drop_duplicates below.
for file_name in sorted(os.listdir(json_dir)):
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(json_dir, file_name)

    # Load the JSON file
    imported_json = pd.read_json(file_path)

    # Assuming the entries of interest are in a column named 'entries'
    entries_column = imported_json['entries']
    entries_df = pd.DataFrame(
        entries_column.tolist(),
        columns=['Post ID', 'Subreddit', 'Author', 'Content Type', 'Title', 'Timestamp', 'Upvotes', 'Length', 'Submitted By'],
    )

    # Convert the Unix timestamp (seconds) to a formatted YYYY-MM-DD date, in place
    entries_df['Timestamp'] = pd.to_datetime(entries_df['Timestamp'], unit='s').dt.strftime('%Y-%m-%d')

    # Drop unwanted columns
    entries_df = entries_df.drop(columns=['Length', 'Submitted By'])

    # Append the processed DataFrame to the list
    dataframes.append(entries_df)
    total_entries += entries_df.shape[0]

if not dataframes:
    # pd.concat([]) would raise an opaque "No objects to concatenate" error
    raise FileNotFoundError(f"No .json files found in {json_dir}")

# Concatenate all the DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Remove duplicates by 'Post ID' (the first occurrence is kept)
deduplicated_df = combined_df.drop_duplicates(subset=['Post ID'])

print(f"Deduplicated DataFrame shape: {deduplicated_df.shape[0]}")

In [None]:
# Untitled scenes matching the "AprilW9" search term
scenes = stash.find_scenes(
    { 'title': { 'value': '', 'modifier': 'IS_NULL' }},
    q = "AprilW9",
)
df_scenes = pd.DataFrame(scenes)

# Basename of each scene's first file, with the '.mp4' text and apostrophes removed
df_scenes['basename'] = [
    files[0]['basename'].replace('.mp4', '').replace('\'', '')
    for files in df_scenes['files']
]

# Basenames are split on the first ' - ': the left part is the author, the right part the title
df_scenes['author'] = df_scenes['basename'].map(lambda name: name.split(' - ', 1)[0])
df_scenes['reddit_title'] = df_scenes['basename'].map(lambda name: name.split(' - ', 1)[1])

In [None]:
import pandas as pd
from thefuzz import fuzz

# Function to find the best matching title and return relevant data
def find_best_match(row, deduplicated_df):
    """Pair one scene row with the most similar post title by the same author.

    Args:
        row: Mapping-like scene record providing 'id', 'author' and 'reddit_title'.
        deduplicated_df: DataFrame with 'Author', 'Title', 'Post ID' and
            'Subreddit' columns. It is only read, never modified.

    Returns:
        A positional pd.Series of seven values: the scene's id, author and
        reddit_title, followed by the matched Post ID, Subreddit, Author and
        Title. The last four are None when no row has the scene's author.
    """
    # Only posts by the same author are candidates
    author_matches = deduplicated_df[deduplicated_df['Author'] == row['author']]

    if author_matches.empty:
        # If no matching author, return None values for deduplicated_df columns
        return pd.Series([
            row['id'],
            row['author'],
            row['reddit_title'],
            None,  # Post ID
            None,  # Subreddit
            None,  # Author
            None   # Title
        ])

    # Score into a standalone Series instead of adding a column to the filtered
    # slice: assigning onto the slice is chained assignment (SettingWithCopyWarning).
    similarity = author_matches['Title'].apply(
        lambda title: fuzz.token_set_ratio(row['reddit_title'], title)
    )

    # similarity shares author_matches' index, so idxmax() is a valid row label
    best_match = author_matches.loc[similarity.idxmax()]

    # Return the relevant columns from df_scenes and deduplicated_df
    return pd.Series([
        row['id'],
        row['author'],
        row['reddit_title'],
        best_match['Post ID'],
        best_match['Subreddit'],
        best_match['Author'],
        best_match['Title'],
    ])

# Run the matcher once per scene row; deduplicated_df is forwarded as its second argument
matched_df = df_scenes.apply(find_best_match, axis=1, args=(deduplicated_df,))

# The matcher returns positional values, so label the seven result columns here
matched_df.columns = [
    'id',
    'author',
    'reddit_title',
    'Post ID',
    'Subreddit',
    'Author',
    'Title',
]

# Reorder so the two titles sit side by side for visual comparison
matched_df = matched_df.loc[
    :, ['id', 'author', 'Post ID', 'reddit_title', 'Title', 'Subreddit', 'Author']
]

# Preview the first rows of the result
print(matched_df.head())


In [None]:
# Keep every match except the three hard-coded scene ids listed here
manually_matched_df = matched_df.loc[
    ~matched_df['id'].isin(["120", "123", "148"])
]

In [None]:
# Store each matched Post ID as the scene's code.
# Rows without a match carry a None/NaN 'Post ID' (find_best_match returns None when the
# author is unknown); they are skipped so no null code is sent to Stash.
for index, scene in manually_matched_df.dropna(subset=['Post ID']).iterrows():
    stash.update_scene({
        'id': scene['id'],
        'title': None,
        'code': scene['Post ID'],
    })

In [None]:
# Boolean mask, one entry per scene: True when that scene's author occurs in deduplicated_df
all_authors_in_deduplicated = df_scenes['author'].isin(deduplicated_df['Author'])

# Authors of the scenes for which the mask is False
missing_authors = df_scenes.loc[~all_authors_in_deduplicated, 'author']

# Report the outcome
if not missing_authors.empty:
    print("The following authors in df_scenes are not found in deduplicated_df:")
    print(missing_authors.unique())
else:
    print("All authors in df_scenes are found in deduplicated_df.")


In [None]:
import os
import pandas as pd
import json

# Assuming 'scenes' is your list of scene dictionaries
filtered_scenes = []

for scene in scenes:
    # Only scenes with exactly one file are considered
    if len(scene['files']) != 1:
        continue

    file_info = scene['files'][0]
    file_path = file_info['path']

    # Construct the expected JSON sidecar path: same path, extension swapped for .json
    json_sidecar_path = os.path.splitext(file_path)[0] + '.json'

    # Scenes without a sidecar file are left out
    if not os.path.exists(json_sidecar_path):
        continue

    # Decode explicitly as UTF-8 rather than the platform's default encoding
    with open(json_sidecar_path, 'r', encoding='utf-8') as json_file:
        sidecar_data = json.load(json_file)

    # `or []` also covers an explicit "urls": null in the sidecar
    urls = sidecar_data.get('urls') or []
    cleaned_urls = [url.replace('old.reddit.com', 'www.reddit.com') for url in urls]

    title = sidecar_data.get('title', '')
    author = sidecar_data.get('author', '')

    # Build an enriched copy instead of mutating the shared dict in `scenes`,
    # so re-running earlier cells still sees the scenes as returned by Stash
    filtered_scenes.append({
        **scene,
        'sidecar_urls': cleaned_urls,
        'sidecar_title': title,
        'sidecar_author': author,
    })

# Convert the filtered scenes to a DataFrame
df_filtered_scenes = pd.DataFrame(filtered_scenes)

# Output the filtered scenes with sidecar information
df_filtered_scenes


In [None]:
import praw

# Make sure the REDDIT_CLIENT_* variables from .env are present in the environment
dotenv.load_dotenv()

# Reddit API client; every credential is read from the environment, none are hard-coded
reddit = praw.Reddit(
    client_id = os.getenv("REDDIT_CLIENT_ID"),
    client_secret = os.getenv("REDDIT_CLIENT_SECRET"),
    username = os.getenv("REDDIT_CLIENT_USERNAME"),
    password = os.getenv("REDDIT_CLIENT_PASSWORD"),
    user_agent = os.getenv("REDDIT_CLIENT_USER_AGENT"),
)

In [None]:
# Copy sidecar title, URLs and performer onto each scene whose author maps to one performer
for index, scene in df_filtered_scenes.iterrows():
    stash_performers = stash.find_performers(
        { 'name': { 'value': scene['sidecar_author'], 'modifier': 'EQUALS' } }
    )

    # Anything other than exactly one performer with that name: leave the scene untouched
    if len(stash_performers) != 1:
        continue

    stash_performer = stash_performers[0]

    stash.update_scene({
        'id': scene['id'],
        'title': scene['sidecar_title'],
        'performer_ids': [stash_performer['id']],
        'urls': scene['sidecar_urls']
    })

In [None]:
# Scenes matching "AprilW9" that still have no title but already have a code
scenes_for_update = stash.find_scenes(
    {
        'title': { 'value': '', 'modifier': 'IS_NULL' },
        'code': { 'value': '', 'modifier': 'NOT_NULL' },
    },
    q = "AprilW9",
)
df_scenes_for_update = pd.DataFrame(scenes_for_update)

In [None]:
import datetime

# Fill in title, date, details and performer for each scene from its Reddit submission.
for index, scene in df_scenes_for_update.iterrows():
    scene_for_scraping = stash.find_scene(scene['id'])

    # The scene's code holds the Reddit submission id
    submission = reddit.submission(id = scene_for_scraping['code'])

    scene_update = {
        'id': scene_for_scraping['id'],
        'code': submission.id,
        'title': submission.title,
        'urls': scene_for_scraping['urls'],
        'date': datetime.datetime.fromtimestamp(submission.created_utc, tz=datetime.UTC).strftime('%Y-%m-%d'),
        'details': submission.selftext
    }

    # Resolve the performer from this submission's own author. The previous code reused
    # `stash_performer`, a variable left over from an unrelated earlier loop, so every
    # scene received whichever performer that loop happened to see last.
    # PRAW reports a deleted account as author None.
    author_name = submission.author.name if submission.author else None
    if author_name:
        matching_performers = stash.find_performers({ 'name': { 'value': author_name, 'modifier': 'EQUALS' } })
        # Only assign on an unambiguous match; otherwise 'performer_ids' is left out of the update
        if len(matching_performers) == 1:
            scene_update['performer_ids'] = [matching_performers[0]['id']]

    stash.update_scene(scene_update)

In [None]:
# All deduplicated posts whose Author is exactly 'AprilW9'
aprilw9_gwasi = deduplicated_df.loc[deduplicated_df['Author'].eq('AprilW9')]
aprilw9_gwasi

In [None]:
# Literal substring search. regex=False keeps the text from being read as a pattern, and
# na=False turns missing titles into False instead of NaN, which would make the mask invalid.
aprilw9_gwasi[aprilw9_gwasi['Title'].str.contains('Love Blowing', regex=False, na=False)]

In [None]:
# Query Stash again for the "AprilW9" scenes that still have no title
aprilw9_scenes = stash.find_scenes(
    { 'title': { 'value': '', 'modifier': 'IS_NULL' } },
    q = "AprilW9",
)
df_aprilw9_scenes = pd.DataFrame(aprilw9_scenes)

df_aprilw9_scenes