In [1]:
import pandas as pd
import Levenshtein

def levenshtein_ratio(name1, name2):
    """Return the similarity ratio of two names.

    Thin wrapper around ``Levenshtein.ratio``; both arguments are passed
    through unchanged and the library's result is returned as-is.
    """
    similarity = Levenshtein.ratio(name1, name2)
    return similarity

# Minimum value returned by levenshtein_ratio() for a (title, file name) pair
# to be recorded as a match; pairs scoring below this are ignored.
threshold = 0.80  # You can adjust this value

In [3]:
# Load the exported listing; only the title and publish date are needed.
file_path = "F:\\Eva de Vil - Algolia Results\\output.json"
desired_columns = ["title", "publish_date"]

df_json = pd.read_json(file_path).loc[:, desired_columns]
# Parse the publish timestamps and keep only the calendar date.
df_json = df_json.assign(
    publish_date=pd.to_datetime(df_json["publish_date"]).dt.date
)

print(df_json)

                                  title publish_date
0        This Was Meant To Be Temporary   2024-01-13
1                 So Weak, So Desperate   2024-01-12
2                   Habits For My Slave   2024-01-07
3               Your Sex Life This Year   2024-01-05
4                    My Loser Pump Slut   2023-12-02
..                                  ...          ...
764                  Sucker For My Tits   2021-11-13
765      Cruel Cuckolding: Eat His Load   2021-11-12
766        Gooner's Pussy Free Paradise   2021-10-31
767            Motivational Ass Worship   2021-10-30
768  Quick Fishnet Ass Worship (iPhone)   2018-01-02

[769 rows x 2 columns]


In [4]:
import os
import re
from datetime import datetime

# Collect the names of the regular files (not sub-directories) that sit
# directly inside the video directory.
directory_path = "Y:\\Culture\\Videos\\Performers\\Eva de Vil\\"
file_names = []
for entry in os.listdir(directory_path):
    if os.path.isfile(os.path.join(directory_path, entry)):
        file_names.append(entry)
df_files = pd.DataFrame(file_names, columns=["raw_file_name"])

def extract_after_last_dash(raw_file_name):
    """Return the title portion of a file name.

    The title is the text after the last field separator, with the
    " (STOLEN)" marker and the ".mp4" extension removed and surrounding
    whitespace stripped.

    Args:
        raw_file_name: File name as found on disk.

    Returns:
        The cleaned trailing field, or the whole cleaned name when it
        contains no dash at all.
    """
    # Fields are normally separated by " - ".  Splitting on that first keeps
    # hyphens that belong to the title itself ("Semi-Final" must not become
    # "Final"); names without the spaced separator fall back to a bare '-'.
    separator = " - " if " - " in raw_file_name else "-"
    # rsplit returns the whole string when the separator is absent.
    last_part = raw_file_name.rsplit(separator, 1)[-1]
    return last_part.replace(" (STOLEN)", "").replace(".mp4", "").strip()

def parse_date_from_filename(filename):
    """Extract the first valid YYYY-MM-DD date embedded in a file name.

    Args:
        filename: File name to scan.

    Returns:
        A ``datetime.date`` for the first substring that both looks like
        YYYY-MM-DD and is a real calendar date, or ``None`` when there is
        no such substring.
    """
    # The pattern only checks digit counts, so it also matches impossible
    # dates such as "2024-13-45".  Those are skipped instead of letting
    # strptime's ValueError abort the caller.
    for match in re.finditer(r'\d{4}-\d{2}-\d{2}', filename):
        try:
            return datetime.strptime(match.group(), '%Y-%m-%d').date()
        except ValueError:
            continue

    return None

# Derive the cleaned title and the embedded release date from each raw name.
df_files["file_name"] = df_files["raw_file_name"].map(extract_after_last_dash)
df_files["release_date"] = df_files["raw_file_name"].map(parse_date_from_filename)

In [5]:
# Score every (JSON title, file name) pair and remember the index labels of
# the pairs whose ratio reaches the threshold.
matches = []
for i, title in df_json["title"].items():
    for j, file_name in df_files["file_name"].items():
        ratio = levenshtein_ratio(title, file_name)
        if ratio >= threshold:
            matches.append((i, j, ratio))

# Pair up the full rows behind each match: one record per matched pair.
matched_rows = []
for i, j, ratio in matches:
    matched_rows.append((df_json.iloc[i], df_files.iloc[j], ratio))
df_matched = pd.DataFrame(matched_rows, columns=['JSON Row', 'Files Row', 'Ratio'])

In [14]:
# This is for Dirty Words: match each entry's name against the files on disk
# and collect the entries that have no file at or above the threshold.
df_dw = pd.read_json("Y:\\eva_de_vil_dirtywords.org.json")

dw_matches = []
dw_missing = []
for i, row1 in df_dw.iterrows():
    found = False
    for j, file_name in df_files["file_name"].items():
        ratio = levenshtein_ratio(row1["name"], file_name)
        if ratio >= threshold:
            dw_matches.append((i, j, ratio))
            found = True

    if not found:
        dw_missing.append(row1)

# Create a new DataFrame based on matches.
# Stored under its own name: the flattening cell further down reads
# `df_matched` and expects 'title' / 'publish_date' rows from df_json, so
# overwriting `df_matched` here would feed it the wrong rows on a
# top-to-bottom run.
matched_dw_rows = [(df_dw.iloc[i], df_files.iloc[j], ratio) for i, j, ratio in dw_matches]
df_dw_matched = pd.DataFrame(matched_dw_rows, columns=['JSON Row', 'Files Row', 'Ratio'])

# Entries with no matching file, one row per entry.
df_missing = pd.DataFrame(dw_missing)


In [16]:
# Save the entries that had no matching file as a JSON array of row objects.
df_missing.to_json("Y:\\eva_de_vil_dirtywords_missing.json", orient='records')

In [6]:
# Each cell of 'JSON Row' / 'Files Row' holds a whole row; pull the individual
# fields out so every matched pair becomes one flat record.
json_rows = df_matched['JSON Row']
files_rows = df_matched['Files Row']

df_matched_flat = pd.DataFrame({
    'raw_filename': files_rows.map(lambda row: row['raw_file_name']),
    'title': json_rows.map(lambda row: row['title']),
    'publish_date': json_rows.map(lambda row: row['publish_date']),
    'file_name': files_rows.map(lambda row: row['file_name']),
    'release_date': files_rows.map(lambda row: row['release_date']),
    'ratio': df_matched['Ratio'],
})


In [7]:
# Find rows where publish_date and release_date are different.
# Only rows that actually have a release date are considered, and titles
# starting with "Edge Slut Training" are excluded.
different_dates = df_matched_flat[
    (df_matched_flat['release_date'].notna()) &
    # The comparison this cell is named for; without it every dated row
    # is returned, whether or not the two dates agree.
    (df_matched_flat['publish_date'] != df_matched_flat['release_date']) &
    ~(df_matched_flat['title'].str.startswith("Edge Slut Training"))
]

In [8]:
# Keep the matched files that carry the STOLEN marker but have no date in
# their name.
missing_release_date = df_matched_flat["release_date"].isna()
marked_stolen = df_matched_flat["raw_filename"].str.contains("STOLEN")
df_filtered = df_matched_flat.loc[missing_release_date & marked_stolen]

In [9]:
# Build one `mv "<old>" "<new>"` command per filtered row; the new name embeds
# the publish date and the matched title.
# NOTE(review): both names are wrapped in double quotes without any escaping,
# so a name containing '"' would yield a malformed command -- confirm the
# inputs never contain one.
publish_day = pd.to_datetime(df_filtered['publish_date']).dt.strftime('%Y-%m-%d')
rename_command = (
    "mv \""
    + df_filtered['raw_filename']
    + "\" \"Eva de Vil - "
    + publish_day
    + " - "
    + df_filtered['title']
    + " (STOLEN).mp4\""
)
df_renames = pd.DataFrame({'rename_command': rename_command})

In [121]:
# Write one rename command per line.
# The lines are written directly rather than through `to_csv`: the CSV writer
# treats '"' as its quote character, and every command contains double quotes,
# so each line would be wrapped in quotes with its inner quotes doubled
# ("mv ""old"" ""new""") and would no longer be a usable command.
# Text mode turns "\n" into the platform line separator, matching to_csv's
# default line terminator.
with open('rename_commands.txt', 'w', encoding='utf-8') as commands_file:
    # A missing publish date or title makes the whole command NaN; skip those.
    for command in df_renames['rename_command'].dropna():
        commands_file.write(f"{command}\n")
