In [4]:
import os
import re
from fuzzywuzzy import fuzz

In [5]:
### SHOW SPECIFIC CONFIGURATION ###

### gentle REMINDER: ###
# 1. make sure you go through visually and inspect the episode list you have with you, be it scrapped or
#    manually created, and make sure the episode names are best to your knowledge
# 2. make sure to point to the right directories.

SHOW_NAME = "TMKOC"
EPISODE_DOWNLOADED_DIRS = [r"D:\Shows\Taarak Mehta Ka Ooltah Chashmah"]
SEARCH_THRESHOLD = 0.9


def custom_clean_function(downloaded_ep_name):
    """write this to clean your file name which may be unique to how you have downloaded it.
    Try to make it return as close to the episode name in the list as possible.

    Args:
            downloaded_ep_name (str): the episode name in the downloaded folder, with only 1 dot before its extension, and all non alphanumeric characters removed.
    """
    # remove the show number from before the semi colon if it exists
    if ":" in downloaded_ep_name:
        downloaded_ep_name = downloaded_ep_name.split(":")[1]
    return downloaded_ep_name

In [6]:
# print the current working directory
print(os.getcwd())

s:\Shows\Shows-scripts\script_files


In [7]:
# get the list of episodes
episodes_file = os.path.join(
    os.getcwd(),
    f"../shows_info_files/{SHOW_NAME}",
    "episode_name_list.txt",
)

In [8]:
# read episodes file.
# and store it in hashmap

episodes = {}
with open(episodes_file, "r") as f:
    for i, line in enumerate(f):
        episodes[line.strip()] = i + 1

In [9]:
print("Length of episodes:", len(episodes))
# print the first 100 episodes just to see how it looks


for episode_name, episode_number in list(episodes.items())[:10]:

    print(f"{episode_number}: {episode_name}")

Length of episodes: 2000
1: 100. Taarak Mehta Ka Ooltah Chashmah - Jethalal And His Family Reach The Theatre
2: 99. Taarak Mehta Ka Ooltah Chashmah - A Burglar Snatches Jethalal's Mobile
3: 98. Taarak Mehta Ka Ooltah Chashmah - Jethalal Is Excited To Go For Exercise With Babita
4: 97. Taarak Mehta Ka Ooltah Chashmah - Jethalal Starts Dieting
5: 96. Taarak Mehta Ka Ooltah Chashmah - Babita Suggests Jethalal To Exercise
6: 95. Taarak Mehta Ka Ooltah Chashmah - Winner Of Gokuldham's Cultural Dance Program
7: 94. Taarak Mehta Ka Ooltah Chashmah - Special Guest In Gokuldham's Cultural Dance Program
8: 93. Taarak Mehta Ka Ooltah Chashmah - Taarak Declares A Cultural Dance Program
9: 92. Taarak Mehta Ka Ooltah Chashmah - Clash Of Plans
10: 91. Taarak Mehta Ka Ooltah Chashmah - Dr. Hathi Gets Stuck In Auto Rickshaw


In [10]:
# get the list of files in the directory
all_files = []
for directory in EPISODE_DOWNLOADED_DIRS:
    all_files.extend(os.listdir(directory))

In [11]:
print("Length of all files:", len(all_files))
# print the first 10 files just to see how it looks
for file in all_files[:10]:
    print(file)

Length of all files: 2006
S01E0001. Taarak Mehta Introduces Himself And Society Members.mp4
S01E0002. Jethalal's Son Tapu Decides To Spend His Holiday.mp4
S01E0003. Tapu Has Been The Worry Of Jethalal.mp4
S01E0004. Daya Approaches Taarak Mehta For Help.mp4
S01E0005. Jethalal In Huge Trouble.mp4
S01E0006. Champaklal Gada Goes Missing.mp4
S01E0007. Champaklal Gada Gets A Grand Welcome In Gokuldham Society.mp4
S01E0008. Mumbai Is Hit By Heavy Rains.mp4
S01E0009. Jethalal Is Thrown In An Unlikely Situation.mp4
S01E0010. Champak Chacha Returns Home.mp4


In [12]:
# check if lengths are same
if len(episodes) == len(all_files):
    print("Lengths are same! congratulations!")
elif len(episodes) > len(all_files):
    print("More episodes than files")
else:
    print("More files than episodes")

More files than episodes


In [13]:
def clean_ep_name(ep_name):
    # remove all special characters
    ep_name = re.sub(r"[^A-Za-z0-9\s]", "", ep_name)
    # as the extension is the last 3 characters, lets make sure there is no other dot other than the 4th last char
    ep_name = ep_name[:-3] + "." + ep_name[-3:].replace(".", "_")

    return custom_clean_function(ep_name)

In [18]:
def rename_episode(old_name, new_name):
    # go through the directory list, and try each one to see if it matches
    for dir in EPISODE_DOWNLOADED_DIRS:
        try:
            if old_name in os.listdir(dir):
                print(f"Renaming {old_name} to {new_name}")
                os.rename(os.path.join(dir, old_name), os.path.join(dir, new_name))
                return True
        except Exception as e:
            print(f"Error renaming {old_name} to {new_name} in {dir}")
            print(e)
            return False

In [19]:
# iterate through all files and see if you find an episode match
issues = {}
for episode in all_files:
    print(f"Checking episode: {episode}")
    old_name, extension = clean_ep_name(episode).split(".")
    old_name = old_name.strip()
    print(f"Cleaned name: {old_name}")
    # do a fuzzy search
    max_match = 0
    best_match = ""
    for ep in episodes.keys():
        match = fuzz.ratio(old_name, ep)
        if match > max_match:
            max_match = match
            best_match = ep

    print(f"Found episode: {best_match}")
    # rename it to the episode number
    new_name = f"{episodes[best_match]}. {old_name}.{extension}"
    if not rename_episode(episode, new_name):
        issues[episode] = new_name

Checking episode: S01E0001. Taarak Mehta Introduces Himself And Society Members.mp4
Cleaned name: S01E0001 Taarak Mehta Introduces Himself And Society Members
Found episode: 1. Taarak Mehta Ka Ooltah Chashmah - Taarak Mehta Introduces Himself And Society Members
Renaming S01E0001. Taarak Mehta Introduces Himself And Society Members.mp4 to 100. S01E0001 Taarak Mehta Introduces Himself And Society Members.mp4
Checking episode: S01E0002. Jethalal's Son Tapu Decides To Spend His Holiday.mp4
Cleaned name: S01E0002 Jethalals Son Tapu Decides To Spend His Holiday
Found episode: 2. Taarak Mehta Ka Ooltah Chashmah - Jethalal's Son Tapu Decides To Spend His Holiday
Renaming S01E0002. Jethalal's Son Tapu Decides To Spend His Holiday.mp4 to 99. S01E0002 Jethalals Son Tapu Decides To Spend His Holiday.mp4
Checking episode: S01E0003. Tapu Has Been The Worry Of Jethalal.mp4
Cleaned name: S01E0003 Tapu Has Been The Worry Of Jethalal
Found episode: 3. Taarak Mehta Ka Ooltah Chashmah - Tapu Has Been The

In [17]:
issues

{}