In [88]:
import pysrt
from pysrt.srttime import SubRipTime
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import clear_output
import chardet

In [133]:
# Input subtitle files: the English and Hebrew tracks of the same movie.
eng_sub_path = "Data/English/Aquaman.2018.srt"
heb_sub_path = "Data/Hebrew/Aquaman.2018.srt"

In [134]:
# Sniff the Hebrew subtitle file's character encoding from its raw bytes.
with open(heb_sub_path, mode="rb") as raw_file:
    raw_bytes = raw_file.read()
result = chardet.detect(raw_bytes)
encoding = result["encoding"]
encoding

'utf-8'

In [135]:
# Parse both subtitle tracks. Only the Hebrew file is opened with the encoding
# detected above; the English file uses pysrt's default handling.
eng_subs = pysrt.open(eng_sub_path)
heb_subs = pysrt.open(heb_sub_path, encoding=encoding)

In [136]:
# Timing tolerance used when pairing Hebrew and English cues, in seconds.
dif_seconds = 0.8
# Scaled by 1000 to an integer offset for the timestamp comparisons.
# round() instead of int(): int() truncates, so floating-point error in the
# product could otherwise shave one unit off the tolerance.
dif = round(dif_seconds * 1000)

In [137]:
# Pair every Hebrew cue with the first English cue whose start AND end times
# both lie strictly within `dif` of the Hebrew cue's, and flag pairs whose text
# shape differs as not "Safe".
LOOKAHEAD = 2000  # stop scanning once an English cue starts this far after the Hebrew cue
subs_columns = ["Hebrew", "English", "Safe", "Hebrew_st", "Hebrew_e", "English_st", "English_e"]

# Matches are collected in a plain list and converted to a DataFrame once at the
# end, instead of enlarging the DataFrame one row at a time.
matched_rows = []
for heb_sub in heb_subs:
    h_start = heb_sub.start
    h_end = heb_sub.end
    for eng_sub in eng_subs:
        e_start = eng_sub.start
        e_end = eng_sub.end

        if h_start + LOOKAHEAD < e_start:
            # Early exit: only correct if eng_subs is in chronological order.
            break

        if h_start - dif > e_start:  # English cue starts more than `dif` before the Hebrew cue
            continue

        if (e_end - dif) < h_end < (e_end + dif) and (e_start - dif) < h_start < (e_start + dif):
            heb_text = BeautifulSoup(heb_sub.text, "html.parser").getText()  # Removes html format like <i>
            eng_text = BeautifulSoup(eng_sub.text, "html.parser").getText()

            # A pair is "safe" only when both texts have the same number of line
            # breaks and the same number of question marks; a difference in
            # either hints that the sentences may not correspond.
            safe = (
                heb_text.count("\n") == eng_text.count("\n")
                and heb_text.count("?") == eng_text.count("?")
            )
            matched_rows.append([heb_text, eng_text, safe, h_start, h_end, e_start, e_end])
            break  # at most one English cue is paired with each Hebrew cue

subs = pd.DataFrame(matched_rows, columns=subs_columns)



In [138]:
# Show the pairs that are not flagged as safe.
subs[subs["Safe"].ne(True)]

Unnamed: 0,Hebrew,English,Safe,Hebrew_st,Hebrew_e,English_st,English_e
19,...עושה את דרכו לחופים,is due to make landfall\nsometime after 2:00 p.m.,False,"00:05:09,150","00:05:11,460","00:05:08,810","00:05:11,312"
22,".על שם האגדה\n?הוא מלך, לא","After a legend. He's a king, isn't he?",False,"00:05:15,870","00:05:20,570","00:05:15,942","00:05:19,946"
39,שני העולמות לא נועדו להיפגש אי-פעם,"Their two worlds\nwere never meant to meet,",False,"00:07:35,840","00:07:38,770","00:07:36,124","00:07:38,876"
50,",אני אחזור\n.כשהמצב יהיה בטוח",I will return to you. When it's safe.,False,"00:08:27,220","00:08:31,560","00:08:27,341","00:08:31,512"
56,.שלא יישכח אותי,Don't let him forget me. Hmm?,False,"00:08:56,220","00:08:59,420","00:08:56,329","00:08:59,332"
...,...,...,...,...,...,...,...
928,.אבי היה שומר מגדלור,My father\nwas a lighthouse keeper.,False,"02:10:53,880","02:10:56,580","02:10:53,889","02:10:56,517"
937,",במכון למדעי הים בארה''ב",at the United States Institute\nof Marine Scie...,False,"02:14:43,240","02:14:45,450","02:14:43,076","02:14:45,495"
943,"!ד''ר שין\n.תן לי לסיים, בבקשה",- Dr. Shin!\n- Will you please let me finish?,False,"02:14:59,480","02:15:01,210","02:14:59,259","02:15:00,979"
944,.שוב התחלנו,Here we go again. Atlantean?,False,"02:15:01,480","02:15:02,860","02:15:01,011","02:15:02,763"


In [96]:
# Manual review of every pair that is not flagged as safe.
# For each pair, type:
#   "v"     -> mark the pair as safe
#   "x"     -> leave the pair unsafe
#   "break" -> stop reviewing
unsafe_rows = subs[subs["Safe"] != True]  # snapshot, so edits to `subs` don't disturb the iteration
unsafe_len = unsafe_rows.shape[0]
for i, (index, row) in enumerate(unsafe_rows.iterrows(), start=1):
    while True:
        print(f"{i}\\{unsafe_len}")
        print(row["Hebrew_st"], "-", row["Hebrew_e"])
        print(row['Hebrew'])
        print(row['English'])
        res = input()
        if res == "x":
            clear_output()
            # Read the text from `row`: `index` is an index LABEL, so looking it
            # up positionally with .iloc is wrong whenever the frame's index is
            # not 0..n-1 (e.g. after rows have been filtered out).
            print(f"Marking unsafe: {row['Hebrew']}\n")
            break
        elif res == "v":
            clear_output()
            print(f"Marking safe: {row['Hebrew']}\n")
            subs.loc[index, "Safe"] = True
            break
        elif res == "break":
            break
        else:
            # Unknown answer: show the help text and ask again for the same pair.
            clear_output()
            print("Invalid input, x-delete, v-save, break-break\n")
    if res == "break":
        break

Marking unsafe: ...עושה את דרכו לחופים

2\128
00:05:15,870 - 00:05:20,570
.על שם האגדה
?הוא מלך, לא
After a legend. He's a king, isn't he?


 break


In [139]:
# Keep only the pairs marked safe. .copy() makes the result an independent
# frame, so the column assignments that follow do not write into a slice of the
# unfiltered frame (which triggers pandas' SettingWithCopyWarning).
subs = subs[subs["Safe"] == True].copy()

In [140]:
# Flatten each subtitle onto a single line: newlines and colons both become ", ".
# regex=False keeps the replacements literal regardless of the pandas version's
# default, and .assign builds a new frame instead of assigning columns on a
# frame that may be a filtered slice.
subs = subs.assign(
    **{
        text_column: subs[text_column]
        .str.replace("\n", ", ", regex=False)
        .str.replace(":", ", ", regex=False)
        for text_column in ("Hebrew", "English")
    }
)

In [141]:
# Write the current `subs` frame to CSV, leaving out the index column.
subs.to_csv(
    path_or_buf=r"Data/Modified/Aquaman.2018.csv",
    index=False,
)

In [144]:
# Load a previously exported subtitle table and display it.
test = pd.read_csv(
    filepath_or_buffer=r"Data/Modified/Mission.Impossible.Dead.Reckoning.Part.One-modified.csv"
)
test

Unnamed: 0,Hebrew,English,Safe,Hebrew_st,Hebrew_e,English_st,English_e
0,.באורח נס,miraculously.,True,"00:01:03,534","00:01:04,867","00:01:03,523","00:01:04,899"
1,",לאחר 25,000 מיילים ימיים","After 25, 000 nautical miles,",True,"00:01:11,542","00:01:14,086","00:01:11,072","00:01:14,200"
2,.ונותרה בלתי מזוהה לחלוטין,and remained completely undetected.,True,"00:01:18,882","00:01:21,802","00:01:19,080","00:01:21,833"
3,"יכולות החמקנות של הפודקובה, .התעלו על כל הציפיות","The Podkova's stealth capability has, exceeded...",True,"00:01:23,637","00:01:28,642","00:01:23,668","00:01:28,881"
4,מכונת ההרג המפחידה ביותר,The most fearsome killing machine,True,"00:01:44,366","00:01:46,702","00:01:44,397","00:01:46,733"
...,...,...,...,...,...,...,...
1296,.והעולם ישלם את המחיר האולטימטיבי,and the world will pay the ultimate price.,True,"02:34:42,961","02:34:45,714","02:34:42,983","02:34:45,444"
1297,.הקרבתו תהיה לשווא,their sacrifice will have been in vain.,True,"02:34:50,594","02:34:53,555","02:34:50,616","02:34:53,035"
1298,.אז תזדרז,So hurry.,True,"02:34:54,264","02:34:55,724","02:34:54,286","02:34:55,454"
1299,.אין הרבה זמן,There isn't much time.,True,"02:34:56,350","02:34:58,268","02:34:56,371","02:34:57,915"


In [82]:
# Restore the SubRipTime type in the four timestamp columns of `test`.
def _to_subrip_time(value):
    """Parse a string into a SubRipTime; return any non-string value unchanged."""
    return SubRipTime.from_string(value) if isinstance(value, str) else value


for time_column in ("Hebrew_st", "Hebrew_e", "English_st", "English_e"):
    test[time_column] = test[time_column].apply(_to_subrip_time)

In [122]:
# Display the current `subs` DataFrame as this cell's output.
subs

Unnamed: 0,Hebrew,English,Safe,Hebrew_st,Hebrew_e,English_st,English_e
0,!גול,Goal!,True,"00:02:11,928","00:02:13,367","00:02:12,345","00:02:13,835"
2,",מה שזה לא יהיה, .זה רחוק מכאן","I heard it, Manny., Whatever it is, it's miles...",True,"00:03:27,611","00:03:29,011","00:03:26,887","00:03:29,219"
3,"?פיצ'ס, את בסדר","Peaches, are you all right?",True,"00:03:29,118","00:03:31,076","00:03:29,389","00:03:30,720"
5,"שניכם הייתם אמורים להיות, !דודים אחראיים","You two were supposed to be, responsible uncles!",True,"00:03:42,116","00:03:44,616","00:03:42,469","00:03:44,869"
6,"מה? לא ראיתי את פיצ'ס מתגנבת, .לפני 15 או 20 דקות","What? I didn't see Peaches sneak off, maybe 15...",True,"00:03:44,728","00:03:48,113","00:03:44,938","00:03:48,169"
...,...,...,...,...,...,...,...
395,- וונדה סייקס -,# Somehow we found it here #,True,"01:20:39,405","01:20:42,781","01:20:39,882","01:20:42,942"
396,- ג'ניפר לופז -,# We found us a home #,True,"01:20:43,028","01:20:46,468","01:20:43,118","01:20:46,576"
399,"- אלאיין צ'באט -, - הת'ר מוריס -","# We are #, # We are #",True,"01:21:12,224","01:21:15,321","01:21:12,814","01:21:15,510"
400,"...אנחנו, אנחנו","# We are, we are #",True,"01:21:19,879","01:21:22,489","01:21:20,355","01:21:22,949"
