In [1]:
import pandas as pd
from googletrans import Translator
from tqdm.notebook import tqdm
import time 

In [2]:
# Load the transcription CSV (the file name indicates Whisper output) from the sibling data directory.
df = pd.read_csv("../data/video_transcription_with_whisper.csv")
# Bare last expression: rendered as the cell's output.
df

Unnamed: 0,video_id,transcription
0,L0oo8PsUKrA,"I am from BBC News, and I want to ask about t..."
1,2Shla50pRxA,"Biamonduotus, welcome to Reporters Plus here ..."
2,nbBsh46aSz4,Sianan is investigating potential mass detent...
3,tcKw5jiT6wg,Being one of the most influential media in th...
4,nfGkJAX2WUc,Our administration is committed to leading wi...
...,...,...
718,rMLqE8N24wE,1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% ...
719,dS7FpBbRA70,4ac 80ml 설탕 1kgavoCA 440 Armenians 1.5% 355人 ...
720,Hbhp6Oi_eXI,1.5分間 1.5分間 1.5分間 1.5分間 1.5分間 1.5分間 1.5分間 1.5...
721,OsItiPfnzLI,"Dr. Grover Proctor, what I'm going to do toni..."


In [3]:
# Keep only rows that carry a real transcription: discard the 'Not Video'
# marker and missing values, then renumber the remaining rows from 0.
df = (
    df[(df.transcription != 'Not Video') & df.transcription.notnull()]
    .reset_index(drop=True)
)
df

Unnamed: 0,video_id,transcription
0,L0oo8PsUKrA,"I am from BBC News, and I want to ask about t..."
1,2Shla50pRxA,"Biamonduotus, welcome to Reporters Plus here ..."
2,nbBsh46aSz4,Sianan is investigating potential mass detent...
3,tcKw5jiT6wg,Being one of the most influential media in th...
4,nfGkJAX2WUc,Our administration is committed to leading wi...
...,...,...
564,rMLqE8N24wE,1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% ...
565,dS7FpBbRA70,4ac 80ml 설탕 1kgavoCA 440 Armenians 1.5% 355人 ...
566,Hbhp6Oi_eXI,1.5分間 1.5分間 1.5分間 1.5分間 1.5分間 1.5分間 1.5分間 1.5...
567,OsItiPfnzLI,"Dr. Grover Proctor, what I'm going to do toni..."


In [4]:
# Factory for the translation client used by the helpers below.
def create_translator():
    """Build a ``Translator`` and switch its ``raise_Exception`` attribute on.

    NOTE(review): the attribute name (capital "E") is reproduced exactly as
    written; presumably it is the spelling the installed googletrans version
    reads - confirm before renaming it.

    Returns:
        The configured ``Translator`` instance.
    """
    client = Translator()
    client.raise_Exception = True
    return client

In [5]:
# Translates a piece of text into English.
def translate(translator, transcription):
    """Return the English translation of ``transcription``.

    Args:
        translator: object exposing ``translate(text, dest=...)`` whose result
            has a ``.text`` attribute.
        transcription: the text to translate.

    Returns:
        The ``.text`` of the result of ``translator.translate(..., dest='en')``.
    """
    result = translator.translate(transcription, dest='en')
    return result.text

In [6]:
# Asks the translator which language a piece of (translated) text is in.
def detect_language(translator, translated_text):
    """Return the detection result for ``translated_text``.

    Args:
        translator: object exposing ``detect(text)``.
        translated_text: the text whose language should be detected.

    Returns:
        Whatever ``translator.detect`` returns, unmodified.
    """
    return translator.detect(translated_text)

In [7]:
# Processes the text in consecutive chunks (like batches) because each request has a character limit.
def traverse(length, limit, trans, function, translator):
    """Apply ``function`` to consecutive ``limit``-sized chunks of ``trans``.

    Every character of the first ``length`` characters of ``trans`` is sent
    exactly once, in order; the final chunk may be shorter than ``limit`` and
    no empty chunk is ever sent. Chunks are cut purely by position, so a
    boundary can fall in the middle of a word or sentence.

    Args:
        length: number of characters of ``trans`` to process (callers pass
            ``len(trans)``); values larger than ``len(trans)`` are clipped.
        limit: maximum number of characters per chunk; must be positive.
        trans: the text to process.
        function: ``translate`` or ``detect_language``; it is called as
            ``function(translator, chunk)`` and dispatched on its ``__name__``.
        translator: passed through to ``function`` unchanged.

    Returns:
        For ``translate``: the per-chunk results concatenated.
        For ``detect_language``: the per-chunk ``.lang`` values joined by a
        single space. For any other function name: an empty string (the
        function is still called on every chunk).

    Raises:
        ValueError: if ``limit`` is not positive.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive number of characters")

    total = min(length, len(trans))
    results = [
        function(translator, trans[start:min(start + limit, total)])
        for start in range(0, total, limit)
    ]

    if function.__name__ == 'translate':
        return "".join(results)

    if function.__name__ == 'detect_language':
        return " ".join(result.lang for result in results)

    return ""

In [8]:
# Main
def _translate_with_backoff(translator, trans, limit, pause):
    """Translate ``trans`` to English, halving the chunk size after each failure."""
    length = len(trans)  # outside the try: a non-text input should fail fast, not be retried
    last_exc = None

    while limit >= 1:
        try:
            if length <= limit:
                return translate(translator, trans)
            return traverse(length, limit, trans, translate, translator)
        except Exception as exc:
            print("Exception happened while translating the language: ", exc)
            last_exc = exc
            limit = int(limit/2)
            # Do not hammer the service while it is already rejecting requests.
            time.sleep(pause)

    raise RuntimeError("Translation failed even with the smallest chunk size") from last_exc


def _english_ratio_with_backoff(translator, text, det_limit, pause):
    """Return the fraction of ``text`` chunks detected as English, halving the chunk size after each failure."""
    text_length = len(text)
    last_exc = None

    while det_limit >= 1:
        try:
            if text_length <= det_limit:
                detected = detect_language(translator, text)
                return 1 if detected.lang == 'en' else 0

            # traverse returns the per-chunk language codes separated by spaces.
            detected_langs = traverse(text_length, det_limit, text, detect_language, translator).split()
            if not detected_langs:
                return 0
            # Count exact 'en' codes rather than 'en' substrings.
            return detected_langs.count('en') / len(detected_langs)
        except Exception as exc:
            print("Exception happened while detecting language: ", exc)
            last_exc = exc
            det_limit = int(det_limit/2)
            time.sleep(pause)

    raise RuntimeError("Language detection failed even with the smallest chunk size") from last_exc


def find_translation(trans, limit=4800, det_limit=4800, min_english_ratio=0.8, pause=1):
    """Translate one transcription to English and keep it only if the result is mostly English.

    The text is translated in chunks of at most ``limit`` characters, then the
    language of the translated text is detected in chunks of at most
    ``det_limit`` characters. After a failed attempt the corresponding chunk
    size is halved and the step is retried; once the chunk size would drop
    below one character the error is raised instead of retrying forever.

    Args:
        trans: the transcription text.
        limit: initial maximum characters per translation request.
        det_limit: initial maximum characters per detection request.
        min_english_ratio: minimum share of chunks that must be detected as
            'en' for the translation to be kept.
        pause: seconds to sleep between steps and before each retry.

    Returns:
        The translated text, or the string "Not English" when the detected
        English ratio is below ``min_english_ratio``.

    Raises:
        TypeError: if ``trans`` has no length (e.g. a missing value).
        RuntimeError: if translation or detection keeps failing after the
            chunk size has been halved down to a single character.
    """
    translator = create_translator()

    complete_translated_text = _translate_with_backoff(translator, trans, limit, pause)

    # Pause between requests. The original note says this prevents a "409 ERROR".
    # NOTE(review): HTTP 429 is the usual rate-limit status - confirm which one was observed.
    time.sleep(pause)

    detected_lan_ratio = _english_ratio_with_backoff(
        translator, complete_translated_text, det_limit, pause
    )

    time.sleep(pause)

    # If the English ratio is below the threshold, English is not dominant.
    if detected_lan_ratio < min_english_ratio:
        return "Not English"
    return complete_translated_text


In [9]:
# Translate every transcription in row order; tqdm renders a progress bar.
# Results are appended one by one, so `translations` holds the rows finished so far
# if the run is interrupted, and lines up with `df` row-for-row when it completes.
translations = []

for transcription in tqdm(df.transcription):
    translations.append(find_translation(transcription))

  0%|          | 0/569 [00:00<?, ?it/s]

In [10]:
# Pair each video_id with its translated text (or the "Not English" marker returned by find_translation).
trans_df = pd.DataFrame({'video_id': df.video_id.to_list(), 'transcription': translations})
trans_df

Unnamed: 0,video_id,transcription
0,L0oo8PsUKrA,"I am from BBC News, and I want to ask about th..."
1,2Shla50pRxA,"Biamonduotus, welcome to Reporters Plus here o..."
2,nbBsh46aSz4,Sianan is investigating potential mass detenti...
3,tcKw5jiT6wg,Being one of the most influential media in the...
4,nfGkJAX2WUc,Our administration is committed to leading wit...
...,...,...
564,rMLqE8N24wE,1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1...
565,dS7FpBbRA70,4AC 80ml sugar 1kgavoca 440 Armenians 1.5% 355...
566,Hbhp6Oi_eXI,1.5 minutes for 1.5 minutes for 1.5 minutes fo...
567,OsItiPfnzLI,"Dr. Grover Proctor, what I'm going to do tonig..."


In [11]:
# Keep only the rows whose translation was not flagged with the "Not English" marker.
trans_filtered_df = trans_df.loc[trans_df["transcription"].ne("Not English")]
trans_filtered_df

Unnamed: 0,video_id,transcription
0,L0oo8PsUKrA,"I am from BBC News, and I want to ask about th..."
1,2Shla50pRxA,"Biamonduotus, welcome to Reporters Plus here o..."
2,nbBsh46aSz4,Sianan is investigating potential mass detenti...
3,tcKw5jiT6wg,Being one of the most influential media in the...
4,nfGkJAX2WUc,Our administration is committed to leading wit...
...,...,...
564,rMLqE8N24wE,1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1...
565,dS7FpBbRA70,4AC 80ml sugar 1kgavoca 440 Armenians 1.5% 355...
566,Hbhp6Oi_eXI,1.5 minutes for 1.5 minutes for 1.5 minutes fo...
567,OsItiPfnzLI,"Dr. Grover Proctor, what I'm going to do tonig..."


In [12]:
# Save the filtered translations without writing the index column.
# NOTE(review): this writes to the current working directory, while the input was read from ../data/ - confirm that is intended.
trans_filtered_df.to_csv("video_transcription_and_translation_whisper.csv", index = False)