# Mount Google Drive

In [6]:
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install Requirements

In [7]:
%%capture
!pip install python-docx
!pip install pandas==1.3.2
!pip install pyPDF2

In [8]:
import os
import re
import pandas as pd
import numpy as np

In [9]:
# Import Local Modules
path_wd = '/content/drive/MyDrive/Github/Screenplay'
os.chdir(path_wd)

%load_ext autoreload
%reload_ext autoreload

from screenplay import Screenplay
scp = Screenplay.Screenplay()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Upload or Select Screenplay Directory

## Upload

In [11]:
# Run Cell to choose file
path_upload = '/content/drive/MyDrive/Github/Screenplay/upload'
os.chdir(path_upload)
uploaded = files.upload()

path_to_read = path_upload

## Read from GDrive Folder

In [12]:
# Read from public_screenplay directory
path_to_read = '/content/drive/MyDrive/Github/Screenplay/private_screenplays'

ls_fns = []
for dirpath, dirname, fns in os.walk(path_to_read):
  ls_fns.extend(fns)
ls_fns

['Friend_Request_2019.pdf',
 '愚人之家（2019稿）nowm.docx',
 '电影剧本_往来有玉面.docx',
 '合并剧本word.docx',
 'wanglai.xml',
 'PARADOX_CN_20200904_VX.pdf',
 'Friend_Request_2019.txt',
 'Friend_Request_2019.xml',
 'df_friend_request.json']

# Load Screenplay
 - parse.Scene_Heading
 - parse.D_Character_Parenthetical
 - parse.D_Dialogue_Parenthetical

In [13]:
dfsc = scp.read.auto(path_to_read +  '/Friend_Request_2019.xml')

# Parse Scene Heading into separate columns
dfsc = scp.parse.Scene_Heading(dfsc)

# Parse Character Parenthetical into separate rows
dfsc = scp.parse.D_Character_Parenthetical(dfsc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfC_melted['value'] = dfC_melted['value'].apply(str.strip)


In [None]:
dfsc[10:20]

# Translate

In [15]:
# Create a target Element column for translation
#############
tgt_lang='zh'
#############
dfsc['Element_{}'.format(tgt_lang)] = dfsc['Element'] 

## Translate Characters
 D Characters
 Other Characters in Element

In [29]:
# RE-Parse Scene Heading into separate columns,
# if necessary to fix typos
dfsc = scp.parse.Scene_Heading(dfsc)

In [16]:
# Fix df, if any detected
dfsc.loc[dfsc['Element'] == 'I DIDN’T WANTA GO TO YOUR BIRTHDAY', 'Type'] = 'Dialogue'

In [72]:
# Define Translation Dict for Character
Character_en = dfsc.loc[dfsc['Type'] == 'Character', 'Element'].unique()
print(Character_en)

Character_zh = ['健身房员工', '西恩', '布鲁克', '艾莉', '男人',
       '新闻记者', '梅森', '祖儿', '查尔斯', '梅', '凯特',
       '救护人员', '王', '泰勒', '诺亚', '动物园管理员',
       '制服警察', '杏', '柯林斯太太', '科技调查员',
       '邻居', '兴奋的儿童声音']

en2zh_Character = dict(zip(Character_en, Character_zh))
print(en2zh_Character)

['GYM EMPLOYEE' 'SEAN' 'BROOKE' 'ALLISON' 'MAN' 'NEWS REPORTER' 'MASON'
 'ZOEY' 'CHARLES' 'MEI' 'KATE' 'EMERGENCY DISPATCH' 'WANG' 'TYLER' 'NOAH'
 'ZOO DIRECTOR' 'UNIFORMED POLICE' 'XING' 'MRS. COLLINS'
 'TECH INVESTIGATOR' 'NEIGHBOR' 'EXCITED KID’S VOICE']
{'GYM EMPLOYEE': '健身房员工', 'SEAN': '西恩', 'BROOKE': '布鲁克', 'ALLISON': '艾莉', 'MAN': '男人', 'NEWS REPORTER': '新闻记者', 'MASON': '梅森', 'ZOEY': '祖儿', 'CHARLES': '查尔斯', 'MEI': '梅', 'KATE': '凯特', 'EMERGENCY DISPATCH': '救护人员', 'WANG': '王', 'TYLER': '泰勒', 'NOAH': '诺亚', 'ZOO DIRECTOR': '动物园管理员', 'UNIFORMED POLICE': '制服警察', 'XING': '杏', 'MRS. COLLINS': '柯林斯太太', 'TECH INVESTIGATOR': '科技调查员', 'NEIGHBOR': '邻居', 'EXCITED KID’S VOICE': '兴奋的儿童声音'}


In [34]:
# Apply Name Translation to target
def translate_character(x):
  ls_chars = re.findall('|'.join(Character_en), 
                        x, re.IGNORECASE
  )
  for char in ls_chars:
    x = re.sub('(?:(?<=\s)|^)' + char, 
               en2zh_Character[char.upper()], 
               x
              )
  return x

dfsc['Element_{}'.format(tgt_lang)] = \
  dfsc['Element_{}'.format(tgt_lang)].apply(translate_character)

In [None]:
dfsc[50:100]

## Translate SH IE

In [None]:
# Fix IE-related typos, if any.

In [54]:
# Define Translation Dict for IE
IE_en = dfsc['IE'].unique()
print(IE_en)
IE_zh = ['内', None, '外', '外./内']
en2zh_IE = dict(zip(IE_en, IE_zh))
print(en2zh_IE)

['INT' None 'EXT' 'EXT./INT.']
{'INT': '内', None: None, 'EXT': '外', 'EXT./INT.': '外./内'}


In [58]:
# Apply Translation
dfsc['IE_{}'.format(tgt_lang)] = None
dfsc.loc[~dfsc['IE'].isna(), 'IE_{}'.format(tgt_lang)] = \
  dfsc.loc[~dfsc['IE'].isna(), 'IE'].apply(
      lambda x: en2zh_IE[x]
  )

In [53]:
dfsc.head(2)

Unnamed: 0,Scene,Element,Grp,Type,IE,Location,Time,Element_zh,IE_zh
0,1,INT. GYM - NIGHT,H,Scene Heading,INT,GYM,NIGHT,INT. GYM - NIGHT,内
1,1,The last few People spill out into the night.,A,Action,,,,The last few People spill out into the night.,


## Translate SH Time

In [40]:
# Fix Time-related typos, if any.
dfsc.loc[dfsc['Time'] == 'DESCRIPT PLACE - NIGHT', 'Location'] = 'NON-DESCRIPT PLACE'
dfsc.loc[dfsc['Time'] == 'DESCRIPT PLACE - NIGHT', 'Time'] = 'NIGHT'

In [56]:
# Define Translation dict for Time
Time_en = dfsc['Time'].unique()
print(Time_en)
Time_zh = ['夜', None, '日', None, '夜 噩梦', '傍晚', '稍后',
       '雨夜', 'N/A', '继上']
en2zh_Time = dict(zip(Time_en, Time_zh))
print(en2zh_Time)

['NIGHT' None 'DAY' nan 'NIGHT NIGHTMARE' 'DUSK' 'LATER' 'RAINY NIGHT'
 'N/A' 'CONTINUOUS']
{'NIGHT': '夜', None: None, 'DAY': '日', nan: None, 'NIGHT NIGHTMARE': '夜 噩梦', 'DUSK': '傍晚', 'LATER': '稍后', 'RAINY NIGHT': '雨夜', 'N/A': 'N/A', 'CONTINUOUS': '继上'}


In [59]:
# Apply Translation
dfsc['Time_{}'.format(tgt_lang)] = None
dfsc.loc[~dfsc['Time'].isna(), 'Time_{}'.format(tgt_lang)] = \
  dfsc.loc[~dfsc['Time'].isna(), 'Time'].apply(
      lambda x: en2zh_Time[x]
  )

In [61]:
dfsc.head(2)

Unnamed: 0,Scene,Element,Grp,Type,IE,Location,Time,Element_zh,IE_zh,Time_zh
0,1,INT. GYM - NIGHT,H,Scene Heading,INT,GYM,NIGHT,INT. GYM - NIGHT,内,夜
1,1,The last few People spill out into the night.,A,Action,,,,The last few People spill out into the night.,,


## Translate SH Location

In [None]:
# Fix Location-related typos, if any
dfsc.loc[dfsc['Location'] == '/INT BROOKE’S CAR', 'IE'] = 'EXT./INT'
dfsc.loc[dfsc['Location'] == '/INT BROOKE’S CAR', 'Location'] = 'BROOKE’S CAR'

dfsc.loc[dfsc['Location'] == '/INT CAPTAIN MASON’S CAR', 'IE'] = 'EXT./INT'
dfsc.loc[dfsc['Location'] == '/INT CAPTAIN MASON’S CAR', 'Location'] = 'CAPTAIN MASON’S CAR'

In [69]:
# Define Translation dict for Location
Location_en = dfsc['Location'].unique()
print(Location_en)

Location_zh = Location_en.copy()
for i, loc in enumerate(Location_zh):
  if not loc:
    Location_zh[i] = None
  else:
    Location_zh[i] = translate_character(loc)
#print(Location_zh)

Location_zh = ['健身房', None, '艾莉的阳台 / 花房', '台阶 / 街道',
       '艾莉的客厅', '艾莉的家 / 停车位', '祖儿的复式公寓',
       '艾莉的家', '艾莉的沙发', '布鲁克的轿车', '酒吧',
       '布鲁克的顶层豪华公寓', '梅的轿车', '艾莉家的办公房', '动物园',
       '鞋店 / 办公室', '凯特的客厅', '更衣室 / 动物园',
       '走廊 / 动物园', '洛克斐洛先生的笼子', '艾莉的厨房',
       '厨房', '审讯室', '审讯室 # 2',
       '审讯室 # 3', '审讯室 # 4', '客厅',
       '地下室', '大学', '警方发布会会议厅',
       '祖儿的墓前 / 墓地', '墓地', '墓地 / 小径',
       '艾莉的家', '车店', '凯特的轿车', '街上', '艾莉的睡房',
       '溜冰场', '长廊', '医院走廊', '艾莉家走廊',
       '警局 / 梅森的办公室', '梅森的办公室', '走廊',
       '厂房办公室', '发布会', '医院 / 凯特的病房',
       '凯特的病房', '凯特的病房 / 医院', '医院',
       '停车场', '大教堂', '教堂', '教堂走廊的楼梯',
       '警局科技分析室', '艾莉的客厅', '布鲁克的公寓',
       '卫生间窗户', '卫生间', '隔音间',
       '走廊 / 后现代拱廊', '后现代拱廊', '不可描述之地',
       '梅森队长的轿车', '艾莉的物业', '梅森的病房']

en2zh_Location = dict(zip(Location_en, Location_zh))
print(en2zh_Location)

['GYM' None 'ALLISON’S VERANDA / GREENHOUSE' 'STEPS / STREET'
 'ALLISON’S LIVING ROOM' 'ALLISON’S HOUSE / DRIVEWAY' 'ZOEY’S LOFT'
 'ALLISON’S HOUSE' 'ALLISON’S PORCH' 'BROOKE’S CAR' 'BAR'
 'BROOKE PENTHOUSE' 'MEI’S CAR' 'ALLISON’S HOME OFFICE' 'ZOO'
 'SHOES STORE / OFFICE' 'KATE’S LIVING ROOM' 'LOCKER ROOM / ZOO'
 'HALLWAY / ZOO' 'MISTER ROCKEFELLER’S CAGE' 'ALLISON’S KITCHEN' 'KITCHEN'
 'ERROGATION ROOM' 'ERROGATION ROOM # 2' 'ERROGATION ROOM # 3'
 'ERROGATION ROOM # 4' 'LIVINGROOM' 'BASEMENT' 'COLLEGE'
 'POLICE PRESS CONFERENCE' 'ZOEY’S GRAVE / CEMETARY' 'CEMETARY'
 'CEMETARY / PATHWAY' 'ALLISON’S HOME' 'CAR DEALER' 'KATE’S CAR' 'STREET'
 'ALLISON’S BEDROOM' 'ICE RINK' 'CORRIDOR' 'HOSPITAL HALLWAY'
 'ALLISON’S HALLWAY' 'POLICE STATION / MASON’S OFFICE' 'MASON’S OFFICE'
 'HALLWAY' 'WAREHOUSE OFFICE' 'PRESS CONFERENCE' 'HOSPITAL / KATE’S ROOM'
 'KATE’S HOSPITAL ROOM' 'KATE’S ROOM / HOSPITAL' 'HOSPITAL' 'PARKING LOT'
 'CATHEDRAL' 'CHURCH' 'CHURCH HALLWAY STAIRS' 'POLICE TECH LAB'
 'ALLI

In [70]:
# Apply Location Translation to Location
dfsc['Location_{}'.format(tgt_lang)] = None
dfsc.loc[~dfsc['Location'].isna(), 'Location_{}'.format(tgt_lang)] = \
  dfsc.loc[~dfsc['Location'].isna(), 'Location'].apply(
      lambda x: en2zh_Location[x]
  )

# Apply Location Translation to Element
def translate_location(x):
  Location_en_tmp = [l for l in Location_en if l]
  ls_locations = re.findall('|'.join(Location_en_tmp), 
                        x, re.IGNORECASE
  )
  for location in ls_locations:
    x = re.sub('(?:(?<=\s)|^)' + location, 
               en2zh_Location[location.upper()], 
               x
              )
  return x

dfsc['Element_{}'.format(tgt_lang)] = \
  dfsc['Element_{}'.format(tgt_lang)].apply(translate_location)

In [85]:
dfsc.loc[dfsc['Grp'] == 'H', 'IE_{}'.format(tgt_lang)].fillna(
    '', inplace=True)
dfsc.loc[dfsc['Grp'] == 'H', 'Location_{}'.format(tgt_lang)].fillna(
    '', inplace=True)
dfsc.loc[dfsc['Grp'] == 'H', 'Time_{}'.format(tgt_lang)].fillna(
    '', inplace=True)

In [86]:
# Apply IE, Time and Location Translation to Element_{}
dfsc.loc[dfsc['Grp'] == 'H', 'IE_{}'.format(tgt_lang)].fillna(
    '', inplace=True)
dfsc.loc[dfsc['Grp'] == 'H', 'Location_{}'.format(tgt_lang)].fillna(
    '', inplace=True)
dfsc.loc[dfsc['Grp'] == 'H', 'Time_{}'.format(tgt_lang)].fillna(
    '', inplace=True)

dfsc.loc[dfsc['Grp'] == 'H', 'Element_{}'.format(tgt_lang)] = \
  dfsc.loc[dfsc['Grp'] == 'H', 'IE_{}'.format(tgt_lang)].apply(
      lambda x: x + '. ') + \
  dfsc.loc[dfsc['Grp'] == 'H', 'Location_{}'.format(tgt_lang)] + \
  dfsc.loc[dfsc['Grp'] == 'H', 'Time_{}'.format(tgt_lang)].apply(
      lambda x: (' - ' + x) if x else '')        

In [90]:
ls_types = ['Action', 'Dialogue', 
            'Parenthetical_Dialogue', 'Parenthetical_Character'
            ]

dfsc.loc[dfsc['Type'].isin(ls_types), 'Element_{}'.format(tgt_lang)]

1           The last few People spill out into the night.
2       Only a Young Woman in her early 20’s, EMILY, r...
3                                             LOCKER ROOM
4       Emily is alone, busy in conversation on her SM...
5       She glances around, it’s still empty. A naught...
                              ...                        
1955                                   A message from 诺亚:
1956                     “Martyrs dies with admirations”.
1957                                            FADE OUT.
1958    SUPER, OVER BLACK, TYPED OUT AS IF ON COMPUTER...
1959                                      TO BE CONTINUED
Name: Element_zh, Length: 1408, dtype: object

In [93]:
dfsc['Type'].unique()

array(['Scene Heading', 'Action', 'Character', 'Parenthetical_C',
       'Parenthetical_D', 'Dialogue', 'None'], dtype=object)