<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#General-preprocessing:" data-toc-modified-id="General-preprocessing:-1">General preprocessing:</a></span></li><li><span><a href="#Preprocessing-for-&quot;all-characters&quot;-network-analysis:" data-toc-modified-id="Preprocessing-for-&quot;all-characters&quot;-network-analysis:-2">Preprocessing for "all characters" network analysis:</a></span></li></ul></div>

In [2]:
import numpy as np
import pandas as pd
import glob
import re
import nltk

## General preprocessing:

In [25]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the episode order (and therefore the row order of the combined table) is
# the same on every run and platform.
files = sorted(glob.glob('../transcripts_friends/season_all/*.csv'))
files[:5]

['../transcripts_friends/season_all\\1001.csv',
 '../transcripts_friends/season_all\\1002.csv',
 '../transcripts_friends/season_all\\1003.csv',
 '../transcripts_friends/season_all\\1004.csv',
 '../transcripts_friends/season_all\\1005.csv']

In [105]:
# Build one combined table of dialogue lines (Speaker, Text, Episode, Scene)
# from the per-episode transcript files listed in `files`.

COLUMNS = ['Speaker', 'Text', 'Episode', 'Scene']

# Speaker labels that are transcript metadata (credits, notes) rather than dialogue.
METADATA_SPEAKERS = [
    'written by',
    'transcribed by',
    'additional transcribing by',
    '(note',
    'directed by',
    'produced by',
    'flashback clips transcribed by',
    'teleplay by',
    'story by',
]


def _read_transcript(path):
    """Read one transcript file into a two-column (Speaker, Text) DataFrame.

    Each line is split on ':'; malformed lines are skipped with a warning
    instead of aborting the read.
    NOTE(review): lines containing more than one ':' are presumably among the
    skipped ones, i.e. such dialogue is lost -- confirm against the raw files.
    """
    read_kwargs = dict(sep=':', names=['Speaker', 'Text'])
    try:
        # pandas >= 1.3 spelling; 'warn' matches the old error_bad_lines=False
        # default of skipping the line while still reporting it.
        return pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # Older pandas does not know `on_bad_lines`.
        return pd.read_csv(path, error_bad_lines=False, **read_kwargs)


episode_frames = []

for path in files:
    df = _read_transcript(path)

    df['Speaker'] = [speaker.lower() for speaker in df['Speaker']]

    # Remove credit/note rows in one pass.
    df = df.drop(df.index[df['Speaker'].isin(METADATA_SPEAKERS)])

    # Drop the first line of the file (index label 0) -- presumably the episode
    # title. errors='ignore' keeps this from raising when that line was already
    # removed above as a metadata row.
    df = df.drop(0, errors='ignore')

    # Episode id = file name without extension. Split on either path separator
    # so this works for any directory and for both '/' and '\\' paths.
    df['Episode'] = re.split(r'[\\/]', path)[-1].split('.')[0]

    df = df.dropna().reset_index(drop=True)
    df['Text'] = [text.lower() for text in df['Text']]

    # Rows whose speaker field contains '[' are scene headers, not dialogue.
    # The index was just reset, so positions and index labels coincide.
    scene_idx = [pos for pos, speaker in enumerate(df['Speaker']) if '[' in speaker]

    if scene_idx:
        # Discard everything before the first scene header and re-base positions.
        first = scene_idx[0]
        df = df[first:].reset_index(drop=True)
        scene_idx = [pos - first for pos in scene_idx]

        # Scene label = header row's speaker field + text field.
        scenes = [df.at[pos, 'Speaker'] + df.at[pos, 'Text'] for pos in scene_idx]

        # Each scene spans from its header up to the next header (or the end).
        bounds = scene_idx + [len(df)]
        scene_labels = []
        for label, start, stop in zip(scenes, bounds[:-1], bounds[1:]):
            scene_labels.extend([label] * (stop - start))
        df['Scene'] = scene_labels

        # The header rows themselves are not dialogue.
        df = df.drop(scene_idx)
    else:
        # No scene markers in this episode: keep the string placeholder 'NaN'
        # (a literal string, not a missing value).
        df['Scene'] = ['NaN'] * len(df)

    episode_frames.append(df)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the accumulated frame on every iteration.
if episode_frames:
    df_total = pd.concat(episode_frames, ignore_index=True)
else:
    df_total = pd.DataFrame(columns=COLUMNS)

In [110]:
# Persist the combined dialogue table, both as a pickle and as a CSV.
# NOTE: the pickle file name ends in '.plk' (sic); a later cell reads the
# pickle back under that same file name.
df_total.to_pickle('../data/All_Friends_data.plk')
df_total.to_csv('../data/All_Friends_data.csv')

## Preprocessing for "all characters" network analysis:

In [19]:
# Reload the combined dialogue table saved by the general-preprocessing step.
# The directory is spelled '../data' (lower case) to match the path the pickle
# was written to; '../Data' only resolves on case-insensitive filesystems.
df = pd.read_pickle('../data/All_Friends_data.plk')

In [20]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Speaker,Text,Episode,Scene
0,monica,"oh, the way you crushed mike at ping pong was...",1001,"[scene barbados, monica and chandler's room. t..."
1,chandler,"you know, i'd love to, but i'm a little tired.",1001,"[scene barbados, monica and chandler's room. t..."
2,monica,i'll put a pillowcase over my head.,1001,"[scene barbados, monica and chandler's room. t..."
3,chandler,you're on!,1001,"[scene barbados, monica and chandler's room. t..."
4,phoebe,hey!,1001,"[scene barbados, monica and chandler's room. t..."


In [21]:
# List of real first names from the NLTK "names" corpus.
try:
    Names = nltk.corpus.names.words()
except LookupError:
    # The corpus is not bundled with nltk; fetch it once so the notebook also
    # runs on a fresh environment, then retry.
    nltk.download('names')
    Names = nltk.corpus.names.words()
len(Names)

7944

In [22]:
# Speaking characters in all episodes
# (excluding some weird ones...).
# Entries are matched as substrings of the cleaned speaker label, so each entry
# also rejects any label that merely contains it.
exclude = ['/',',',' and ',' to ','every','both','all','together','guys',' on ','&','looks','laugh','see','nodes','past'
           ,'tape','tv','video','radio','machine','message','extra','announcer','commercial','walks','will','guy',
           'hope','cookie','ralph','waiter']

# Names is a list; a set makes the per-token membership test O(1).
known_first_names = set(Names)

Characters = []
for speaker in df.Speaker.unique():
    # Cut the label at the first '(' to drop parenthetical remarks.
    k = re.sub(r'\((.*)', '', speaker)

    # Keep only short, non-empty labels...
    if not k or len(k) >= 20:
        continue
    # ...that contain none of the excluded fragments...
    if any(x in k for x in exclude):
        continue
    # ...and have at least one word that is a known first name.
    if not any(x.capitalize() in known_first_names for x in k.split(' ')):
        continue

    if k[-1] == ' ':
        k = k[:-1]  # If last character is a "space"
    Characters.append(k)

In [23]:
# Every name that could be mentioned in dialogue: the speaking characters
# found above plus a few hand-picked nicknames, sorted and de-duplicated.
nicknames = ['ugly naked guy','mon','rach','chan','joe','phoebs','pheebs']
all_Names = list(np.unique(Characters + nicknames))
len(all_Names)

284

In [24]:
# Keep the candidate-name list on disk (CSV and pickle) so it does not have
# to be rebuilt next time.
df_names = pd.DataFrame(data=all_Names)
df_names.to_csv('../data/Dataset_all_potential_characters.csv')
df_names.to_pickle('../data/Dataset_all_potential_characters.pkl')

In [31]:
# Per-row column of the characters present in that row's scene.
# A character counts as present in a scene if they speak in it or if their
# name occurs as a space-separated token in one of its lines. Multi-word
# entries of all_Names (e.g. 'ugly naked guy') can therefore only match as a
# speaker, never in the token scan.
#
# A scene is taken to be a run of consecutive rows sharing the same
# (Episode, Scene) pair, and the result is built in row order. This keeps
# Scene_Characters aligned with df even when the same Scene string shows up
# in several places (for instance the 'NaN' placeholder), and avoids merging
# characters from unrelated episodes.
known_names = set(all_Names)
speakers = df['Speaker'].tolist()
texts = df['Text'].tolist()
scene_keys = list(zip(df['Episode'], df['Scene']))

Scene_Characters = []
start = 0
while start < len(scene_keys):
    # Find the end of the current run of identical (Episode, Scene) keys.
    stop = start + 1
    while stop < len(scene_keys) and scene_keys[stop] == scene_keys[start]:
        stop += 1

    present = set()
    for speaker, text in zip(speakers[start:stop], texts[start:stop]):
        # Characters speaking:
        if speaker in known_names:
            present.add(speaker)
        # Characters being spoken about:
        present.update(token for token in text.split(' ') if token in known_names)

    # One shared, sorted array per scene, repeated for each of its rows.
    Scene_Characters.extend([np.unique(list(present))] * (stop - start))
    start = stop
    

In [32]:
# Attach the per-row scene-character arrays as a new column and preview it.
df = df.assign(Scene_characters=Scene_Characters)
df.head()

Unnamed: 0,Speaker,Text,Episode,Scene,Scene_characters
0,monica,"oh, the way you crushed mike at ping pong was...",1001,"[scene barbados, monica and chandler's room. t...","[chandler, charlie, joey, mike, monica, phoebe..."
1,chandler,"you know, i'd love to, but i'm a little tired.",1001,"[scene barbados, monica and chandler's room. t...","[chandler, charlie, joey, mike, monica, phoebe..."
2,monica,i'll put a pillowcase over my head.,1001,"[scene barbados, monica and chandler's room. t...","[chandler, charlie, joey, mike, monica, phoebe..."
3,chandler,you're on!,1001,"[scene barbados, monica and chandler's room. t...","[chandler, charlie, joey, mike, monica, phoebe..."
4,phoebe,hey!,1001,"[scene barbados, monica and chandler's room. t...","[chandler, charlie, joey, mike, monica, phoebe..."


In [33]:
# Store the enriched table (CSV and pickle) so the scene-character step can be
# skipped on the next run.
df.to_csv('../data/Dataset_with_all_scene_characters.csv')
df.to_pickle('../data/Dataset_with_all_scene_characters.pkl')