# Preprocess 

In [1]:
# Import Libraries
import os
import nltk
import csv
import string
import glob
import numpy as np
import pandas as pd


from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities

## Simpsons Dataset

### Import Data

In [2]:
# Load the Simpsons script lines from the comma-separated dataset file
simpsons_csv_path = './data/simpsons_dataset.csv'
raw_df = pd.read_csv(simpsons_csv_path, sep=',')

# Preview the first ten rows
raw_df.head(10)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
5,Martin Prince,I don't think there's anything left to say.
6,Edna Krabappel-Flanders,Bart?
7,Bart Simpson,Victory party under the slide!
8,,
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!


In [3]:
# Ten characters with the most spoken lines:
# count() tallies the non-null entries per character, then we rank descending.
lines_per_character = raw_df.groupby(['raw_character_text']).count()
lines_per_character.sort_values(by=['spoken_words'], ascending=False).head(10)

Unnamed: 0_level_0,spoken_words
raw_character_text,Unnamed: 1_level_1
Homer Simpson,27850
Marge Simpson,13172
Bart Simpson,12995
Lisa Simpson,10756
C. Montgomery Burns,3077
Moe Szyslak,2808
Seymour Skinner,2385
Ned Flanders,2056
Grampa Simpson,1802
Chief Wiggum,1790


In [4]:
# Keep only the rows spoken by the four main Simpson family members
main_characters = ['Homer Simpson', 'Marge Simpson', 'Bart Simpson', 'Lisa Simpson']
is_main_character = raw_df['raw_character_text'].isin(main_characters)
simpsons_df = raw_df.loc[is_main_character]
simpsons_df.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [5]:
# Pull the two columns out of the filtered frame as parallel arrays:
# the character name is the tag, the spoken line is the text.
simpsons_tags = simpsons_df['raw_character_text'].values
simpsons_words = simpsons_df['spoken_words'].values

### Process Data

In [6]:
def process(p, tokenizer, text):
    """Normalize a piece of text and split it into stemmed tokens.

    The text is lowercased, stripped of stopwords with gensim's
    ``remove_stopwords``, stemmed with ``p.stem_sentence`` and finally
    split with ``tokenizer.tokenize``.

    Args:
        p (gensim.parsing.porter.PorterStemmer): stemmer object.
        tokenizer (nltk.tokenize.regexp.RegexpTokenizer): tokenizer object.
        text (str): text to preprocess. Non-string values (``None`` or the
            float ``NaN`` that pandas uses for missing cells) are treated
            as empty text.

    Returns:
        list: tokens of the preprocessed text; an empty list when ``text``
        is not a string.

    """
    # Missing values are not strings and have no .lower(); report "no tokens"
    # instead of failing with AttributeError in the middle of a batch.
    if not isinstance(text, str):
        return []

    # Converts to lowercase
    normalized = text.lower()

    # Removes stopwords
    without_stopwords = remove_stopwords(normalized)

    # Stems text
    stemmed = p.stem_sentence(without_stopwords)

    # Lemmatizes text
    # TODO: Lemmatizer

    # Returns preprocessed text
    return tokenizer.tokenize(stemmed)

## Friends Dataset

### Import Data

In [14]:
# Import Friends data

# Names of the six main characters whose lines are kept
FRIENDS_MAIN_CHARACTERS = {'Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel'}


def parse_friends_line(line):
    """Extract ``(character, spoken_words)`` from one transcript line.

    Args:
        line (str): raw line, expected in the form ``"<speaker>: <words>"``.

    Returns:
        tuple or None: ``(speaker, words)`` when the speaker is one of
        ``FRIENDS_MAIN_CHARACTERS``; ``None`` for blank lines, lines without
        a ``": "`` separator, and lines spoken by anyone else.
    """
    # Split on the FIRST ": " only, so dialogue that itself contains ": "
    # is kept whole instead of being cut at the second separator.
    speaker, separator, words = line.strip().partition(": ")
    if not separator or speaker not in FRIENDS_MAIN_CHARACTERS:
        return None
    return speaker, words


# Text
friends_words = []
# Tag
friends_tags = []

# Walk the dataset directory and read every file found under it
for dirname, _subdirs, filenames in os.walk('./data/friends_dataset'):
    for filename in filenames:
        with open(os.path.join(dirname, filename), encoding="utf8") as fp:
            # Iterating the file object yields one line at a time until EOF
            for line in fp:
                parsed = parse_friends_line(line)
                if parsed is not None:
                    tag, words = parsed
                    # Append tags and words in lockstep so indices stay aligned
                    friends_words.append(words)
                    friends_tags.append(tag)
                

In [15]:
# Assemble the collected tags and lines into one frame and preview it
friends_df = pd.DataFrame({
    'character': friends_tags,
    'spoken_words': friends_words,
})
friends_df.head(10)

Unnamed: 0,character,spoken_words
0,Monica,There's nothing to tell! He's just some guy I ...
1,Joey,"C'mon, you're going out with the guy! There's ..."
2,Chandler,"All right Joey, be nice. So does he have a hu..."
3,Phoebe,"Wait, does he eat chalk?"
4,Phoebe,"Just, 'cause, I don't want her to go through w..."
5,Monica,"Okay, everybody relax. This is not even a date..."
6,Chandler,Sounds like a date to me.
7,Chandler,"Alright, so I'm back in high school, I'm stand..."
8,Chandler,"Then I look down, and I realize there's a phon..."
9,Joey,Instead of...?


In [16]:
# Number of rows (dialogue lines) in the Friends frame
friends_df.shape[0]

46477

### Process Data