In [1]:
# Basic functionalities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
import pickle

# options
pd.set_option('max_colwidth',150)

In [2]:
# Text encoding
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Text Processing
from nltk import word_tokenize, pos_tag
import nltk
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# nltk.download('wordnet')
# nltk.set_proxy('SYSTEM PROXY')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [4]:
# Sentiment analysis
from textblob import TextBlob as tb

In [5]:
# Topic Modeling
from gensim import matutils, models
from gensim.models import LsiModel
import scipy.sparse



In [6]:
# Keyword Extraction
from summa.summarizer import summarize
from summa import keywords

In [7]:
# Progress bar
from tqdm import tqdm 

# Retrieve Data

## Data from google drive

In [8]:
df = pd.read_csv('transcript_paragraph_needed.csv')
df = df[["comedian","transcript"]]
df.columns = ["Comedian","Transcript"]
# get corpus
corpus = df[['Comedian','Transcript']].set_index('Comedian')
corpus

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
andy woodhull,she told me she painted some clear cuz it makes her fingernail shiny and she likes them when to be shiny and i support her in that decision i want...
andy woodhull,we were on a road trip one time and she fell asleep i said honey you got to wake up im sleepy too and we need to talk to me she said i am not asle...
andy woodhull,i mention my girls already theyre my theyre my stepdaughters im a stepdad stepdad took over the lease on a couple of girls few years back thank yo...
andy woodhull,coopers you love the old bo you love the old bum forever cuz thats the boat let you know that you enjoy being the captain of a ship but if there w...
andy woodhull,my girls are teenagers now and got a couple teenagers at home and theyre really embracing it and every time i tell people i have teenage daughte...
...,...
dave chappelle,lets not forget lets not forget ive never met bill cosby so im not defending him lets just remember that he has a valuable legacy that i cant just...
dave chappelle,ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghet...
dave chappelle,ahah ahah in every ghetto ahah revolution ahah ahah ahah in every ghetto one last thing before you go i just wanted to ackn...
dave chappelle,can i kick it yes you can can i kick it yes you can can i kick it yes you can well im gone go on then can i kick...


#### Clean the data

In [9]:
def apply_data_cleansing(text):
    # Lowercase every words
    text = text.lower()
    # Remove every words with [blah blah blah] format
    text = re.sub('\[.*?\]', '', text)
    # Remove every words with (blah blah blah) format
    text = re.sub('\(.*?\)', '', text)
    # Get rid of the punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Get rid of all the numbers or words that contain numbers
    text = re.sub('\w*\d\w*', '', text)
    # Get rid of these specific punctuations
    text = re.sub('[‘’“”…]', '', text)
    # Get rid of '\n'
    text = re.sub('\n', '', text)
    
    # Tokenizes and lemmetizes (or stems) them
    tokenized = word_tokenize(text)
    stemmed = [porter_stemmer.stem(t) for t in tokenized]
#     lemmetized = [wordnet_lemmatizer.lemmatize(t) for t in tokenized]
    return " ".join(stemmed)

In [10]:
clean_corpus = pd.DataFrame(corpus.Transcript.apply(apply_data_cleansing))
clean_corpus_index = clean_corpus.reset_index()
clean_corpus_index

Unnamed: 0,Comedian,Transcript
0,andy woodhull,she told me she paint some clear cuz it make her fingernail shini and she like them when to be shini and i support her in that decis i want them t...
1,andy woodhull,we were on a road trip one time and she fell asleep i said honey you got to wake up im sleepi too and we need to talk to me she said i am not asle...
2,andy woodhull,i mention my girl alreadi theyr my theyr my stepdaught im a stepdad stepdad took over the leas on a coupl of girl few year back thank you veri muc...
3,andy woodhull,cooper you love the old bo you love the old bum forev cuz that the boat let you know that you enjoy be the captain of a ship but if there wa a hur...
4,andy woodhull,my girl are teenag now and got a coupl teenag at home and theyr realli embrac it and everi time i tell peopl i have teenag daughter at home theyr ...
...,...,...
2835,dave chappelle,let not forget let not forget ive never met bill cosbi so im not defend him let just rememb that he ha a valuabl legaci that i cant just throw awa...
2836,dave chappelle,ahah ahah in everi ghetto ahah ahah ahah in everi ghetto ahah ahah ahah in everi ghetto ahah ahah ahah in everi ghetto wait wait wait wait wait i ...
2837,dave chappelle,ahah ahah in everi ghetto ahah revolut ahah ahah ahah in everi ghetto one last thing befor you go i just want to acknowledg for the real comedi fa...
2838,dave chappelle,can i kick it ye you can can i kick it ye you can can i kick it ye you can well im gone go on then can i kick it to my tribe that flow in layer ri...


In [11]:
df = pd.DataFrame(clean_corpus_index.Comedian)

In [12]:
# Pickles the result for later usage
clean_corpus_index.to_pickle('clean_corpus_index.pkl')

# Analyize the transcripts

#### Text Encoding

In [13]:
def dtm(corpus, stop_words='english', pickle_filename="", save_pickle=False):
    cv = CountVectorizer(stop_words=stop_words)
    data_cv = cv.fit_transform(corpus.Transcript)
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = corpus.index
    if save_pickle:
        pickle.dump(cv, open(pickle_filename, "wb"))        
    return data_dtm.transpose()

def tf_idf(corpus, stop_words='english', pickle_filename="", save_pickle=False):
    vectorizer = TfidfVectorizer(stop_words='english')
    data_v = vectorizer.fit_transform(corpus.Transcript)
    data_tfidf = pd.DataFrame(data_v.toarray(), columns=vectorizer.get_feature_names())
    data_tfidf.index = corpus.index
    if save_pickle:
        pickle.dump(vectorizer, open(pickle_filename, "wb"))        
    return data_tfidf.transpose()

#### Top words

In [14]:
def get_top_words(matrix_dataframe, number_of_words = 30):
    top_dict = {}
    for c in matrix_dataframe.columns:
        top = matrix_dataframe[c].sort_values(ascending=False).head(number_of_words)
        top_dict[c]= list(zip(top.index, top.values))
    return top_dict

In [15]:
# Get all kinds of corpus
clean_corpus = pd.read_pickle('clean_corpus_index.pkl')

all_corpuses = [clean_corpus]

In [16]:
df_dtms = []
dtm_tops = []
for corpus in all_corpuses:
    # Document-Term Matrix
    df_dtm = dtm(corpus)
    # Get top words
    dtm_top = get_top_words(df_dtm)
    
    df_dtms.append(df_dtm)
    dtm_tops.append(dtm_top)

Since there are too many words that are common among all the transcript, we need to add them to the stop-word list and redo the text encoding again.

#### Extract all common words among the transcript

In [17]:
commons = []
for c in df.columns:
    words = []
    for r in df[c]:
        top = [word for word in r]
        for t in top:
            words.append(t)    
    commons.append(Counter(words).most_common())

In [18]:
common_words_list = []
for common in commons:
    new = [word for word, count in common if count > 6]
    common_words_list.append(np.array(new))
common_words_list

[array(['e', ' ', 'a', 'r', 'i', 'n', 'l', 's', 'o', 'h', 'c', 'm', 't',
        'd', 'y', 'j', 'g', 'p', 'u', 'b', 'f', 'k', 'v', 'w', 'z', '.',
        '’'], dtype='<U1')]

In [19]:
pickle.dump(common_words_list, open('common_words_list_index.pkl', 'wb'))

# Keyword Extraction

In [24]:
clean_corpus = pd.read_pickle('clean_corpus_index.pkl')

In [25]:
def merge_transcripts(df, filename):
    """
    :param df: transcript/comedians dataframe.
    :param filename: the csv file to save for the comedian/keywords data
    """
    merged_df = df.groupby('Comedian').agg(lambda t: " ".join(t))
    r_df = pd.DataFrame(columns=['Comedian','Keywords'])
    for c in tqdm(merged_df.index, desc="Extracting..."):
        text = merged_df.loc[c][0]
        kw = keywords.keywords(text)
        r_df = r_df.append({'Comedian': c, "Keywords":", ".join(kw.split('\n'))}, ignore_index=True)
    r_df.to_excel(filename)       
    return r_df 


In [26]:
common_words_list = pickle.load(open('common_words_list_index.pkl', 'rb'))
# Additional stop words
common_words_list.append(np.array(['im','youre','theyre','we','hes','shes','yeah','uh','ill','hell',
                          'shell','theyll','well']))
common_words_list.append(np.array([i for i in 'qwertyuiopasdfghjklzxcvbnm']))

In [27]:
stop_words_list = []
for common_words in common_words_list:
    stop_words = text.ENGLISH_STOP_WORDS.union(common_words)
    stop_words_list.extend(list(stop_words))

In [28]:
def remove_stopwords(text):    
    # split the text
    wd = text.split(' ')
    # remove stop words
    resultwords  = [word for word in wd if word.lower() not in stop_words_list]
    return' '.join(resultwords)

In [29]:
clean_corpus.index = clean_corpus.Comedian
clean_corpus = pd.DataFrame(clean_corpus.Transcript.apply(remove_stopwords))

In [30]:
clean_corpus.reset_index()

Unnamed: 0,Comedian,Transcript
0,andy woodhull,told paint clear cuz make fingernail shini like shini support decis want shini just feel like achiev result order popcorn movi theater larg popcor...
1,andy woodhull,road trip time fell asleep said honey got wake sleepi need talk said asleep just rest eye air throat whi mouth open mile start chew becaus gummi b...
2,andy woodhull,mention girl alreadi theyr theyr stepdaught stepdad stepdad took leas coupl girl year thank veri step parent ani broken home ahead abl come step s...
3,andy woodhull,cooper love old bo love old bum forev cuz boat let know enjoy captain ship wa hurrican onli time save boat harbor know pickit lot pressur steppar ...
4,andy woodhull,girl teenag got coupl teenag home theyr realli embrac everi time tell peopl teenag daughter home theyr alway like aha good luck realli tough just ...
...,...,...
2835,dave chappelle,let forget let forget ive met cosbi defend let just rememb ha valuabl legaci just throw away rememb black man win emmi televis rememb guy make car...
2836,dave chappelle,ahah ahah everi ghetto ahah ahah ahah everi ghetto ahah ahah ahah everi ghetto ahah ahah ahah everi ghetto wait wait wait wait wait forgot fourth ...
2837,dave chappelle,ahah ahah everi ghetto ahah revolut ahah ahah ahah everi ghetto thing befor just want acknowledg real comedi fan acknowledg lost fuck juggernaut c...
2838,dave chappelle,kick ye kick ye kick ye gone kick tribe flow layer right phife poem sayer time studio conveyor mr dinkin pleas mayor youll realli big favor boy th...


In [31]:
merge_transcripts(clean_corpus, 'keywords_clean.xlsx')

Extracting...: 100%|███████████████████████████████████████████████████████████████████| 40/40 [09:42<00:00, 14.55s/it]


Unnamed: 0,Comedian,Keywords
0,amy schumer,"right peopl hand wa like okay fuck, thi, guy, dont, theyr, um, think, caus, oh, place question everyon know, love, time, becaus, alway want, got, ..."
1,andy woodhull,"thi, like, want, know, thing, late everyth time wife wa, think gon na, goe dont, snake, theyr, shower said whi, becaus, stuff kid, home, come, gir..."
2,arsenio hall,"know wa, shit, fuck, motherfuck, thi, like gave, got, said, black dont, right, hi, love, use, look smile, phone, caus thought saw, want, ye, oh, s..."
3,aziz ansari,"excus thi, thing, seen stuff gener like video, bottl wa happen, realli, know problem, think, fuck, look, say, situat peopl, littl kid, okay, caus ..."
4,bert kreischer,"goe, wa, room peopl like dont, got, way thi, wife, right, want, fuck dick, dad, think, start, didnt know, ila, men look, come, walk, oh, mom, day ..."
5,bill burr,"right thi, like realli, wa, know ani, come, la, non penso, fuckin, becom fuck everyth goddamn, le, prima squadra di, guy, una cosa, che hanno dovu..."
6,chris d’elia,"fuck, dont, wa, right thi awesom, alway drunk time like, girl, dude, say, look, hey know excus, yo, shit, love, caus, way, real, thing, relax okay..."
7,chris rock,"shit, like, suck fuck, dont, peopl, man drug, right hour, smoke crack got hi job want know wa, say, check thi, look, make, white, good, love, talk..."
8,dave chappelle,"ens, wa fuck cooki, dejar la, una, lo mismo, hora que, said, got, nobodi els, man, di, al, por, thi black guy broke, know job, time, motherfuck, n..."
9,eric andre,"wa, fuck, right, stole thi, day like smoke, know, hi, shit, man legal, guy want, got weed, oh, start, dont look eat, littl, went, parent, text, th..."
