In [1]:
import requests
import re
from bs4 import BeautifulSoup as BS
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from pprint import pprint

In [3]:
url = r'https://erikamentari.wordpress.com/2018/02/27/jre-1070-jordan-peterson-transcript/'

content = requests.get(url).text
soup = BS(content, 'lxml')
paragraphs = soup.find_all('p')
paragraphs = [p.get_text().replace('Joe Rogan: ', 'Joe: ').replace('Dr Jordan B Peterson: ', "Jordan: ")\
              for p in paragraphs[8:-10]]
paragraph_text = ' '.join(paragraphs)
print('Paragraphs:', len(paragraphs))
print('Words:', len(paragraph_text))

Paragraphs: 611
Words: 163379


In [6]:
rogan_p = [p.replace('Joe:', '').strip() for p in paragraphs if p.startswith('Joe:')]
peterson_p = [p.replace('Jordan:', '').strip() for p in paragraphs if p.startswith('Jordan:')]
unlabeled = [p for p in paragraphs if not p.startswith('Joe:') and not p.startswith('Jordan:')]
rogan_p.insert(1, unlabeled.pop(0))
print('Rogan paragraph count:', len(rogan_p))
print('Peterson paragraph count:', len(peterson_p))
print('Unassigned text:', unlabeled)

Rogan paragraph count: 306
Peterson paragraph count: 304
Unassigned text: ['18:18']


In [7]:
rogan_words = ' '.join(rogan_p).split(' ')
peterson_words = ' '.join(peterson_p).split(' ')
rogan_count, peterson_count = len(rogan_words), len(peterson_words)
total_count = rogan_count + peterson_count
rogan_percent = rogan_count / total_count
peterson_percent = peterson_count / total_count
print('Total word count:', total_count)
print('Rogan word count:', rogan_count)
print('Peterson word count:', peterson_count)
print('% of conversation (Rogan): {}%'.format(round(100* rogan_percent, 2)))
print('% of conversation (Peterson): {}%'.format(round(100 * peterson_percent, 2)))

Total word count: 29119
Rogan word count: 6736
Peterson word count: 22383
% of conversation (Rogan): 23.13%
% of conversation (Peterson): 76.87%


In [17]:
common_rogan = Counter(rogan_words).most_common()
common_peterson = Counter(peterson_words).most_common()
print('Most common words (Rogan)\n\n' + str(common_rogan[:25]))
print('\nMost common words (Peterson)\n\n' + str(common_peterson[:25]))

Most common words (Rogan)

[('to', 187), ('of', 182), ('the', 178), ('I', 170), ('and', 157), ('a', 155), ('that', 145), ('you', 127), ('is', 126), ('this', 91), ('in', 87), ('people', 65), ('And', 64), ('what', 59), ('have', 59), ('it', 56), ('about', 53), ('think', 51), ('was', 50), ('are', 47), ('you’re', 42), ('not', 42), ('it’s', 41), ('with', 38), ('one', 35)]

Most common words (Peterson)

[('the', 868), ('to', 614), ('and', 524), ('you', 517), ('of', 498), ('a', 493), ('that', 466), ('I', 430), ('is', 291), ('in', 262), ('And', 244), ('it', 219), ('it’s', 195), ('like,', 189), ('that’s', 181), ('It’s', 174), ('what', 168), ('they', 156), ('have', 149), ('so', 144), ('was', 135), ('for', 134), ('people', 133), ('be', 128), ('do', 125)]


In [42]:
rogan_corpus = ' '.join(rogan_words)
peterson_corpus = ' '.join(peterson_words)
rogan_word_toke, rogan_p_toke = word_tokenize(rogan_corpus), sent_tokenize(rogan_corpus)
peterson_word_toke, peterson_p_toke = word_tokenize(peterson_corpus), sent_tokenize(peterson_corpus)
print(len(rogan_word_toke))
print(len(peterson_word_toke))

7649
25981


In [43]:
stop_words = stopwords.words('english')
rogan_word_toke = [w for w in rogan_word_toke if w.lower() not in stop_words]
peterson_word_toke = [w for w in peterson_word_toke if w not in stop_words]

In [35]:
print(len(rogan_word_toke))
print(len(peterson_word_toke))
print(Counter(rogan_word_toke).most_common(25))
print(Counter(peterson_word_toke).most_common(25))

4161
14958
[('.', 446), (',', 368), ('people', 73), ('?', 62), ('think', 53), ('like', 45), ('you’re', 42), ('it’s', 41), ('one', 35), ('things', 30), ('that’s', 28), ('don’t', 28), (';', 25), ('I’m', 24), ('Yeah', 24), ('Right', 23), ('know', 23), ('Yes', 22), ('get', 22), ('going', 22), ('mean', 21), ('It’s', 21), ('way', 20), ('saying', 18), ('right', 17)]
[(',', 1853), ('.', 1345), ('like', 308), ('it’s', 196), ('?', 191), ('that’s', 182), ('It’s', 175), ('well', 165), (';', 163), ('people', 156), ('know', 150), ('think', 120), ('you’re', 102), ('Yeah', 99), ('Well', 97), ('going', 94), ('don’t', 85), ('I’m', 84), ('right', 81), ('say', 80), ('things', 70), ('there’s', 66), ('way', 65), ('want', 65), ('That’s', 59)]


In [72]:
import string

print(string.punctuation)
remove_punct = str.maketrans(string.punctuation, ' '*len(string.punctuation))
a = 'abdlkja;c,3dk!.ejf}]dk'.translate(remove_punct)
#a = 'abdlkja;c,3dk!.ejf}]dk'.remove(string.punctuation)
a

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


'abdlkja c 3dk  ejf  dk'