In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import warnings
# Silence every warning category for the rest of the notebook
# (note: this also hides library deprecation warnings).
warnings.simplefilter('ignore')
# Seed NumPy's legacy global RNG so any random draws repeat between runs.
np.random.seed(seed=42)

In [2]:
# Load the cleaned comics data (path is relative to the notebook's working directory).
comics = pd.read_csv(filepath_or_buffer='../data/cleaned_data/comics')

In [3]:
# Load the cleaned Twilight Zone data (path is relative to the notebook's working directory).
zone = pd.read_csv(filepath_or_buffer='../data/cleaned_data/zone')

##### Deliberately used each dataset separately to showcase the frequency of various lemmas in each dataset

In [4]:
# Count missing values per column in the comics frame.
# Both dataframes contain some nulls, most likely left behind when the 'remove' lemma was deleted.
comics.isnull().sum()

Unnamed: 0      0
author          0
subreddit       0
lems          171
title_lems      0
dtype: int64

In [5]:
# Count missing values per column in the Twilight Zone frame.
zone.isnull().sum()

Unnamed: 0      0
author          0
subreddit       0
lems          121
title_lems      0
dtype: int64

In [6]:
# Based on subject knowledge, impute missing lemma text with a common term for this subreddit.
# The fill is restricted to 'lems' (the only column reporting nulls above) so a stray NaN in
# any other column is never silently replaced with the string 'comic'; plain assignment
# instead of inplace=True keeps the cell safe to re-run.
# NOTE(review): every imputed row adds one 'comic' document to the frequency tables below.
comics['lems'] = comics['lems'].fillna('comic')

In [7]:
# Impute missing lemma text with a common term for this subreddit.
# The fill is restricted to 'lems' (the only column reporting nulls above) so a stray NaN in
# any other column is never silently replaced with the string 'twilight'; plain assignment
# instead of inplace=True keeps the cell safe to re-run.
# NOTE(review): every imputed row adds one 'twilight' document to the frequency tables below.
zone['lems'] = zone['lems'].fillna('twilight')

In [8]:
# Re-check the comics frame: every column should now report zero nulls.
comics.isnull().sum()

Unnamed: 0    0
author        0
subreddit     0
lems          0
title_lems    0
dtype: int64

In [9]:
# Re-check the Twilight Zone frame: every column should now report zero nulls.
zone.isnull().sum()

Unnamed: 0    0
author        0
subreddit     0
lems          0
title_lems    0
dtype: int64

_Comparing Twilight Zone Phrases with Comic Book Phrases_

In [10]:
# Phrase-level vectorizers (3- to 5-word n-grams): one count / TF-IDF pair per subreddit.
# ngram_range is given as a (min_n, max_n) tuple, the form scikit-learn documents; releases
# that validate constructor parameters reject a list such as [3, 5].
# All four vectorizers share the same settings, so they are declared once.
phrase_params = dict(ngram_range=(3, 5), max_features=10_000, stop_words='english')
cvec_comic_phrase = CountVectorizer(**phrase_params)
tvec_comic_phrase = TfidfVectorizer(**phrase_params)
cvec_zone_phrase = CountVectorizer(**phrase_params)
tvec_zone_phrase = TfidfVectorizer(**phrase_params)

In [11]:
# Learn the phrase vocabulary from the comics 'lems' column and build both
# document-term matrices (raw counts first, then TF-IDF weights).
cvec_comics, tvec_comics = (
    vectorizer.fit_transform(comics['lems'])
    for vectorizer in (cvec_comic_phrase, tvec_comic_phrase)
)

In [12]:
# Learn the phrase vocabulary from the Twilight Zone 'lems' column and build both
# document-term matrices (raw counts first, then TF-IDF weights).
cvec_zone, tvec_zone = (
    vectorizer.fit_transform(zone['lems'])
    for vectorizer in (cvec_zone_phrase, tvec_zone_phrase)
)

In [13]:
# Dense document-term frames for the comics phrases: one row per document, one column per n-gram.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
# Raw phrase counts
cvec_comics_df = pd.DataFrame(cvec_comics.toarray(), columns=cvec_comic_phrase.get_feature_names_out())
# TF-IDF weights
tvec_comics_df = pd.DataFrame(tvec_comics.toarray(), columns=tvec_comic_phrase.get_feature_names_out())

In [14]:
# Dense document-term frames for the Twilight Zone phrases: one row per document, one column per n-gram.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
# Raw phrase counts
cvec_zone_df = pd.DataFrame(cvec_zone.toarray(), columns=cvec_zone_phrase.get_feature_names_out())
# TF-IDF weights
tvec_zone_df = pd.DataFrame(tvec_zone.toarray(), columns=tvec_zone_phrase.get_feature_names_out())

In [15]:
# Twilight Zone: total raw count of each phrase across all documents, ten largest shown.
cvec_counts = cvec_zone_df.sum()
cvec_counts.sort_values(ascending=False).head(10)

twilight zone episode         109
http audioboom com             54
wikipedia org wiki             51
en wikipedia org               51
http en wikipedia              51
http en wikipedia org          51
http en wikipedia org wiki     51
en wikipedia org wiki          51
episode info http en           40
imdb opening narration         40
dtype: int64

In [16]:
# Twilight Zone: summed TF-IDF weight of each phrase across all documents, ten largest shown.
tvec_counts = tvec_zone_df.sum()
tvec_counts.sort_values(ascending=False).head(10)

twilight zone episode         16.840045
new twilight zone              7.714732
episode twilight zone          7.155298
like twilight zone             6.525209
original twilight zone         6.007025
http en wikipedia org wiki     5.951970
http en wikipedia              5.951970
http en wikipedia org          5.951970
en wikipedia org wiki          5.951970
en wikipedia org               5.951970
dtype: float64

##### Key phrases in The Twilight Zone texts should prove useful for categorization

In [17]:
# Comics: total raw count of each phrase across all documents, ten largest shown.
cvec_counts = cvec_comics_df.sum()
cvec_counts.sort_values(ascending=False).head(10)

reddit com comicbooks                238
www reddit com comicbooks            237
http www reddit                      237
www reddit com                       237
http www reddit com comicbooks       237
http www reddit com                  237
com comicbooks comment               230
reddit com comicbooks comment        230
www reddit com comicbooks comment    229
utm_source share amp utm_medium      189
dtype: int64

In [18]:
# Comics: summed TF-IDF weight of each phrase across all documents, ten largest shown.
tvec_counts = tvec_comics_df.sum()
tvec_counts.sort_values(ascending=False).head(10)

amazing spider man                8.365159
http imgur com                    7.881666
amp auto webp                     6.431498
http preview redd                 6.431498
amp auto webp amp                 6.431498
auto webp amp                     6.431498
reddit com comicbooks             5.462987
www reddit com comicbooks         5.428264
http www reddit com comicbooks    5.428264
http www reddit com               5.428264
dtype: float64

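##### Key phrases in the Comic Book texts appear to be mostly URL fragments (reddit, imgur, and image-preview links); however, title phrases such as 'amazing spider man' (the top TF-IDF phrase above), along with Ultimate Spider Man and Cosmic Ghost Rider, are solid classification leads.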

_Comparing Twilight Zone Words and Comic Book Words_

In [19]:
# Word-level vectorizers (single words and two-word n-grams): one count / TF-IDF pair per subreddit.
# ngram_range is given as a (min_n, max_n) tuple, the form scikit-learn documents; releases
# that validate constructor parameters reject a list such as [1, 2].
# All four vectorizers share the same settings, so they are declared once.
word_params = dict(ngram_range=(1, 2), max_features=5000, stop_words='english')
cvec_comic_word = CountVectorizer(**word_params)
tvec_comic_word = TfidfVectorizer(**word_params)
cvec_zone_word = CountVectorizer(**word_params)
tvec_zone_word = TfidfVectorizer(**word_params)

In [20]:
# Learn the word/bigram vocabulary from the comics 'lems' column and build both
# document-term matrices (raw counts first, then TF-IDF weights).
cvec_comics_words, tvec_comics_words = (
    vectorizer.fit_transform(comics['lems'])
    for vectorizer in (cvec_comic_word, tvec_comic_word)
)

In [21]:
# Learn the word/bigram vocabulary from the Twilight Zone 'lems' column and build both
# document-term matrices (raw counts first, then TF-IDF weights).
cvec_zone_words, tvec_zone_words = (
    vectorizer.fit_transform(zone['lems'])
    for vectorizer in (cvec_zone_word, tvec_zone_word)
)

In [22]:
# Dense document-term frames for the comics words/bigrams: one row per document, one column per term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
# Raw term counts
cvec_comics_df_words = pd.DataFrame(cvec_comics_words.toarray(), columns=cvec_comic_word.get_feature_names_out())
# TF-IDF weights
tvec_comics_df_words = pd.DataFrame(tvec_comics_words.toarray(), columns=tvec_comic_word.get_feature_names_out())

In [23]:
# Dense document-term frames for the Twilight Zone words/bigrams: one row per document, one column per term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
# Raw term counts
cvec_zone_df_words = pd.DataFrame(cvec_zone_words.toarray(), columns=cvec_zone_word.get_feature_names_out())
# TF-IDF weights
tvec_zone_df_words = pd.DataFrame(tvec_zone_words.toarray(), columns=tvec_zone_word.get_feature_names_out())

In [24]:
# Twilight Zone: total raw count of each word/bigram across all documents, ten largest shown.
cvec_counts = cvec_zone_df_words.sum()
cvec_counts.sort_values(ascending=False).head(10)

episode          2000
wa               1068
like              787
just              782
twilight          743
zone              698
twilight zone     613
time              475
think             435
ha                405
dtype: int64

In [25]:
# Twilight Zone: summed TF-IDF weight of each word/bigram across all documents, ten largest shown.
tvec_counts = tvec_zone_df_words.sum()
tvec_counts.sort_values(ascending=False).head(10)

twilight         148.221073
episode           62.118515
wa                41.703851
zone              33.029435
just              31.630621
like              30.307026
twilight zone     29.822896
know              22.290853
think             22.189892
time              22.006632
dtype: float64

##### Keywords such as 'episode' and 'twilight' should prove useful as classification predictors. Also, models should explicitly set `ngram_range` to cover only one- and two-word n-grams.

In [26]:
# Comics: total raw count of each word/bigram across all documents, ten largest shown.
cvec_counts = cvec_comics_df_words.sum()
cvec_counts.sort_values(ascending=False).head(10)

comic    1434
wa        700
amp       695
http      691
like      644
book      624
just      542
com       530
read      511
know      422
dtype: int64

In [27]:
# Comics: summed TF-IDF weight of each word/bigram across all documents, ten largest shown.
tvec_counts = tvec_comics_df_words.sum()
tvec_counts.sort_values(ascending=False).head(10)

comic      228.656371
removed     99.067142
wa          43.282156
read        41.382454
like        41.092841
book        37.686402
just        36.966652
know        33.891183
amp         32.668809
story       29.320749
dtype: float64

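##### Keywords such as 'comic', 'book', and 'read' should prove useful as classification predictors; 'wa' (apparently a lemmatization artifact of 'was') also ranks highly, but it does so in both subreddits and is therefore unlikely to separate them. Also, models should explicitly set `ngram_range` to cover only one- and two-word n-grams.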