In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import warnings
# Notebook-wide settings: fix the legacy NumPy seed and let pandas display up to 5,000 columns.
np.random.seed(42)
pd.set_option('display.max_columns', 5_000)
# NOTE(review): a blanket 'ignore' filter also hides library deprecation warnings.
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned comics dataset from the project's data folder.
comics = pd.read_csv(filepath_or_buffer='../data/cleaned_data/comics')

In [3]:
# Load the cleaned Twilight Zone dataset from the project's data folder.
zone = pd.read_csv(filepath_or_buffer='../data/cleaned_data/zone')

##### Each dataset is deliberately analyzed separately to showcase the frequency of the various lemmas within it

In [4]:
# Missing values per column for the comics frame.
# Both dataframes contain some nulls, most likely left behind by the deletion of the 'remove' lemma.
comics.isna().sum(axis=0)

Unnamed: 0     0
author         0
subreddit      0
lems          78
title_lems     0
dtype: int64

Unnamed: 0     0
author         0
subreddit      0
lems          78
title_lems     0
dtype: int64

In [5]:
# Missing values per column for the Twilight Zone frame.
zone.isna().sum(axis=0)

Unnamed: 0     0
author         0
subreddit      0
lems          54
title_lems     0
dtype: int64

Unnamed: 0     0
author         0
subreddit      0
lems          54
title_lems     0
dtype: int64

In [6]:
# Based on subject knowledge, impute missing lemma text with a common term for this dataset.
# Only 'lems' is filled: a frame-wide fillna would also write 'comic' into any other column
# that happened to contain a null. Assigning the result (instead of inplace=True) keeps the
# cell safe to re-run.
comics['lems'] = comics['lems'].fillna('comic')

In [7]:
# Impute missing lemma text with a common term for this dataset.
# Only 'lems' is filled: a frame-wide fillna would also write 'twilight' into any other column
# that happened to contain a null. Assigning the result (instead of inplace=True) keeps the
# cell safe to re-run.
zone['lems'] = zone['lems'].fillna('twilight')

In [8]:
# Re-count missing values per column for the comics frame after imputation.
comics.isna().sum(axis=0)

Unnamed: 0    0
author        0
subreddit     0
lems          0
title_lems    0
dtype: int64

Unnamed: 0    0
author        0
subreddit     0
lems          0
title_lems    0
dtype: int64

In [9]:
# Re-count missing values per column for the Twilight Zone frame after imputation.
zone.isna().sum(axis=0)

Unnamed: 0    0
author        0
subreddit     0
lems          0
title_lems    0
dtype: int64

Unnamed: 0    0
author        0
subreddit     0
lems          0
title_lems    0
dtype: int64

_Comparing Twilight Zone Phrases with Comic Book Phrases_

In [10]:
# Phrase-level vectorizers (3- to 5-word n-grams) for the Comic Book and Twilight Zone texts.
# ngram_range is given as a tuple, as the scikit-learn API documents; releases that validate
# parameter types (1.2 and later) reject a list such as [3, 5] when the vectorizer is fitted.
# All four vectorizers share the same settings, so the settings are declared once.
phrase_params = {'ngram_range': (3, 5), 'max_features': 10_000, 'stop_words': 'english'}
cvec_comic_phrase = CountVectorizer(**phrase_params)
tvec_comic_phrase = TfidfVectorizer(**phrase_params)
cvec_zone_phrase = CountVectorizer(**phrase_params)
tvec_zone_phrase = TfidfVectorizer(**phrase_params)

In [11]:
# Fit each phrase vectorizer on the comics 'lems' text and keep the resulting document-term matrices.
comics_lems = comics['lems']
cvec_comics = cvec_comic_phrase.fit_transform(comics_lems)
tvec_comics = tvec_comic_phrase.fit_transform(comics_lems)

In [12]:
# Fit each phrase vectorizer on the Twilight Zone 'lems' text and keep the resulting document-term matrices.
zone_lems = zone['lems']
cvec_zone = cvec_zone_phrase.fit_transform(zone_lems)
tvec_zone = tvec_zone_phrase.fit_transform(zone_lems)

In [13]:
# Dense document-term DataFrames for the comics phrase vectorizers (one column per phrase).
# Column labels come from each fitted vocabulary_ (term -> column index), ordered by index.
# This yields the same labels as get_feature_names(), which was deprecated in scikit-learn 1.0
# and removed in 1.2, so the cell works on both old and new releases.
cvec_comics_df = pd.DataFrame(
    cvec_comics.toarray(),
    columns=sorted(cvec_comic_phrase.vocabulary_, key=cvec_comic_phrase.vocabulary_.get),
)
# Same layout for the TF-IDF weights.
tvec_comics_df = pd.DataFrame(
    tvec_comics.toarray(),
    columns=sorted(tvec_comic_phrase.vocabulary_, key=tvec_comic_phrase.vocabulary_.get),
)

In [14]:
# Dense document-term DataFrames for the Twilight Zone phrase vectorizers (one column per phrase).
# Column labels come from each fitted vocabulary_ (term -> column index), ordered by index.
# This yields the same labels as get_feature_names(), which was deprecated in scikit-learn 1.0
# and removed in 1.2, so the cell works on both old and new releases.
cvec_zone_df = pd.DataFrame(
    cvec_zone.toarray(),
    columns=sorted(cvec_zone_phrase.vocabulary_, key=cvec_zone_phrase.vocabulary_.get),
)
# Same layout for the TF-IDF weights.
tvec_zone_df = pd.DataFrame(
    tvec_zone.toarray(),
    columns=sorted(tvec_zone_phrase.vocabulary_, key=tvec_zone_phrase.vocabulary_.get),
)

In [15]:
# Total raw count of every phrase across the Twilight Zone documents; display the ten largest.
cvec_counts = cvec_zone_df.sum()
cvec_counts.sort_values(ascending=False).iloc[:10]

twilight zone episode         18
new twilight zone             11
nightmare 30 000               9
en wikipedia org               7
http en wikipedia org wiki     7
en wikipedia org wiki          7
30 000 feet                    7
wikipedia org wiki             7
like twilight zone             7
http en wikipedia org          7
dtype: int64

twilight zone episode         18
new twilight zone             11
nightmare 30 000               9
en wikipedia org               7
http en wikipedia org wiki     7
en wikipedia org wiki          7
30 000 feet                    7
wikipedia org wiki             7
like twilight zone             7
http en wikipedia org          7
dtype: int64

In [16]:
# Summed TF-IDF weight of every phrase across the Twilight Zone documents; display the ten largest.
tvec_counts = tvec_zone_df.sum()
tvec_counts.sort_values(ascending=False).iloc[:10]

new twilight zone        2.876261
twilight zone episode    2.215771
nightmare 30 000         1.770906
30 000 foot              1.496494
shot black white         1.492005
does know episode        1.475496
time new episode         1.429422
really want watch        1.335776
twilight zone 2019       1.309941
amp x200b http           1.192770
dtype: float64

new twilight zone        2.876261
twilight zone episode    2.215771
nightmare 30 000         1.770906
30 000 foot              1.496494
shot black white         1.492005
does know episode        1.475496
time new episode         1.429422
really want watch        1.335776
twilight zone 2019       1.309941
amp x200b http           1.192770
dtype: float64

##### Key phrases in The Twilight Zone texts should prove useful for categorization

In [17]:
# Total raw count of every phrase across the comics documents; display the ten largest.
cvec_counts = cvec_comics_df.sum()
cvec_counts.sort_values(ascending=False).iloc[:10]

www reddit com                       91
http www reddit com                  91
http www reddit                      91
http www reddit com comicbooks       88
www reddit com comicbooks            88
reddit com comicbooks                88
com comicbooks comment               85
reddit com comicbooks comment        85
www reddit com comicbooks comment    85
http twitter com                     30
dtype: int64

www reddit com                       91
http www reddit com                  91
http www reddit                      91
http www reddit com comicbooks       88
www reddit com comicbooks            88
reddit com comicbooks                88
com comicbooks comment               85
reddit com comicbooks comment        85
www reddit com comicbooks comment    85
http twitter com                     30
dtype: int64

In [18]:
# Summed TF-IDF weight of every phrase across the comics documents; display the ten largest.
tvec_counts = tvec_comics_df.sum()
tvec_counts.sort_values(ascending=False).iloc[:10]

ultimate spider man    4.372686
amp x200b http         2.600831
x200b http redd        2.534287
amp x200b http redd    2.534287
comic book store       2.450544
http imgur com         2.173068
like title say         2.165949
cosmic ghost rider     2.153964
amp x200b amp x200b    2.023190
amp x200b amp          2.023190
dtype: float64

ultimate spider man    4.372686
amp x200b http         2.600831
x200b http redd        2.534287
amp x200b http redd    2.534287
comic book store       2.450544
http imgur com         2.173068
like title say         2.165949
cosmic ghost rider     2.153964
amp x200b amp x200b    2.023190
amp x200b amp          2.023190
dtype: float64

##### Key phrases in the Comic Book texts appear to be mostly URL fragments (e.g. reddit.com links); however, both 'ultimate spider man' and 'cosmic ghost rider' are solid classification leads.

_Comparing Twilight Zone Words and Comic Book Words_

In [19]:
# Word-level vectorizers (1- to 2-word n-grams) for the Comic Book and Twilight Zone texts.
# ngram_range is given as a tuple, as the scikit-learn API documents; releases that validate
# parameter types (1.2 and later) reject a list such as [1, 2] when the vectorizer is fitted.
# All four vectorizers share the same settings, so the settings are declared once.
word_params = {'ngram_range': (1, 2), 'max_features': 5000, 'stop_words': 'english'}
cvec_comic_word = CountVectorizer(**word_params)
tvec_comic_word = TfidfVectorizer(**word_params)
cvec_zone_word = CountVectorizer(**word_params)
tvec_zone_word = TfidfVectorizer(**word_params)

In [20]:
# Fit each word-level vectorizer on the comics 'lems' text and keep the resulting document-term matrices.
comics_lems = comics['lems']
cvec_comics_words = cvec_comic_word.fit_transform(comics_lems)
tvec_comics_words = tvec_comic_word.fit_transform(comics_lems)

In [21]:
# Fit each word-level vectorizer on the Twilight Zone 'lems' text and keep the resulting document-term matrices.
zone_lems = zone['lems']
cvec_zone_words = cvec_zone_word.fit_transform(zone_lems)
tvec_zone_words = tvec_zone_word.fit_transform(zone_lems)

In [22]:
# Dense document-term DataFrames for the comics word-level vectorizers (one column per term).
# Column labels come from each fitted vocabulary_ (term -> column index), ordered by index.
# This yields the same labels as get_feature_names(), which was deprecated in scikit-learn 1.0
# and removed in 1.2, so the cell works on both old and new releases.
cvec_comics_df_words = pd.DataFrame(
    cvec_comics_words.toarray(),
    columns=sorted(cvec_comic_word.vocabulary_, key=cvec_comic_word.vocabulary_.get),
)
# Same layout for the TF-IDF weights.
tvec_comics_df_words = pd.DataFrame(
    tvec_comics_words.toarray(),
    columns=sorted(tvec_comic_word.vocabulary_, key=tvec_comic_word.vocabulary_.get),
)

In [23]:
# Dense document-term DataFrames for the Twilight Zone word-level vectorizers (one column per term).
# Column labels come from each fitted vocabulary_ (term -> column index), ordered by index.
# This yields the same labels as get_feature_names(), which was deprecated in scikit-learn 1.0
# and removed in 1.2, so the cell works on both old and new releases.
cvec_zone_df_words = pd.DataFrame(
    cvec_zone_words.toarray(),
    columns=sorted(cvec_zone_word.vocabulary_, key=cvec_zone_word.vocabulary_.get),
)
# Same layout for the TF-IDF weights.
tvec_zone_df_words = pd.DataFrame(
    tvec_zone_words.toarray(),
    columns=sorted(tvec_zone_word.vocabulary_, key=tvec_zone_word.vocabulary_.get),
)

In [24]:
# Total raw count of every term across the Twilight Zone documents; display the ten largest.
cvec_counts = cvec_zone_df_words.sum()
cvec_counts.sort_values(ascending=False).iloc[:10]

episode          427
wa               291
just             231
twilight         210
like             199
10               192
zone             161
twilight zone    153
think            131
people           128
dtype: int64

episode          427
wa               291
just             231
twilight         210
like             199
10               192
zone             161
twilight zone    153
think            131
people           128
dtype: int64

In [25]:
# Summed TF-IDF weight of every term across the Twilight Zone documents; display the ten largest.
tvec_counts = tvec_zone_df_words.sum()
tvec_counts.sort_values(ascending=False).iloc[:10]

twilight         62.093802
episode          18.964003
wa               13.938632
10               13.907478
just             11.019301
zone             10.034104
twilight zone     9.771264
like              8.955099
know              8.130613
think             7.866082
dtype: float64

twilight         62.093802
episode          18.964003
wa               13.938632
10               13.907478
just             11.019301
zone             10.034104
twilight zone     9.771264
like              8.955099
know              8.130613
think             7.866082
dtype: float64

##### Keywords such as 'episode' and 'twilight' should prove useful as classification predictors. Also, models should explicitly state an n_gram range of only one or two words. 

In [26]:
# Total raw count of every term across the comics documents; display the ten largest.
cvec_counts = cvec_comics_df_words.sum()
cvec_counts.sort_values(ascending=False).iloc[:10]

comic    780
wa       490
like     421
http     349
book     342
just     322
read     305
com      282
amp      252
know     226
dtype: int64

comic    780
wa       490
like     421
http     349
book     342
just     322
read     305
com      282
amp      252
know     226
dtype: int64

In [27]:
# Summed TF-IDF weight of every term across the comics documents; display the ten largest.
tvec_counts = tvec_comics_df_words.sum()
tvec_counts.sort_values(ascending=False).iloc[:10]

comic        108.422742
wa            25.970053
like          23.110696
read          22.801493
just          20.668383
book          20.468026
removed       17.405006
know          16.655738
character     14.473864
story         14.345642
dtype: float64

comic        108.422742
wa            25.970053
like          23.110696
read          22.801493
just          20.668383
book          20.468026
removed       17.405006
know          16.655738
character     14.473864
story         14.345642
dtype: float64

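##### Keywords such as 'comic' and 'book' should prove useful as classification predictors. Note that 'wa' (likely a lemmatization artifact of 'was') ranks near the top in both datasets, so it is unlikely to separate the classes. Also, models should explicitly state an n_gram range of only one or two words.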