In [1]:
# Import Libraries
import pandas as pd
import regex as re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 999

In [2]:
# Read in two volumes 
zone = pd.read_csv('../data/raw_data/twilight_zone_raw')
comics = pd.read_csv('../data/raw_data/comicbooks_raw')

In [3]:
# Going to drop the 'Unnamed' column for each dataset and use the pandas index
zone.drop(columns='Unnamed: 0', inplace=True)
comics.drop(columns='Unnamed: 0', inplace=True)

In [4]:
zone.shape

(422, 8)

In [5]:
comics.shape

(888, 8)

In [6]:
# How many of these columns are useful to modeling? 
zone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 8 columns):
selftext        368 non-null object
author          422 non-null object
title           422 non-null object
created_utc     422 non-null int64
num_comments    422 non-null int64
is_self         422 non-null bool
subreddit       422 non-null object
timestamp       422 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 23.6+ KB


In [7]:
# Returns 'True' for every column therefore not going to be a helpful classifier
zone['is_self'].value_counts()

True    422
Name: is_self, dtype: int64

In [8]:
# Returns 'True' for every column therefore not going to be a helpful classifier
comics['is_self'].value_counts()

True    888
Name: is_self, dtype: int64

In [9]:
# Dropping the 'is_self' columns from each dataset
zone.drop(columns='is_self', inplace=True)
comics.drop(columns='is_self', inplace=True)

In [10]:
# Dropping the 'created_utc' column from both datasets
zone.drop(columns='created_utc', inplace=True)
comics.drop(columns='created_utc', inplace=True)

In [11]:
# Need to convert dtypes to strings in appropriate columns 
for col in ['selftext', 'author', 'title', 'subreddit']:
    zone[col] = zone[col].astype(str)

In [12]:
for col in ['selftext', 'author', 'title', 'subreddit']:
    comics[col] = comics[col].astype(str)

In [13]:
# Removing all columns that are not text-based
zone.drop(['num_comments','timestamp'], axis=1, inplace=True)

In [14]:
comics.drop(['num_comments','timestamp'], axis=1, inplace=True)

In [15]:
# Checking for null values in each dataframe 
comics.isna().sum()

selftext     0
author       0
title        0
subreddit    0
dtype: int64

In [16]:
zone.isna().sum()

selftext     0
author       0
title        0
subreddit    0
dtype: int64

In [17]:
comics.head(2)

Unnamed: 0,selftext,author,title,subreddit
0,"I came across a video on YouTube of Batman talking about his plans that he has to stop members of the Justice League if they ever turned against the team.\n\nLink to the video: https://youtu.be/ZJVvrmLSTsg\n\nI noticed that there were some that he didn't have a full plan for, so I wanted to continue them for myself. I noticed he had cool code names for the members he made a file on like ""Polymer"" for Plastic Man and ""Red Sands"" for Martian Manhunter. Can anyone come up with one of those for Green Arrow?",AnonymousGuyChillin,Can someone help me with a code name?,comicbooks
1,"Say what you want about Morrison's Batman or Snyder's, but those runs are legend. They are so good. Even when I don't understand Morrison sometimes I still get the idea and understand maybe 80 percent, Snyder's batman is radically different in terms of hopey preachy batman. Tom King just doesn't do it for me. It's so boring. He uses the same panel shots through story telling and corny lines like ""because I'm batman"" are so bad. And I think... Maybe it's not fair because you had 2 greats before who fucking made crazy ass story arcs with their writing. I gave King a chance. 50 issues is too gracious, but I can't anymore.",Comicboy87,It's really hard for me to like Tom King's Batman when it was Grant Morrison and Scott Snyder before him.,comicbooks


In [18]:
zone.head(2)

Unnamed: 0,selftext,author,title,subreddit
0,[removed],ryanseanoreilly,Podcast Review of stories by Charles Beaumont,TwilightZone
1,"Doing a marathon here of the original series. This one had me gripped. I know it's a TV show yadda yadda, but watching John Denver play a great character rubbishing superstition only to be scared out his wits and ultimately facing his doom. \n\nHis journey through the park, reminds me how I felt having to do it late at night so many times myself. Paranoia such a bitch. \n\nAmazing episode, definitely one of my favourites.",neads1,"The Jungle (S3, ep 12)",TwilightZone


### Lemmatizing Text Columns 
Need to preprocess and create a new 'combined_text' column for our combined dataframe, comics_zone. Kept dataframes separate to avoid confusion before concatenation. 

In [19]:
# setting up tokenizer and lemmatizer
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

In [20]:
# function to lemmatize (with help from Kate Dowdy)
def lemma(text):
    tokens = tokenizer.tokenize(str(text))
    lems = [lemmatizer.lemmatize(i) for i in tokens]
    
    return(" ".join(lems))

In [21]:
# create a column for lemmatized words 
zone['lems'] = zone['selftext'].apply(lambda x: lemma(x))

In [22]:
zone['title_lems'] = zone['title'].apply(lambda x: lemma(x))

In [23]:
# create a column for lemmatized words 
comics['lems'] = comics['selftext'].apply(lambda x: lemma(x))

In [24]:
comics['title_lems'] = comics['title'].apply(lambda x: lemma(x))

In [25]:
# function to remove hanging contraction leftovers (given to me by Kate Dowdy)
def nocontract(x):
    x = re.sub("([ ][r][e][ ])", " ", x)
    x = re.sub("([ ][v][e][ ])", " ", x)
    x = re.sub("([ ][l][l][ ])", " ", x)
    x = re.sub("([ ][d][ ])", " ", x)
    x = re.sub("([ ][t][ ])", " ", x)
    x = re.sub("([ ][m][ ])", " ", x)
    x = re.sub("([ ][s][ ])", " ", x)
    return x

In [26]:
# applying no contractions function to both dataframes
zone['lems'] = zone['lems'].apply(lambda x: nocontract(x))
zone['title_lems'] = zone['title_lems'].apply(lambda x: nocontract(x))
comics['lems'] = comics['lems'].apply(lambda x: nocontract(x))
comics['title_lems'] = comics['title_lems'].apply(lambda x: nocontract(x))

In [27]:
zone.head(2)

Unnamed: 0,selftext,author,title,subreddit,lems,title_lems
0,[removed],ryanseanoreilly,Podcast Review of stories by Charles Beaumont,TwilightZone,removed,Podcast Review of story by Charles Beaumont
1,"Doing a marathon here of the original series. This one had me gripped. I know it's a TV show yadda yadda, but watching John Denver play a great character rubbishing superstition only to be scared out his wits and ultimately facing his doom. \n\nHis journey through the park, reminds me how I felt having to do it late at night so many times myself. Paranoia such a bitch. \n\nAmazing episode, definitely one of my favourites.",neads1,"The Jungle (S3, ep 12)",TwilightZone,Doing a marathon here of the original series This one had me gripped I know it a TV show yadda yadda but watching John Denver play a great character rubbishing superstition only to be scared out his wit and ultimately facing his doom His journey through the park reminds me how I felt having to do it late at night so many time myself Paranoia such a bitch Amazing episode definitely one of my favourite,The Jungle S3 ep 12


In [28]:
comics.head(1)

Unnamed: 0,selftext,author,title,subreddit,lems,title_lems
0,"I came across a video on YouTube of Batman talking about his plans that he has to stop members of the Justice League if they ever turned against the team.\n\nLink to the video: https://youtu.be/ZJVvrmLSTsg\n\nI noticed that there were some that he didn't have a full plan for, so I wanted to continue them for myself. I noticed he had cool code names for the members he made a file on like ""Polymer"" for Plastic Man and ""Red Sands"" for Martian Manhunter. Can anyone come up with one of those for Green Arrow?",AnonymousGuyChillin,Can someone help me with a code name?,comicbooks,I came across a video on YouTube of Batman talking about his plan that he ha to stop member of the Justice League if they ever turned against the team Link to the video http youtu be ZJVvrmLSTsg I noticed that there were some that he didn have a full plan for so I wanted to continue them for myself I noticed he had cool code name for the member he made a file on like Polymer for Plastic Man and Red Sands for Martian Manhunter Can anyone come up with one of those for Green Arrow,Can someone help me with a code name


In [29]:
# Dropping vestigal unprocessed text columns
zone.drop(columns=['selftext', 'title'], inplace=True)
comics.drop(columns=['selftext', 'title'], inplace=True)

In [30]:
comics.head(2)

Unnamed: 0,author,subreddit,lems,title_lems
0,AnonymousGuyChillin,comicbooks,I came across a video on YouTube of Batman talking about his plan that he ha to stop member of the Justice League if they ever turned against the team Link to the video http youtu be ZJVvrmLSTsg I noticed that there were some that he didn have a full plan for so I wanted to continue them for myself I noticed he had cool code name for the member he made a file on like Polymer for Plastic Man and Red Sands for Martian Manhunter Can anyone come up with one of those for Green Arrow,Can someone help me with a code name
1,Comicboy87,comicbooks,Say what you want about Morrison Batman or Snyder but those run are legend They are so good Even when I don understand Morrison sometimes I still get the idea and understand maybe 80 percent Snyder batman is radically different in term of hopey preachy batman Tom King just doesn do it for me It so boring He us the same panel shot through story telling and corny line like because I batman are so bad And I think Maybe it not fair because you had 2 great before who fucking made crazy as story arc with their writing I gave King a chance 50 issue is too gracious but I can anymore,It really hard for me to like Tom King Batman when it wa Grant Morrison and Scott Snyder before him


In [31]:
zone.head(2)

Unnamed: 0,author,subreddit,lems,title_lems
0,ryanseanoreilly,TwilightZone,removed,Podcast Review of story by Charles Beaumont
1,neads1,TwilightZone,Doing a marathon here of the original series This one had me gripped I know it a TV show yadda yadda but watching John Denver play a great character rubbishing superstition only to be scared out his wit and ultimately facing his doom His journey through the park reminds me how I felt having to do it late at night so many time myself Paranoia such a bitch Amazing episode definitely one of my favourite,The Jungle S3 ep 12


In [32]:
# Saving cleaned comics dataset to csv 
comics.to_csv('../data/cleaned_data/comics')

In [33]:
# Saving cleaned zone dataset to csv
zone.to_csv('../data/cleaned_data/zone')

In [34]:
# Concatenating dataframes 
comics_zone = pd.concat([zone, comics])

In [35]:
comics_zone['subreddit'].value_counts()

comicbooks      888
TwilightZone    422
Name: subreddit, dtype: int64

In [36]:
# Changing 'subreddit' values into binary 1 = Twilight Zone, 0 = 'scifi'
comics_zone['subreddit'] = comics_zone['subreddit'].map({'comicbooks': 0, 'TwilightZone': 1})
comics_zone.head(2)

Unnamed: 0,author,subreddit,lems,title_lems
0,ryanseanoreilly,1,removed,Podcast Review of story by Charles Beaumont
1,neads1,1,Doing a marathon here of the original series This one had me gripped I know it a TV show yadda yadda but watching John Denver play a great character rubbishing superstition only to be scared out his wit and ultimately facing his doom His journey through the park reminds me how I felt having to do it late at night so many time myself Paranoia such a bitch Amazing episode definitely one of my favourite,The Jungle S3 ep 12


In [37]:
# Create a combined column for all of our text 
comics_zone['combined_text'] = comics_zone['lems'] + comics_zone['title_lems']

In [38]:
comics_zone.head(2)

Unnamed: 0,author,subreddit,lems,title_lems,combined_text
0,ryanseanoreilly,1,removed,Podcast Review of story by Charles Beaumont,removedPodcast Review of story by Charles Beaumont
1,neads1,1,Doing a marathon here of the original series This one had me gripped I know it a TV show yadda yadda but watching John Denver play a great character rubbishing superstition only to be scared out his wit and ultimately facing his doom His journey through the park reminds me how I felt having to do it late at night so many time myself Paranoia such a bitch Amazing episode definitely one of my favourite,The Jungle S3 ep 12,Doing a marathon here of the original series This one had me gripped I know it a TV show yadda yadda but watching John Denver play a great character rubbishing superstition only to be scared out his wit and ultimately facing his doom His journey through the park reminds me how I felt having to do it late at night so many time myself Paranoia such a bitch Amazing episode definitely one of my favouriteThe Jungle S3 ep 12


In [39]:
# Save cleaned and lemmatized data to csv
comics_zone.to_csv('../data/cleaned_data/comics_zone')