# Project 3: Web APIs & NLP - 03

Kelly Slatery | US-DSI-10 | 01.31.2020

In [1]:
# Imports
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Import and explore data

In [2]:
# Set viewing options
pd.set_option('display.max_columns', 10000)

### Submissions

In [3]:
# Import beatles submissions (20,000)
beatles_subs = pd.read_csv('./data/clean_beatles_subs.csv')

In [4]:
# Look at shape (rows, columns)
beatles_subs.shape

(19961, 9)

In [5]:
# Import queen submissions (20,000)
queen_subs = pd.read_csv('./data/clean_queen_subs.csv')

In [6]:
# Look at shape (rows, columns)
queen_subs.shape

(19491, 9)

In [7]:
# Combine dataframes
submissions = pd.concat([beatles_subs, queen_subs])

In [8]:
# How many duplicates are there?
submissions.duplicated().sum()

2

In [9]:
# Look at duplicated rows
submissions[submissions.duplicated()]

Unnamed: 0,author,author_flair_text,created_utc,score,selftext,subreddit,title,author_full,all_text
4504,funnybunnybread2_0,-,1564754820,1,-,queen,How do you think John feels about Brian and Ro...,funnybunnybread2_0,How do you think John feels about Brian and Ro...
13660,two_tits_,-,1542361942,1,-,queen,I've paid my dues,two_tits_,I've paid my dues


In [10]:
# Drop duplicated rows
submissions.drop_duplicates(inplace=True)

In [11]:
# Dummify 'subreddit' column for classification
submissions['subreddit'] = [1 if x == 'beatles' else 0 for x in submissions['subreddit']]

In [12]:
submissions.head(3)

Unnamed: 0,author,author_flair_text,created_utc,score,selftext,subreddit,title,author_full,all_text
0,Lanovart,-,1580154959,1,-,1,Fan art. Magnet from gypsum,Lanovart,Fan art. Magnet from gypsum
1,jackjoy1992,-,1580154755,1,-,1,"EMI Studios, 1963.",jackjoy1992,"EMI Studios, 1963."
2,jackjoy1992,-,1580154655,1,-,1,"Klein, Lennon and Ono, 1969.",jackjoy1992,"Klein, Lennon and Ono, 1969."


In [13]:
submissions.tail(3)

Unnamed: 0,author,author_flair_text,created_utc,score,selftext,subreddit,title,author_full,all_text
19488,QueenSongoftheDay,-,1371580630,15,-,0,Song of the Day #17- Drowse,QueenSongoftheDay,Song of the Day #17- Drowse
19489,QueenSongoftheDay,-,1371485018,27,-,0,Song of the Day #16- Death on Two Legs (Dedica...,QueenSongoftheDay,Song of the Day #16- Death on Two Legs (Dedica...
19490,woehoe,-,1371419736,24,-,0,Some of Freddie's funniest live moments,woehoe,Some of Freddie's funniest live moments


In [14]:
# Look at datatypes
submissions.dtypes

author               object
author_flair_text    object
created_utc           int64
score                 int64
selftext             object
subreddit             int64
title                object
author_full          object
all_text             object
dtype: object

### Comments

In [15]:
# Import beatles comments (20,000)
beatles_coms = pd.read_csv('./data/clean_beatles_coms.csv')

In [16]:
beatles_coms.shape

(19983, 10)

In [17]:
# Import queen comments (20,000)
queen_coms = pd.read_csv('./data/clean_queen_coms.csv')

In [18]:
queen_coms.shape

(20000, 10)

In [19]:
# Combine dataframes
comments = pd.concat([beatles_coms, queen_coms])

In [20]:
# How many duplicates are there?
comments.duplicated().sum()

2

In [21]:
# Look at duplicated rows
comments[comments.duplicated()]

Unnamed: 0,author,author_flair_text,body,created_utc,score,selftext,subreddit,title,author_full,all_text
4942,funnybunnybread2_0,-,-,1564755000.0,1.0,-,queen,How do you think John feels about Brian and Ro...,funnybunnybread2_0,How do you think John feels about Brian and Ro...
14106,two_tits_,-,-,1542362000.0,1.0,-,queen,I've paid my dues,two_tits_,I've paid my dues


Odd that both submissions and comments data have the same duplicates, and by different authors. Assuming that since other values in the dataframes aren't the same, these two authors were probably really trying to get their point across.

In [22]:
# Drop duplicated rows
comments.drop_duplicates(inplace=True)

In [23]:
# Dummify 'subreddit' column for classification
comments['subreddit'] = [1 if x == 'beatles' else 0 for x in comments['subreddit']]

In [24]:
comments.head(3)

Unnamed: 0,author,author_flair_text,body,created_utc,score,selftext,subreddit,title,author_full,all_text
0,356BC,-,"Sorry, I wasn't trying to sound like a dick. I...",1580156000.0,1.0,-,1,-,356BC,"Sorry, I wasn't trying to sound like a dick. I..."
1,EveningsAndWeekends,-,"Oh man to be one of those standing there, watc...",1580156000.0,1.0,-,1,-,EveningsAndWeekends,"Oh man to be one of those standing there, watc..."
2,EveningsAndWeekends,-,Ded from those sick beats,1580156000.0,1.0,-,1,-,EveningsAndWeekends,Ded from those sick beats


In [25]:
comments.tail(3)

Unnamed: 0,author,author_flair_text,body,created_utc,score,selftext,subreddit,title,author_full,all_text
19997,Kastain,-,-,1392904000.0,18.0,-,0,John Deacon with an awesome hat in the music v...,Kastain,John Deacon with an awesome hat in the music v...
19998,Pinkiepoi,-,-,1392852000.0,12.0,-,0,"Oh Freddy, you pervert...",Pinkiepoi,"Oh Freddy, you pervert..."
19999,Mick_Wyld,-,-,1392825000.0,18.0,-,0,The A.V. CLUB breaks down The Game,Mick_Wyld,The A.V. CLUB breaks down The Game


In [26]:
# Look at datatypes
comments.dtypes

author                object
author_flair_text     object
body                  object
created_utc          float64
score                float64
selftext              object
subreddit              int64
title                 object
author_full           object
all_text              object
dtype: object

## NLP Parsing

In [27]:
# Make list of columns for submissions for primary NLP parsing (no author data)
cols_sub_nlp = ['selftext', 'title', 'all_text']

# Make list of columns for comments for primary NLP parsing (no author data)
cols_com_nlp = ['body', 'selftext', 'title', 'all_text']

### Tokenize text data

In [28]:
# Define a function to clean text data (remove whitespace, 
# keep only alphabet characters & make lowercase)

def tokenize_data(data, col_list):
    tokenizer = RegexpTokenizer(r'\w+')
    for col in col_list:
        tokenized_items = [tokenizer.tokenize(item.lower()) for item in data[col]]
        data[f'tokenized_{col}'] = tokenized_items
    return data  


In [29]:
# Tokenize submissions text data 
submissions = tokenize_data(submissions, cols_sub_nlp)
submissions.head(3)

Unnamed: 0,author,author_flair_text,created_utc,score,selftext,subreddit,title,author_full,all_text,tokenized_selftext,tokenized_title,tokenized_all_text
0,Lanovart,-,1580154959,1,-,1,Fan art. Magnet from gypsum,Lanovart,Fan art. Magnet from gypsum,[],"[fan, art, magnet, from, gypsum]","[fan, art, magnet, from, gypsum]"
1,jackjoy1992,-,1580154755,1,-,1,"EMI Studios, 1963.",jackjoy1992,"EMI Studios, 1963.",[],"[emi, studios, 1963]","[emi, studios, 1963]"
2,jackjoy1992,-,1580154655,1,-,1,"Klein, Lennon and Ono, 1969.",jackjoy1992,"Klein, Lennon and Ono, 1969.",[],"[klein, lennon, and, ono, 1969]","[klein, lennon, and, ono, 1969]"


In [30]:
# Tokenize comments text data 
comments = tokenize_data(comments, cols_sub_nlp)
comments.head(3)

Unnamed: 0,author,author_flair_text,body,created_utc,score,selftext,subreddit,title,author_full,all_text,tokenized_selftext,tokenized_title,tokenized_all_text
0,356BC,-,"Sorry, I wasn't trying to sound like a dick. I...",1580156000.0,1.0,-,1,-,356BC,"Sorry, I wasn't trying to sound like a dick. I...",[],[],"[sorry, i, wasn, t, trying, to, sound, like, a..."
1,EveningsAndWeekends,-,"Oh man to be one of those standing there, watc...",1580156000.0,1.0,-,1,-,EveningsAndWeekends,"Oh man to be one of those standing there, watc...",[],[],"[oh, man, to, be, one, of, those, standing, th..."
2,EveningsAndWeekends,-,Ded from those sick beats,1580156000.0,1.0,-,1,-,EveningsAndWeekends,Ded from those sick beats,[],[],"[ded, from, those, sick, beats]"


### Lemmatize text data

In [31]:
# Make list of columns for submissions for primary NLP parsing (no author data)
tok_cols_sub_nlp = ['tokenized_selftext', 'tokenized_title', 'tokenized_all_text']

# Make list of columns for comments for primary NLP parsing (no author data)
tok_cols_com_nlp = ['tokenized_body', 'tokenized_selftext', 'tokenized_title', 'tokenized_all_text']

In [32]:
# Define a function to lemmatize text data 

def lemmatize_data(data, col_list):
    lemmatizer = WordNetLemmatizer()
    for col in col_list:
        lemmed_items = []
        for row in data[col]:
            lemmed_row = [lemmatizer.lemmatize(word) for word in row]
            lemmed_items.append(lemmed_row)
        data[f'lemmatized_{col}'] = lemmed_items
    return data  


In [33]:
# Lemmatize submissions text data 
submissions = lemmatize_data(submissions, tok_cols_sub_nlp)

In [34]:
submissions.head(3)

Unnamed: 0,author,author_flair_text,created_utc,score,selftext,subreddit,title,author_full,all_text,tokenized_selftext,tokenized_title,tokenized_all_text,lemmatized_tokenized_selftext,lemmatized_tokenized_title,lemmatized_tokenized_all_text
0,Lanovart,-,1580154959,1,-,1,Fan art. Magnet from gypsum,Lanovart,Fan art. Magnet from gypsum,[],"[fan, art, magnet, from, gypsum]","[fan, art, magnet, from, gypsum]",[],"[fan, art, magnet, from, gypsum]","[fan, art, magnet, from, gypsum]"
1,jackjoy1992,-,1580154755,1,-,1,"EMI Studios, 1963.",jackjoy1992,"EMI Studios, 1963.",[],"[emi, studios, 1963]","[emi, studios, 1963]",[],"[emi, studio, 1963]","[emi, studio, 1963]"
2,jackjoy1992,-,1580154655,1,-,1,"Klein, Lennon and Ono, 1969.",jackjoy1992,"Klein, Lennon and Ono, 1969.",[],"[klein, lennon, and, ono, 1969]","[klein, lennon, and, ono, 1969]",[],"[klein, lennon, and, ono, 1969]","[klein, lennon, and, ono, 1969]"


In [35]:
# Lemmatize comments text data 
comments = lemmatize_data(comments, tok_cols_sub_nlp)

In [36]:
comments.head(3)

Unnamed: 0,author,author_flair_text,body,created_utc,score,selftext,subreddit,title,author_full,all_text,tokenized_selftext,tokenized_title,tokenized_all_text,lemmatized_tokenized_selftext,lemmatized_tokenized_title,lemmatized_tokenized_all_text
0,356BC,-,"Sorry, I wasn't trying to sound like a dick. I...",1580156000.0,1.0,-,1,-,356BC,"Sorry, I wasn't trying to sound like a dick. I...",[],[],"[sorry, i, wasn, t, trying, to, sound, like, a...",[],[],"[sorry, i, wasn, t, trying, to, sound, like, a..."
1,EveningsAndWeekends,-,"Oh man to be one of those standing there, watc...",1580156000.0,1.0,-,1,-,EveningsAndWeekends,"Oh man to be one of those standing there, watc...",[],[],"[oh, man, to, be, one, of, those, standing, th...",[],[],"[oh, man, to, be, one, of, those, standing, th..."
2,EveningsAndWeekends,-,Ded from those sick beats,1580156000.0,1.0,-,1,-,EveningsAndWeekends,Ded from those sick beats,[],[],"[ded, from, those, sick, beats]",[],[],"[ded, from, those, sick, beat]"


### Stemmatize text data

In [37]:
# Define a function to stemmatize text data 

def stemmatize_data(data, col_list):
    p_stemmer = PorterStemmer()
    for col in col_list:
        stemmed_items = []
        for row in data[col]:
            stemmed_row = [p_stemmer.stem(word) for word in row]
            stemmed_items.append(stemmed_row)
        data[f'stemmatized_{col}'] = stemmed_items
    return data  


In [38]:
# Stemmatize submissions text data 
submissions = stemmatize_data(submissions, tok_cols_sub_nlp)

In [39]:
submissions.head(3)

Unnamed: 0,author,author_flair_text,created_utc,score,selftext,subreddit,title,author_full,all_text,tokenized_selftext,tokenized_title,tokenized_all_text,lemmatized_tokenized_selftext,lemmatized_tokenized_title,lemmatized_tokenized_all_text,stemmatized_tokenized_selftext,stemmatized_tokenized_title,stemmatized_tokenized_all_text
0,Lanovart,-,1580154959,1,-,1,Fan art. Magnet from gypsum,Lanovart,Fan art. Magnet from gypsum,[],"[fan, art, magnet, from, gypsum]","[fan, art, magnet, from, gypsum]",[],"[fan, art, magnet, from, gypsum]","[fan, art, magnet, from, gypsum]",[],"[fan, art, magnet, from, gypsum]","[fan, art, magnet, from, gypsum]"
1,jackjoy1992,-,1580154755,1,-,1,"EMI Studios, 1963.",jackjoy1992,"EMI Studios, 1963.",[],"[emi, studios, 1963]","[emi, studios, 1963]",[],"[emi, studio, 1963]","[emi, studio, 1963]",[],"[emi, studio, 1963]","[emi, studio, 1963]"
2,jackjoy1992,-,1580154655,1,-,1,"Klein, Lennon and Ono, 1969.",jackjoy1992,"Klein, Lennon and Ono, 1969.",[],"[klein, lennon, and, ono, 1969]","[klein, lennon, and, ono, 1969]",[],"[klein, lennon, and, ono, 1969]","[klein, lennon, and, ono, 1969]",[],"[klein, lennon, and, ono, 1969]","[klein, lennon, and, ono, 1969]"


In [40]:
# Stemmatize comments text data 
comments = stemmatize_data(comments, tok_cols_sub_nlp)

In [41]:
comments.head(3)

Unnamed: 0,author,author_flair_text,body,created_utc,score,selftext,subreddit,title,author_full,all_text,tokenized_selftext,tokenized_title,tokenized_all_text,lemmatized_tokenized_selftext,lemmatized_tokenized_title,lemmatized_tokenized_all_text,stemmatized_tokenized_selftext,stemmatized_tokenized_title,stemmatized_tokenized_all_text
0,356BC,-,"Sorry, I wasn't trying to sound like a dick. I...",1580156000.0,1.0,-,1,-,356BC,"Sorry, I wasn't trying to sound like a dick. I...",[],[],"[sorry, i, wasn, t, trying, to, sound, like, a...",[],[],"[sorry, i, wasn, t, trying, to, sound, like, a...",[],[],"[sorri, i, wasn, t, tri, to, sound, like, a, d..."
1,EveningsAndWeekends,-,"Oh man to be one of those standing there, watc...",1580156000.0,1.0,-,1,-,EveningsAndWeekends,"Oh man to be one of those standing there, watc...",[],[],"[oh, man, to, be, one, of, those, standing, th...",[],[],"[oh, man, to, be, one, of, those, standing, th...",[],[],"[oh, man, to, be, one, of, those, stand, there..."
2,EveningsAndWeekends,-,Ded from those sick beats,1580156000.0,1.0,-,1,-,EveningsAndWeekends,Ded from those sick beats,[],[],"[ded, from, those, sick, beats]",[],[],"[ded, from, those, sick, beat]",[],[],"[ded, from, those, sick, beat]"


# Additional Cleaning

Through further analysis, found strings with high frequencies but low importance to interpretation. Remove strings or rows.

### Submissions

In [42]:
# Change all instances of 'favourite' to 'favorite'
submissions['all_text'] = [row.replace('favourite', 'favorite') for row in submissions['all_text']]

# Remove all strings marking websites or other formatting
for word in ["www.", "https", ".com", '#x200B', '&amp']:
    submissions['all_text'] = [row.replace(word, '') for row in submissions['all_text']]

# Remove all rows containing 'ufc' because they look like ads
indices_to_drop = []

for index in submissions['all_text'].index:
    if 'ufc' in submissions.loc[index, 'all_text']:
        indices_to_drop.append(index)
        
submissions.drop(submissions.index[indices_to_drop], axis=0, inplace=True)


### Comments

In [43]:
# Change all instances of 'favourite' to 'favorite' and drop unrelated words
comments['all_text'] = [row.replace('favourite', 'favorite') for row in comments['all_text']]

# Remove all strings marking websites or other formatting
for word in ["www.", "https", ".com", '#x200B', '&amp']:
    comments['all_text'] = [row.replace(word, '') for row in comments['all_text']]

# Remove all rows containing 'ufc'
indices_to_drop = []

for index in comments['all_text'].index:
    if 'ufc' in comments.loc[index, 'all_text']:
        indices_to_drop.append(index)
        
comments.drop(comments.index[indices_to_drop], axis=0, inplace=True)


# Export parsable data

In [44]:
# Export submissions data to csv
submissions.to_csv('./data/submissions.csv', index=False)

In [45]:
# Export comments data to csv
comments.to_csv('./data/comments.csv', index=False)

# Data Dictionary

In [76]:
# Create an empty list to add a dictionary for each row
data_list = []

In [77]:
# Fill the list with name, data type, and modifications for all columns
for col in submissions.columns:
    this = {}
    this['Column Name'] = col
    this['Data Type'] = submissions[col].dtype
    this['Modifications'] = 'None'
    if 'tokenized' in col:
        this['Modifications'] = 'Text data: tokenized with regular expression'
    if 'lemmatized' in col:
        this['Modifications'] = this['Modifications'] + ', lemmatized with WordNetLemmatizer()'
    if 'stemmatized' in col:
        this['Modifications'] = this['Modifications'] + ', stemmatized with PorterStemmer()'
    if col == 'subreddit':
        this['Modifications'] = 'Binarized'
    if col == 'selftext' or col == 'title':
        this['Modifications'] = "Nulls replaced with '-'"
    data_list.append(this)

In [78]:
# Transform dictionary into dataframe
data_df = pd.DataFrame(data_list)
data_df.head()

Unnamed: 0,Column Name,Data Type,Modifications
0,author,object,
1,author_flair_text,object,
2,created_utc,int64,
3,score,int64,
4,selftext,object,Nulls replaced with '-'


In [80]:
# Add a 'Description' column
data_df['Description'] = ['Author of post/submission', 
                          'Non-ascii parts of author username', 
                          'Time post was created in UTC', 
                          'Aggregate sum of upvotes and downvotes (no negatives)', 
                          'Body of post/submission', 
                          'Subreddit to which the post/submission belongs (1=Beatles, 0=Queen)', 
                          'Title of post/submission', 
                          "Combined: 'author' and 'author_flair_text'", 
                          "Combined: 'selftext' and 'title'", 
                          "Tokenized 'selftext'", 
                          "Tokenized 'title'", 
                          "Tokenized 'all_text'", 
                          "Lemmatized 'tokenized_selftext'", 
                          "Lemmatized 'tokenized_title'", 
                          "Lemmatized 'tokenized_all_text'",
                          "Stemmatized 'tokenized_selftext'", 
                          "Stemmatized 'tokenized_title'", 
                          "Stemmatized 'tokenized_all_text'"
                         ]


In [82]:
# Export data dictionary
data_df.to_csv('./data/data_dictionary.csv')