# Cleaning and EDA

## Imports

In [2]:
#Imports
import pandas as pd
import numpy as np

# Raise pandas' per-cell character limit to 400 so long titles and selftext are shown in full.
pd.set_option('display.max_colwidth', 400)

## Reading in collected data from our csv

In [3]:
# Load the scraped posts into the working DataFrame `data`.
# The path is relative, so it resolves against the kernel's working directory.
data = pd.read_csv('../data/subreddit_data.csv')

## Analysis

In [4]:
# (rows, columns) of the frame as loaded.
data.shape

(9997, 3)

#### I do not plan on using selftext for my classification model. Due to the nature (no pun intended) of the subreddits, it is expected that the majority of posts have no selftext (only 299 of the 9,997 rows have a non-null value).

In [5]:
# Column dtypes and non-null counts; used here to see how sparsely `selftext` is populated.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9997 entries, 0 to 9996
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subreddit  9997 non-null   object
 1   selftext   299 non-null    object
 2   title      9997 non-null   object
dtypes: object(3)
memory usage: 234.4+ KB


In [40]:
# NOTE(review): this cell's execution count (In [40]) is out of sequence, and its saved
# output lists `title_length` / `title_word_count`, which are only created further down
# the notebook. On a fresh Restart & Run All the cell runs before those columns exist,
# so the saved output will not be reproduced at this position.
data.describe()

Unnamed: 0,title_length,title_word_count
count,9996.0,9996.0
mean,56.127651,9.895758
std,46.63815,8.179608
min,1.0,1.0
25%,28.0,5.0
50%,43.5,8.0
75%,67.0,12.0
max,304.0,61.0


#### Not going to bother normalizing this as we've intentionally scraped an even amount of posts from each subreddit

In [6]:
# Number of posts per subreddit, to check how evenly the two classes are represented.
data['subreddit'].value_counts()

natureismetal         5000
NatureIsFuckingLit    4997
Name: subreddit, dtype: int64

#### The subreddits appear not to allow selftext in their posts

In [7]:
# Frequency of each distinct selftext value (value_counts leaves NaN out by default).
data['selftext'].value_counts()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               185
[deleted]                                                   

#### Curious about the one self-text post that was neither removed nor deleted: it turns out to be a spoiler notice about the then-recent movie *Spider-Man: No Way Home*

In [8]:
# Rows whose selftext is actual content: present, and not one of the
# '[removed]' / '[deleted]' placeholder values.
data[data['selftext'].notna() & ~data['selftext'].isin(['[removed]', '[deleted]'])]

Unnamed: 0,subreddit,selftext,title
4836,natureismetal,"Hey everyone! Today is the release date of Marvel's next movie *Spider-Man: No Way Home*, and thus we've added pretty extensive spoiler code to our auto-mod, so we're hoping to prevent anything from being ruined for anyone who plans to and hasn't yet seen the movie. This will be in place for one week, or until December 24 (ish). \n\nOf course, we can't catch everything, so we've also includ...",Regarding Spider-Man: No Way Home


#### Dropping it as it's the one post in the entire dataset with selftext and it's obviously unrelated to the theme of either subreddit. [Link for the Curious](https://www.reddit.com/r/natureismetal/comments/rin4s6/regarding_spiderman_no_way_home/)

In [9]:
# Remove the off-topic Spider-Man announcement (index label 4836, identified in the cell above).
# Rebinding `data` instead of mutating it in place, combined with errors='ignore',
# makes this cell safe to re-run: a second run is a no-op instead of a KeyError.
data = data.drop(index=4836, errors='ignore')

In [10]:
# (rows, columns) after dropping the off-topic post.
data.shape

(9996, 3)

#### Checking how many unique titles are in the dataset. At least 549 rows carry a title that already appears elsewhere (9,996 rows − 9,447 unique titles), which is unsurprising given Reddit's affinity for reposting content in subreddits.

In [11]:
# Distinct titles across both subreddits; kept in `unique` for the consistency check further down.
unique = pd.unique(data['title'])
unique.size

9447

#### Unique titles in r/NatureIsFuckingLit

In [12]:
# Distinct titles among the r/NatureIsFuckingLit rows.
lit_unique = data.loc[data['subreddit'] == 'NatureIsFuckingLit', 'title'].unique()
lit_unique.size

4787

#### Unique titles in r/natureismetal

In [13]:
# Distinct titles among the r/natureismetal rows.
metal_unique = data.loc[data['subreddit'] == 'natureismetal', 'title'].unique()
metal_unique.size

4737

#### List of post titles that appear in both subreddits

In [14]:
# Titles present in both subreddits, in the order they occur in `lit_unique`.
# Membership is tested against a set built once, rather than scanning the whole
# `metal_unique` array again for every title.
metal_title_set = set(metal_unique)
shared_titles = [title for title in lit_unique if title in metal_title_set]

In [15]:
# How many distinct titles were posted in both subreddits.
len(shared_titles)

77

#### Checking that unique titles calculations are valid

In [16]:
# Inclusion-exclusion sanity check: distinct titles overall = lit + metal - shared.
expected_unique = len(lit_unique) + len(metal_unique) - len(shared_titles)
len(unique) == expected_unique

True

#### Most common post title that appears in both subreddits

In [17]:
# Find the shared title that occurs most often in the whole dataset.
# Occurrences are counted once up front instead of re-filtering the full frame
# twice per shared title inside a loop.
title_counts = data['title'].value_counts()
# max() keeps the first title on ties (the same outcome as a strict `>` scan), and
# default='' covers an empty `shared_titles`, in which case count ends up 0.
most_common_title = max(shared_titles, key=lambda title: title_counts.get(title, 0), default='')
count = int(title_counts.get(most_common_title, 0))
print(f'The most common post title that appears in both subreddits is "{most_common_title}", appearing {count} times')

The most common post title that appears in both subreddits is "4 Tips to keep yourself mentally fit", appearing 10 times


In [18]:
# Most frequent title within r/natureismetal; mode() returns a Series because ties are possible.
data.loc[data['subreddit'] == 'natureismetal', 'title'].mode()

0    Watch "No Copyright Free , Nature Videos Copyright Free Download, Copyright Free Background, 4k Free Stock" on YouTube
dtype: object

#### Most common post title that appears in the dataset

In [35]:
# Most frequent title in the whole dataset, shown with its number of occurrences.
# NOTE: this rebinds `most_common_title` (a plain str in the earlier shared-title cell)
# to the Series returned by mode().
most_common_title = data['title'].mode()
top_title = most_common_title.iloc[0]
top_title, len(data[data['title'] == top_title])

('Watch "No Copyright Free , Nature Videos Copyright Free Download, Copyright Free Background, 4k Free Stock" on YouTube',
 13)

### When I started analyzing the data, I learned that r/NatureIsFuckingLit has a rule (rule #2) that dictates: "Titles must start with the 🔥 emoji"
#### This will have a significant impact on title-based classification. I may end up building some models where the 🔥 is stripped from all titles and others where it is left in, since keeping it could make classification too "easy" for the model.
#### This rule supposedly is enforced under threat of post deletion, but there are many submissions that do not contain the 🔥 emoji.

In [20]:
# First three r/NatureIsFuckingLit titles that contain the 🔥 emoji.
# Both conditions are combined into one mask and applied with a single .loc call,
# instead of chaining two boolean selections where the second mask (built on the
# full frame) has to be re-aligned to the already-filtered Series.
data.loc[
    (data['subreddit'] == 'NatureIsFuckingLit') & data['title'].str.contains('🔥'),
    'title',
].head(3)

5002                  🔥 kissing camels, Desert, Algeria 🐪
5003                                   🔥 Piebald Peacocks
5005    🔥 Rhino running through the savannah. So Powerful
Name: title, dtype: object

#### Number of Posts in r/NatureIsFuckingLit that contain a 🔥 emoji

In [21]:
# Count of r/NatureIsFuckingLit titles containing 🔥 (summing the boolean mask counts the True values).
data.loc[data['subreddit'] == 'NatureIsFuckingLit', 'title'].str.contains('🔥').sum()

2817

#### Number of Posts in r/natureismetal that contain a 🔥 emoji

In [22]:
# Count of r/natureismetal titles containing 🔥 (summing the boolean mask counts the True values).
data.loc[data['subreddit'] == 'natureismetal', 'title'].str.contains('🔥').sum()

17

In [23]:
# r/natureismetal posts whose title contains the 🔥 emoji, as a one-column DataFrame.
# One combined mask with .loc replaces the chained boolean selections, which made
# pandas re-align a full-length mask to an already-filtered frame and emit a warning.
data.loc[
    (data['subreddit'] == 'natureismetal') & data['title'].str.contains('🔥'),
    ['title'],
]

  """Entry point for launching an IPython kernel.


Unnamed: 0,title
9,🔥 Giant Earthworm
311,🔥 ITAP of an eagle eating a jackal that had been killed by a lion (NSFL).
597,🔥 Cormorant killed by own appetite [Waal river-Holland]
1080,Pointed skiess🔥
1180,🔥 Just a few days left until the Gravis Finance IDO starts! Join our simple Twitter contest
1926,🔥 Iceland filmed like I just watched Oblivion
2646,Sitting on the bench while listening to some music and watching nature burn in sunset flames 🔥
2891,🔥 this sleek bird is a cedar waxwing - the name comes from the red waxy wingtips rather than the overall smooth appearance
2901,This CHONKY bug i found🔥
3288,🔥 Camouflage!


### Create `title_length` (character count) and `title_word_count` columns for the dataframe to facilitate exploratory data visuals

In [24]:
# Character count of each title, computed with the vectorised string accessor
# rather than a Python-level loop over the column.
data['title_length'] = data['title'].str.len()

In [25]:
# Number of whitespace-separated words in each title: split with the vectorised
# string accessor, then take the length of each resulting list.
data['title_word_count'] = data['title'].str.split().str.len()

In [26]:
# Summary statistics for the two numeric title-length columns added above.
data.describe()

Unnamed: 0,title_length,title_word_count
count,9996.0,9996.0
mean,56.127651,9.895758
std,46.63815,8.179608
min,1.0,1.0
25%,28.0,5.0
50%,43.5,8.0
75%,67.0,12.0
max,304.0,61.0


In [27]:
# Same summary statistics, split by subreddit; transposed so each subreddit is a column.
data.groupby('subreddit').describe().T

Unnamed: 0,subreddit,NatureIsFuckingLit,natureismetal
title_length,count,4997.0,4999.0
title_length,mean,53.938963,58.315463
title_length,std,45.51742,47.636284
title_length,min,1.0,1.0
title_length,25%,27.0,29.0
title_length,50%,41.0,45.0
title_length,75%,64.0,70.0
title_length,max,304.0,302.0
title_word_count,count,4997.0,4999.0
title_word_count,mean,9.553932,10.237447


#### Curious about the titles that were only one character in length

In [28]:
# Rows whose title is exactly one character long.
data.loc[data['title_length'].eq(1)]

Unnamed: 0,subreddit,selftext,title,title_length,title_word_count
43,natureismetal,,.,1,1
3139,natureismetal,,🐇,1,1
5891,NatureIsFuckingLit,,🌆,1,1
7030,NatureIsFuckingLit,,A,1,1
8562,NatureIsFuckingLit,,🔥,1,1


In [29]:
# Write the cleaned frame, including the added length columns, to a new CSV.
# index=False keeps the DataFrame's row index out of the file.
data.to_csv('../data/edited_data.csv',index=False)