# Data Cleaning and Processing

This Notebook will process the data and extract all the relevant information needed for analysis. This will include:
- Feature Selection
- Imputing missing values
- Removing possible Duplicates (in case it was not properly preformed in the data collection process)
- Normalizing and Encoding categorical variables

Then the cleaned data will be exported into a format for the next step of EDA.


## Import Data

In [223]:
## Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

#Ignore warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [224]:
## Load data
agg_df = pd.read_csv('../2_EDA/data/00_master.csv')

print(agg_df.shape)
agg_df.head()

(13000, 118)


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,preview,is_gallery,media_metadata,gallery_data,author_cakeday,link_flair_template_id,poll_data,crosspost_parent_list,crosspost_parent,collections
0,,Army,This was a convo I had with one of my buddies ...,t2_3vmh30ad,False,,0,False,If you could create a new MRE based on a Fast ...,[],...,,,,,,,,,,
1,,Army,BLUF: how do you overcome imposter syndrome?\n...,t2_9mqncmmb,False,,0,False,how do you even Army?,[],...,,,,,,,,,,
2,,Army,"Long story short, my estranged (soon to be ex)...",t2_ag69n7u7,False,,0,False,Command Directed No-Contact Order?,[],...,,,,,,,,,,
3,,Army,"\nMy husband is 35T, and just graduated AIT. W...",t2_lb56g2zm,False,,0,False,Anyone 35T?,[],...,,,,,,,,,,
4,,Army,I could use some advice on going recruiting. I...,t2_i4rellgt,False,,0,False,Thinking of going recruiter as brand new E5,[],...,,,,,,,,,,


## Data Statistics

In [225]:
agg_df.columns

Index(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved',
       'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext',
       ...
       'preview', 'is_gallery', 'media_metadata', 'gallery_data',
       'author_cakeday', 'link_flair_template_id', 'poll_data',
       'crosspost_parent_list', 'crosspost_parent', 'collections'],
      dtype='object', length=118)

In [226]:
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13000 entries, 0 to 12999
Columns: 118 entries, approved_at_utc to collections
dtypes: bool(28), float64(26), int64(8), object(56)
memory usage: 9.3+ MB


In [227]:
agg_df.dtypes

approved_at_utc           float64
subreddit                  object
selftext                   object
author_fullname            object
saved                        bool
                           ...   
link_flair_template_id     object
poll_data                  object
crosspost_parent_list      object
crosspost_parent           object
collections                object
Length: 118, dtype: object

## Feature Selection

### Dropped Columns

In [228]:
# Drop columns: 'thumbnail_width', 'media_embed' and 74 other columns
agg_df = agg_df.drop(columns=['thumbnail_width', 'media_embed', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'approved_by', 'is_created_from_ads_ui', 'author_premium', 'gildings', 'content_categories', 'mod_note', 'wls', 'removed_by_category', 'banned_by', 'link_flair_type', 'allow_live_comments', 'selftext_html', 'likes', 'suggested_sort', 'banned_at_utc', 'view_count', 'archived', 'is_crosspostable', 'pinned', 'all_awardings', 'awarders', 'media_only', 'can_gild', 'spoiler', 'treatment_tags', 'visited', 'removed_by', 'num_reports', 'distinguished', 'subreddit_id', 'author_is_blocked', 'mod_reason_by', 'removal_reason', 'link_flair_background_color', 'id', 'is_robot_indexable', 'report_reasons', 'discussion_type', 'send_replies', 'whitelist_status', 'contest_mode', 'mod_reports', 'author_patreon_flair', 'author_flair_text_color', 'permalink', 'parent_whitelist_status', 'stickied', 'url', 'subreddit_subscribers', 'created_utc', 'num_crossposts', 'media', 'post_hint', 'url_overridden_by_dest', 'preview', 'is_gallery', 'media_metadata', 'gallery_data', 'author_cakeday', 'link_flair_template_id', 'poll_data', 'crosspost_parent_list', 'crosspost_parent', 'collections', 'approved_at_utc', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'top_awarded_type', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'author_flair_background_color', 'subreddit_type', 'total_awards_received'])

In [229]:
agg_df.isnull().sum().sort_values(ascending=False)


author_flair_css_class     11820
author_flair_text           9614
thumbnail                   2500
selftext                    1974
author_flair_richtext         60
author_fullname               60
author_flair_type             60
created                        0
num_comments                   0
author                         0
locked                         0
over_18                        0
no_follow                      0
domain                         0
subreddit                      0
is_self                        0
edited                         0
score                          0
ups                            0
upvote_ratio                   0
subreddit_name_prefixed        0
link_flair_richtext            0
title                          0
is_video                       0
dtype: int64

### Author Flair CSS Class

In [230]:
#Replace null values with 'blank' in 'author_flair_css_class'
agg_df['author_flair_css_class'].value_counts(ascending=True)


author_flair_css_class
recruiter                   2
civilaffairs                3
electronicwarfare           3
darkgoldenrod               5
none                        5
jag                         5
Chaplain                    5
spc                         5
chemical                    7
medicalservice              8
a                          10
Cyber                      13
AquisitionCorps            15
cavalry                    18
publicaffairs              20
psycologicaloperations     22
medicalspecialist          23
transportation             24
armor                      30
militarypolice             32
fieldartillery             36
airdefenseartillery        39
adjutantgeneral            40
quartermaster              44
engineer                   46
ordnance                   60
medical                    66
militaryintelligence       77
aviation                   79
signal                    110
blank                     152
infantry                  176
Name: count, dtyp

### Author flair text

In [231]:
agg_df['author_flair_text'].value_counts()
agg_df.drop(columns=['author_flair_text'], inplace=True)

### Thumbnail

In [232]:
agg_df['thumbnail'].value_counts()
agg_df.drop(columns=['thumbnail'], inplace=True)

### Self Text

In [233]:
agg_df['selftext'].value_counts()

selftext
My plan when I joined the Marine Corps was only to do a single contract. Serve my time while I'm young and capable, do a good job, then get out... I feel I've succeeded in these goals and made a tangible and identifiable difference. Joining this organization has been the most influential and life changing decision I've ever made in my life. The experiences and lessons learned here have completely changed my outlook on life and reversed negative mental habits I sustained as a kid (most of us here can relate).\n\nNow, I'm finding myself having made it on the CRP, and it's creating friction in what I thought was an already decided decision to get out.\n\nDidn't utilize my TA for three years? Now, I could take advantage of it with a second contract. Fatigued from the workload of being a communicator in a Victor unit? Now I have the opportunity to work at a reserve station in my home state. \n\nMy long-term career plan is to run for public office, and that is unmovable and defined.

### Author Flair richtext

In [234]:
agg_df['author_flair_richtext'].value_counts()
agg_df.drop(columns=['author_flair_richtext'], inplace=True)

### Author Full Name

In [235]:
agg_df['author_fullname'].value_counts()
agg_df.drop(columns=['author_fullname'], inplace=True)

### Author Flair Type

In [236]:
agg_df['author_flair_type'].value_counts()


author_flair_type
text        11215
richtext     1725
Name: count, dtype: int64

### Original Columns

In [237]:
print(agg_df.shape)
agg_df.dtypes

(13000, 20)


subreddit                   object
selftext                    object
title                       object
link_flair_richtext         object
subreddit_name_prefixed     object
upvote_ratio               float64
ups                          int64
score                        int64
edited                      object
author_flair_css_class      object
is_self                       bool
created                    float64
author_flair_type           object
domain                      object
no_follow                     bool
over_18                       bool
locked                        bool
author                      object
num_comments                 int64
is_video                      bool
dtype: object

## Imputing missing Values

### Author Flair CSS Class

In [238]:
agg_df['author_flair_css_class'].fillna('blank', inplace=True)
agg_df['author_flair_css_class'].value_counts()
print(f'Author Flair CSS Class Null is: {agg_df["author_flair_css_class"].isnull().sum()}')

Author Flair CSS Class Null is: 0


### Author Flair Type

In [239]:
agg_df['author_flair_type'].fillna('unknown', inplace=True)
agg_df['author_flair_type'].value_counts()
print(f'Author Flair Type Null is: {agg_df["author_flair_type"].isnull().sum()}')

Author Flair Type Null is: 0


### Self Text

In [240]:
agg_df['selftext'].fillna('title_only', inplace=True)
agg_df['selftext'].value_counts()
print(f'Self Text Null is: {agg_df["selftext"].isnull().sum()}')

Self Text Null is: 0


### Values Count

In [241]:
agg_df.isnull().sum().sort_values(ascending=False)

subreddit                  0
selftext                   0
num_comments               0
author                     0
locked                     0
over_18                    0
no_follow                  0
domain                     0
author_flair_type          0
created                    0
is_self                    0
author_flair_css_class     0
edited                     0
score                      0
ups                        0
upvote_ratio               0
subreddit_name_prefixed    0
link_flair_richtext        0
title                      0
is_video                   0
dtype: int64

## Removing Duplicates

In [242]:
agg_df.duplicated(subset=['author']).sum()

10967

In [243]:
agg_df.duplicated(subset=['title']).sum()

10568

In [244]:
agg_df.duplicated(subset=['selftext']).sum()

10858

In [245]:
agg_df.drop_duplicates(subset=['selftext'], inplace=True)

agg_df.duplicated(subset=['selftext']).sum()

0

In [246]:
agg_df.shape

(2142, 20)

## Normalizing and Encoding

# Cleaned Data