# Data Cleaning and Processing

This Notebook will process the data and extract all the relevant information needed for analysis. This will include:
- Feature Selection
- Imputing missing values
- Removing possible Duplicates (in case it was not properly preformed in the data collection process)
- Normalizing and Encoding categorical variables

Then the cleaned data will be exported into a format for the next step of EDA.


## Import Data

In [193]:
## Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

#Ignore warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [194]:
## Load data
agg_df = pd.read_csv('../2_EDA/data/00_master.csv')

print(agg_df.shape)
agg_df.head()

(9600, 118)


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,preview,is_gallery,media_metadata,gallery_data,author_cakeday,link_flair_template_id,poll_data,crosspost_parent_list,crosspost_parent,collections
0,,Army,This was a convo I had with one of my buddies ...,t2_3vmh30ad,False,,0,False,If you could create a new MRE based on a Fast ...,[],...,,,,,,,,,,
1,,Army,BLUF: how do you overcome imposter syndrome?\n...,t2_9mqncmmb,False,,0,False,how do you even Army?,[],...,,,,,,,,,,
2,,Army,"Long story short, my estranged (soon to be ex)...",t2_ag69n7u7,False,,0,False,Command Directed No-Contact Order?,[],...,,,,,,,,,,
3,,Army,"\nMy husband is 35T, and just graduated AIT. W...",t2_lb56g2zm,False,,0,False,Anyone 35T?,[],...,,,,,,,,,,
4,,Army,I could use some advice on going recruiting. I...,t2_i4rellgt,False,,0,False,Thinking of going recruiter as brand new E5,[],...,,,,,,,,,,


## Data Statistics

In [195]:
agg_df.columns

Index(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved',
       'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext',
       ...
       'preview', 'is_gallery', 'media_metadata', 'gallery_data',
       'author_cakeday', 'link_flair_template_id', 'poll_data',
       'crosspost_parent_list', 'crosspost_parent', 'collections'],
      dtype='object', length=118)

In [196]:
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9600 entries, 0 to 9599
Columns: 118 entries, approved_at_utc to collections
dtypes: bool(28), float64(26), int64(8), object(56)
memory usage: 6.8+ MB


In [197]:
agg_df.dtypes

approved_at_utc           float64
subreddit                  object
selftext                   object
author_fullname            object
saved                        bool
                           ...   
link_flair_template_id     object
poll_data                  object
crosspost_parent_list      object
crosspost_parent           object
collections                object
Length: 118, dtype: object

## Feature Selection

### Dropped Columns

In [198]:
# Drop columns: 'thumbnail_width', 'media_embed' and 74 other columns
agg_df = agg_df.drop(columns=['thumbnail_width', 'media_embed', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'approved_by', 'is_created_from_ads_ui', 'author_premium', 'gildings', 'content_categories', 'mod_note', 'wls', 'removed_by_category', 'banned_by', 'link_flair_type', 'allow_live_comments', 'selftext_html', 'likes', 'suggested_sort', 'banned_at_utc', 'view_count', 'archived', 'is_crosspostable', 'pinned', 'all_awardings', 'awarders', 'media_only', 'can_gild', 'spoiler', 'treatment_tags', 'visited', 'removed_by', 'num_reports', 'distinguished', 'subreddit_id', 'author_is_blocked', 'mod_reason_by', 'removal_reason', 'link_flair_background_color', 'id', 'is_robot_indexable', 'report_reasons', 'discussion_type', 'send_replies', 'whitelist_status', 'contest_mode', 'mod_reports', 'author_patreon_flair', 'author_flair_text_color', 'permalink', 'parent_whitelist_status', 'stickied', 'url', 'subreddit_subscribers', 'created_utc', 'num_crossposts', 'media', 'post_hint', 'url_overridden_by_dest', 'preview', 'is_gallery', 'media_metadata', 'gallery_data', 'author_cakeday', 'link_flair_template_id', 'poll_data', 'crosspost_parent_list', 'crosspost_parent', 'collections', 'approved_at_utc', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'top_awarded_type', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'author_flair_background_color', 'subreddit_type', 'total_awards_received'])

In [199]:
agg_df.isnull().sum().sort_values(ascending=False)


author_flair_css_class     8705
author_flair_text          7094
thumbnail                  1800
selftext                   1471
author_flair_richtext        43
author_fullname              43
author_flair_type            43
created                       0
num_comments                  0
author                        0
locked                        0
over_18                       0
no_follow                     0
domain                        0
subreddit                     0
is_self                       0
edited                        0
score                         0
ups                           0
upvote_ratio                  0
subreddit_name_prefixed       0
link_flair_richtext           0
title                         0
is_video                      0
dtype: int64

### Author Flair CSS Class

In [200]:
#Replace null values with 'blank' in 'author_flair_css_class'
agg_df['author_flair_css_class'].value_counts(ascending=True)


author_flair_css_class
civilaffairs                1
recruiter                   1
electronicwarfare           2
darkgoldenrod               4
Chaplain                    4
jag                         4
none                        4
spc                         4
chemical                    5
medicalservice              6
a                           8
Cyber                       9
AquisitionCorps            12
cavalry                    14
medicalspecialist          16
publicaffairs              16
psycologicaloperations     17
transportation             18
armor                      23
militarypolice             24
fieldartillery             27
adjutantgeneral            29
airdefenseartillery        31
quartermaster              34
engineer                   35
ordnance                   44
medical                    50
militaryintelligence       58
aviation                   60
signal                     83
blank                     117
infantry                  135
Name: count, dtyp

### Author flair text

In [201]:
agg_df['author_flair_text'].value_counts()
agg_df.drop(columns=['author_flair_text'], inplace=True)

### Thumbnail

In [202]:
agg_df['thumbnail'].value_counts()
agg_df.drop(columns=['thumbnail'], inplace=True)

### Self Text

In [203]:
agg_df['selftext'].value_counts()

selftext
What happens if you forget to submit you billet accomplishments and it’s literally left blank and got routed up?                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

### Author Flair richtext

In [204]:
agg_df['author_flair_richtext'].value_counts()
agg_df.drop(columns=['author_flair_richtext'], inplace=True)

### Author Full Name

In [205]:
agg_df['author_fullname'].value_counts()
agg_df.drop(columns=['author_fullname'], inplace=True)

### Author Flair Type

In [206]:
agg_df['author_flair_type'].value_counts()


author_flair_type
text        8272
richtext    1285
Name: count, dtype: int64

### Original Columns

In [207]:
print(agg_df.shape)
agg_df.dtypes

(9600, 20)


subreddit                   object
selftext                    object
title                       object
link_flair_richtext         object
subreddit_name_prefixed     object
upvote_ratio               float64
ups                          int64
score                        int64
edited                      object
author_flair_css_class      object
is_self                       bool
created                    float64
author_flair_type           object
domain                      object
no_follow                     bool
over_18                       bool
locked                        bool
author                      object
num_comments                 int64
is_video                      bool
dtype: object

## Imputing missing Values

### Author Flair CSS Class

In [208]:
agg_df['author_flair_css_class'].fillna('blank', inplace=True)
agg_df['author_flair_css_class'].value_counts()
print(f'Author Flair CSS Class Null is: {agg_df["author_flair_css_class"].isnull().sum()}')

Author Flair CSS Class Null is: 0


### Author Flair Type

In [209]:
agg_df['author_flair_type'].fillna('unknown', inplace=True)
agg_df['author_flair_type'].value_counts()
print(f'Author Flair Type Null is: {agg_df["author_flair_type"].isnull().sum()}')

Author Flair Type Null is: 0


### Self Text

In [210]:
agg_df['selftext'].fillna('title_only', inplace=True)
agg_df['selftext'].value_counts()
print(f'Self Text Null is: {agg_df["selftext"].isnull().sum()}')

Self Text Null is: 0


### Values Count

In [211]:
agg_df.isnull().sum().sort_values(ascending=False)

subreddit                  0
selftext                   0
num_comments               0
author                     0
locked                     0
over_18                    0
no_follow                  0
domain                     0
author_flair_type          0
created                    0
is_self                    0
author_flair_css_class     0
edited                     0
score                      0
ups                        0
upvote_ratio               0
subreddit_name_prefixed    0
link_flair_richtext        0
title                      0
is_video                   0
dtype: int64

## Removing Duplicates

In [215]:
agg_df.duplicated(subset=['author']).sum()

7756

In [214]:
agg_df.duplicated(subset=['title']).sum()

7413

In [216]:
agg_df.duplicated(subset=['selftext']).sum()

7689

In [221]:
agg_df.drop_duplicates(subset=['selftext'], inplace=True)

agg_df.duplicated(subset=['selftext']).sum()

0

In [222]:
agg_df.shape

(1911, 20)

## Normalizing and Encoding

# Cleaned Data