# Word Frequency Analysis of Posts

This notebook is used to analyze the word frequencies in top posts, grouped by subreddit.

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data Loading

In [6]:
# Load the posts dataset from CSV and preview its first rows
df = pd.read_csv("../data/updated_datav3.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,post,score,subreddit,title,url,subreddit_name
0,0,0,,53782,0,"New ""Discovery Mode"" turns video game ""Assassi...",https://www.theverge.com/2018/2/20/17033024/as...,history
1,1,1,"Hi everyone and especially our students,\r\r\n...",38426,0,We are not here to help you with your End of T...,https://www.reddit.com/r/history/comments/8pw3...,history
2,2,2,,35982,0,A 1776 excerpt from John Adam's diary where he...,https://founders.archives.gov/documents/Adams/...,history
3,3,3,,34908,0,Famous Viking warrior burial revealed to be th...,http://www.news.com.au/technology/science/arch...,history
4,4,4,,34197,0,"3,000-year-old underwater castle discovered in...",https://inhabitat.com/3000-year-old-underwater...,history


#### Removing Unnamed Column

In [7]:
# Drop the leftover "Unnamed: 0" column and save the cleaned frame.
# A new frame is assigned instead of mutating in place, and errors="ignore"
# keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.to_csv("../data/updated_datav4.csv", index=False)

### Function to Get Highest Word Counts

Let's first separate our data by subreddit:

In [35]:
# Function to split a post into words, separating out punctuation marks

def split_post(post):
    """Tokenize a post into a list of whitespace-separated words.

    Each of the characters . , ; ? " ! is first padded with a space on both
    sides, so it comes out as its own token instead of staying attached to
    the neighbouring word.
    """
    padded = post
    for mark in '.,;?"!':
        padded = padded.replace(mark, " " + mark + " ")

    # split() with no arguments breaks on any run of whitespace
    return padded.split()

# Function to get the word counts of words in a subreddit

def get_word_counts(ind, data=None):
    """Count how often each word occurs in the posts of one subreddit.

    For every row whose ``subreddit`` value equals ``ind``, the text of the
    ``post`` column is tokenized with ``split_post``; when ``post`` is missing,
    the ``title`` column is used instead. Rows where both are missing are
    skipped.

    Parameters
    ----------
    ind
        Value to match against the ``subreddit`` column.
    data : pandas.DataFrame, optional
        Frame with ``subreddit``, ``post`` and ``title`` columns. When omitted,
        the notebook-level ``df`` is used.

    Returns
    -------
    dict
        Maps each word to its number of occurrences. The standalone tokens
        ``" ? ! . , ;`` are not counted.
    """
    if data is None:
        data = df  # fall back to the frame loaded earlier in the notebook

    punctuation = {'"', '?', '!', '.', ',', ';'}
    word_dict = {}
    sr_df = data[data["subreddit"] == ind]

    for post, title in zip(sr_df["post"], sr_df["title"]):
        # pd.isna() instead of `is np.nan`: an identity check only matches the
        # np.nan singleton and misses other NaN / None / pd.NA objects.
        text = title if pd.isna(post) else post
        if pd.isna(text):
            # Neither post body nor title is available; nothing to count
            continue

        for word in split_post(text):
            if word in punctuation:
                continue
            word_dict[word] = word_dict.get(word, 0) + 1

    return word_dict
            
# Word counts for subreddit value 0 (the rows previewed after loading show
# subreddit 0 with subreddit_name "history")
hist_word_dict = get_word_counts(0)

In [44]:
# Turn the word -> count mapping into a two-column frame, most frequent first
keys, vals = list(hist_word_dict), list(hist_word_dict.values())
hist_word = pd.DataFrame({"word": keys, "count": vals})

sorted_hist_word = hist_word.sort_values(by="count", ascending=False)


In [47]:
# Show the 100 rows with the highest counts (the frame is sorted by count, descending)
sorted_hist_word.head(100)

Unnamed: 0,word,count
31,the,4959
19,of,2675
36,to,2440
24,and,2408
10,a,1998
127,in,1762
177,I,1398
150,was,1163
30,that,1122
34,is,839
