In [2]:
import pandas as pd
import re
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Read the raw export, keep only the text and date columns, and expose them
# under the names the rest of the notebook uses ("tweets" and "time").
data = pd.read_excel('data/ldareddit.xlsx')
public_tweets = data[['body', 'date']].rename(
    columns={'body': "tweets", 'date': "time"}
)
public_tweets

Unnamed: 0,tweets,time
0,I don't have a dog in this fight but reading t...,2016-06-23
1,"I'm certainly no expert, but that seems like a...",2016-06-03
2,"To _______,\n\nTITLE\n\nGENRE\n\nLOGLINE\n\nTh...",2016-06-04
3,"Oh, wow, this is bad.\n\nFirstly, you don't wr...",2016-06-04
4,"\nHey there, have you read the posting guideli...",2016-06-03
...,...,...
2104,300 buildings threatened by California wildfire.,2016-09-27
2105,In Photos: California Wildfire Endangers Canna...,2016-09-28
2106,On This Date In California Weather History (Se...,2016-09-28
2107,California Mountain Homes Threatened by Destru...,2016-09-28


In [1]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Fetch every NLTK resource this notebook uses *before* the first lookup:
#   punkt                      -> word_tokenize
#   stopwords                  -> stopwords.words('english')
#   wordnet                    -> WordNetLemmatizer / wordnet.synset
#   averaged_perceptron_tagger -> pos_tag
# Previously only punkt and wordnet were fetched, so a fresh environment failed
# with LookupError on the stop-word list and on pos_tag.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

stop_words = stopwords.words('english')
# Tokens rejected by tweet_filter: English stop words plus single punctuation characters.
custom = stop_words+list(punctuation)
wordnet_lemmatizer = WordNetLemmatizer()


In [3]:
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus import wordnet

def get_pos(word):
    """Return the lower-cased first letter of the POS tag for `word`.

    `word` is tokenized and tagged on its own (no sentence context) and only
    the tag of the first resulting token is used.

    The tagger emits upper-case tags (e.g. 'NN', 'VB', 'JJ', 'RB'), while every
    caller in this notebook compares against lower-case letters ('n', 'v', 'j',
    'r', ...), so the letter is lower-cased here.

    Returns:
        A one-character string such as 'n', 'v', 'j' or 'r', or '' when
        `word` tokenizes to nothing (e.g. an empty string).
    """
    tokens = word_tokenize(word)
    if not tokens:
        # Nothing to tag; previously this raised IndexError.
        return ''
    tag = pos_tag(tokens)[0][1]
    return tag[:1].lower()
 
lem = WordNetLemmatizer()

def lemma(word):
    """Lemmatize `word` using the part of speech reported by `get_pos`.

    - adverbs ('r'): try the pertainym of the word's first adverb synset and
      fall back to the default lemmatizer when that lookup fails;
    - adjectives ('j', rewritten to WordNet's 'a'), satellite adjectives ('s')
      and verbs ('v'): lemmatize with that POS;
    - anything else: lemmatize with the lemmatizer's default POS.

    Returns:
        The lemma as a string.
    """
    # Normalise case locally so the comparisons below hold whichever case
    # get_pos hands back.
    pos_label = get_pos(word).lower()
    if pos_label == 'j':
        pos_label = 'a'    # tagger letter 'j' <--> WordNet letter 'a'
    if pos_label == 'r':  # For adverbs it's a bit different
        try:
            return wordnet.synset(word+'.r.1').lemmas()[0].pertainyms()[0].name()
        except Exception:
            # Best effort: no such synset / no pertainym -> plain lemmatization.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            return lem.lemmatize(word)
    if pos_label in ('a', 's', 'v'):  # For adjectives and verbs
        return lem.lemmatize(word, pos=pos_label)
    # For nouns and everything else the lemmatizer's default POS is used.
    return lem.lemmatize(word)
    
def tweet_filter(tweets):
    """Clean every text in `tweets` and return the results as a list of strings.

    Per text, in order:
      1. strip http/https URLs (the pattern only covers letters, digits, '.' and '/');
      2. strip 'RT', '@' or '#' together with the letters/digits/slashes that follow
         (this runs before lower-casing, so the 'RT' match is case-sensitive);
      3. strip a single leading and a single trailing space;
      4. drop non-ASCII characters and lower-case the text;
      5. tokenize, then keep a token only if it is longer than 3 characters, is
         not in `custom`, contains no digit, and is tagged as an adjective, verb
         or noun;
      6. lemmatize the surviving tokens and join them with single spaces.

    Args:
        tweets: iterable of strings.

    Returns:
        list[str]: one cleaned string per input text, in input order.
    """
    excluded = set(custom)  # set lookup instead of scanning the list per token
    kept_pos = {'a', 's', 'v', 'n'}

    tweet_corpus = []
    for i in tweets:
        a = re.sub("(http|https)://[.a-z/0-9/A-Z]*","",i)  # removing url links
        a = re.sub("(RT|@|#)[a-z/A-Z/0-9]*","",a) # removing usernames
        a = re.sub(r"(^ | $)","",a) # removing white space
        a = a.encode("ascii", "ignore").decode("ascii") # remove non-ascii
        a = a.lower()

        words = []
        # The sentence-level pos_tag pass was dropped: its tags were discarded
        # and each token is tagged individually via get_pos anyway.
        for word in nltk.word_tokenize(a):
            # Cheap checks first so the tagger only runs on plausible tokens.
            if len(word) <= 3:  # drop words of 3 characters or fewer
                continue
            if word in excluded:  # remove stopwords and punctuation
                continue
            if any(c.isdigit() for c in word):  # remove tokens containing digits
                continue
            # Normalise the tag letter the same way `lemma` does: lower-case, and
            # the tagger's adjective letter 'j' counts as WordNet's 'a'. Without
            # this the membership test below never matched.
            pos_label = get_pos(word).lower()
            if pos_label == 'j':
                pos_label = 'a'
            if pos_label in kept_pos:
                words.append(lemma(word))  # lemmatize words

        tweet_corpus.append(" ".join(words))
    return tweet_corpus

In [4]:
# Coerce every entry to str so tweet_filter can run its regexes on each one.
public_tweets["tweets"]=public_tweets["tweets"].apply(str)
tweet_fill = tweet_filter(public_tweets["tweets"])

# tweet_filter returns a plain Python list, which has no .to_excel method.
# Wrap it in a frame that reuses the source row index before saving.
pd.DataFrame({"tweets": tweet_fill}, index=public_tweets.index).to_excel(
    'data/redditfilter.xlsx', index=True
)

# Last expression of the cell, so the cleaned texts are displayed.
tweet_fill

['fight reading sound spoon liar.. liar tend explain minor detail people result overthinking detail try make sure cover base tell story hop suck me.. hurt anyone story scream bs.. know go scam mind fell doubt scam knew it..',
 'expert seem text query look nail beat conflict theme script source rando write query letter',
 '_______ title genre logline thanks time consideration mark burkholder',
 'write synopsis go read synopsis query logline something query tell reader write brevity overwriter query letter overwritten title genre logline thank mark burkholder irrelevant compare mistake make living fuck query anyone write screenplay idea year take learn craft idea struggle effort apply zero business send query script write feature length script think send query send query script dumbest idea dumb idea show respect respect craft respect manager send read script know script fuck belief fact send thing right go feel fuck dumb manager asks manager',
 'read post guideline expect feedback reque

In [5]:
# Store the cleaned text on public_tweets itself (the original text column is
# overwritten), then build `df` as a new frame ordered by the `time` column.
public_tweets["tweets"] = tweet_fill
df = public_tweets.sort_values(by='time')
df

Unnamed: 0,tweets,time
1548,date california weather history jose report te...,2016-06-01
1,expert seem text query look nail beat conflict...,2016-06-03
1549,feedback seek feedback query letter feature sc...,2016-06-03
4,read post guideline expect feedback request in...,2016-06-03
1309,illinois kindle california wildfire use metaphor,2016-06-03
...,...,...
2106,date california weather history september sant...,2016-09-28
2108,palpitare demon lover.when turn sixteen mastur...,2016-09-29
1545,depends use tree tree come ground vacuum carbo...,2016-09-30
1547,christianity spread wildfire rome capitalize s...,2016-09-30


In [8]:
def my_tokenizer(s):
    """Split the string `s` into a list of tokens with NLTK's word_tokenize."""
    return nltk.tokenize.word_tokenize(s)

# `df` carries the columns "tweets" and "time"; there is no "title" column, so
# `df.title` raised AttributeError. Tokenize the cleaned text column instead,
# using item assignment so the result is stored as a real column.
df["tweets"] = df["tweets"].apply(my_tokenizer)
df.to_excel('data/redditcorpus.xlsx', index=True)