In [1]:
import praw
import pandas as pd
import re
import string
import nltk
from collections import Counter
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from matplotlib import pyplot
%matplotlib inline

[nltk_data] Downloading package stopwords to /home/keras/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def get_subreddit(subreddit_name, app_name, oa_script, oa_secret, pwd,
                  username='neurodivergent_ai', limit=1000):
    '''
    logs in to the reddit api through praw and requests
    the top posts of a subreddit.

    prints the titles of the 5 top posts as a quick check.

    parameters:

    * subreddit_name: name of the subreddit to query
    * app_name: passed to praw as the user agent
    * oa_script: path of a text file holding the oauth client id
    * oa_secret: path of a text file holding the oauth client secret
    * pwd: path of a text file holding the account password
    * username: reddit account used for the login. defaults to the
      account name that used to be hard-coded in this function
    * limit: maximum number of top posts to request (default 1000)

    returns the listing produced by subreddit.top(limit=limit).

    '''

    def read_credential(path):

        # each credential sits alone in its own text file;
        # strip() drops the trailing newline editors tend to add

        with open(path) as f:

            return f.read().strip()

    script = read_credential(oa_script)

    secret = read_credential(oa_secret)

    pw = read_credential(pwd)

    # get reddit object

    reddit = praw.Reddit(client_id=script,
                         client_secret=secret,
                         user_agent=app_name,
                         username=username,
                         password=pw)

    # get subreddit

    subreddit = reddit.subreddit(subreddit_name)

    # get top posts

    top = subreddit.top(limit=limit)

    # the preview requests its own short listing instead of iterating `top`,
    # presumably so the returned listing is handed to the caller unconsumed

    for post in subreddit.top(limit=5):

        print(post.title, '\n')

    return top


def make_subreddit_df(subreddit_object):
    '''
    converts an iterable of reddit posts into a dataframe,
    one row per post, with the columns:

    * title      <- post.title
    * url        <- post.url
    * date       <- post.created
    * score      <- post.score
    * n_comments <- post.num_comments
    * body       <- post.selftext

    '''

    # dataframe column name -> attribute read from each post
    # (dict order fixes both the column order and the read order)

    field_map = {'title': 'title',
                 'url': 'url',
                 'date': 'created',
                 'score': 'score',
                 'n_comments': 'num_comments',
                 'body': 'selftext'}

    collected = {column: [] for column in field_map}

    # walk the posts once, filling every column list

    for post in subreddit_object:

        for column, attribute in field_map.items():

            collected[column].append(getattr(post, attribute))

    return pd.DataFrame(collected)

# text cleaning: 2 different versions
# light clean for model training, heavy clean for vocab work

def light_clean(doc):
    '''
    light text prep for word embedding training.

    * splits the doc on whitespace
    * deletes ascii punctuation inside every token
    * drops tokens that are not purely alphabetic afterwards
      (numbers, mixed tokens, tokens that were only punctuation)
    * lowercases what is left

    returns the list of clean tokens.

    '''

    # translation table that deletes every character of string.punctuation

    drop_punctuation = str.maketrans('', '', string.punctuation)

    kept_tokens = []

    for raw_token in doc.split():

        token = raw_token.translate(drop_punctuation)

        # isalpha() is False for the empty string, so tokens made
        # of punctuation only are discarded here as well

        if token.isalpha():

            kept_tokens.append(token.lower())

    return kept_tokens

def super_clean(doc, stop_words=None):
    '''
    cleans a single input document.
    preps for vocabulary analysis.
    applies more processing than
    light_clean():

    * tokenizes (whitespace split)
    * removes punctuation & numbers / non-alpha tokens
    * lowercase
    * removes stopwords
    * removes one-character tokens

    parameters:

    * doc: the document, as a string
    * stop_words: optional iterable of stop words. when None
      (the default) the nltk english stop word list is used

    returns the list of clean tokens.

    '''

    # regex that matches any single ascii punctuation character

    rgx_punct_filter = re.compile('[%s]' % re.escape(string.punctuation))

    if stop_words is None:

        stop_words = stopwords.words('english')

    # tokens lose their punctuation before the stop word check, so a
    # contraction like "don't" arrives here as "dont". keep each stop
    # word both as given and with its punctuation stripped, otherwise
    # contractions slip through the filter.

    stop_set = set()

    for stop_word in stop_words:

        stop_word = stop_word.lower()

        stop_set.add(stop_word)

        stop_set.add(rgx_punct_filter.sub('', stop_word))

    clean_tokens = []

    for raw_token in doc.split():

        token = rgx_punct_filter.sub('', raw_token)

        # take out numbers & any other non-alpha tokens
        # (also covers tokens that are empty after stripping)

        if not token.isalpha():

            continue

        token = token.lower()

        # drop very short tokens and stop words

        if len(token) > 1 and token not in stop_set:

            clean_tokens.append(token)

    return clean_tokens

def clean_all_docs(docs):
    '''
    runs light_clean() over every document in docs.

    documents that come back empty (no tokens left
    after cleaning) are left out.

    returns a list of token lists.
    '''

    tokenized_docs = map(light_clean, docs)

    return [tokens for tokens in tokenized_docs if tokens]


def train_w2v_model(docs):
    '''
    trains word2vec model on input set of docs.
    fits PCA projection to 2d space.
    draws a 2d scatter plot of the words in vector space,
    on a new figure for every call.
    returns trained model.

    * docs: list of tokenized documents (lists of tokens)

    '''

    # train w2v model; min_count=1 keeps every word, however rare

    w2v_model = Word2Vec(docs, min_count=1)

    # word vectors live on model.wv; indexing the model object
    # itself is deprecated in gensim 3 and removed in gensim 4

    word_vectors = w2v_model.wv

    # get vocab: gensim >= 4 exposes index_to_key,
    # older releases expose the wv.vocab dict instead

    if hasattr(word_vectors, 'index_to_key'):

        vocab_list = list(word_vectors.index_to_key)

    else:

        vocab_list = list(word_vectors.vocab)

    # one vector per word, rows in the same order as vocab_list

    vector_vocab = word_vectors[vocab_list]

    # fit PCA model / 2d projection

    pca_model = PCA(n_components=2)

    pca_projection = pca_model.fit_transform(vector_vocab)

    # start a fresh figure so that repeated calls do not
    # pile their points and labels onto the same axes

    pyplot.figure()

    pyplot.scatter(pca_projection[:, 0], pca_projection[:, 1])

    # label every point with its word

    for i, word in enumerate(vocab_list):

        pyplot.annotate(word, xy=(pca_projection[i, 0], pca_projection[i, 1]))

    return w2v_model

    
def get_cos_sim(model, word_1, word_2):
    '''
    looks up the similarity of word_1 and word_2
    via model.wv.similarity() and returns it.
    '''

    word_vectors = model.wv

    similarity = word_vectors.similarity(word_1, word_2)

    return similarity


def compare_cos_sim(model, word_list, comp_word):
    '''
    compares comp_word against every word of word_list.

    returns a list with one entry per word, each entry being
    [comp_word, word, cosine similarity of word and comp_word]
    '''

    return [[comp_word, other_word, get_cos_sim(model, other_word, comp_word)]
            for other_word in word_list]


def comp_cos_sim_lists(model, list_1, list_2):
    '''
    compares every term of list_1 against all terms of list_2.

    terms of list_1 that also appear in list_2 are skipped.

    returns a list holding one compare_cos_sim() result
    (itself a list) per remaining term of list_1.
    '''

    return [compare_cos_sim(model, list_2, term)
            for term in list_1
            if term not in list_2]


def join_corpus(df):
    '''
    builds the raw corpus from a posts dataframe:
    all values of the 'title' column followed by
    all values of the 'body' column, as one list.
    '''

    titles = df['title']

    bodies = df['body']

    return [*titles, *bodies]


def get_vocab(docs):
    '''
    vocabulary processing
    applies super_clean() text prep to every doc
    (super_clean() removes stopwords, which gives a
    more useful vocabulary for analysis)

    returns a tuple of:

    * vocabulary as DataFrame: words as index, their counts
      in column 0, sorted by count, most frequent first
    * total number of words kept after cleaning
    * number of unique vocabulary words

    an empty corpus, or one where no word survives cleaning,
    yields an empty DataFrame and two zeros.
    '''

    words = []

    for doc in docs:

        words.extend(super_clean(doc))

    total_words = len(words)

    word_counts = Counter(words)

    unique_words = len(word_counts)

    # name the count column explicitly: without it an empty vocabulary
    # produces a frame that has no column 0 and the sort below raises KeyError

    vocab_df = pd.DataFrame(list(word_counts.values()),
                            index=list(word_counts.keys()),
                            columns=[0])

    # return df sorted by word frequency

    return vocab_df.sort_values(by=[0], ascending=False), total_words, unique_words


def list_sims_for_mean(sim_comp_list):
    '''
    flattens a nested list of cosine similarity comparisons.

    every comparison is expected to carry its cosine
    similarity as 3rd item; only that value is kept.

    returns the values as one flat list, in input order.

    '''

    return [comparison[2]
            for group in sim_comp_list
            for comparison in group]

        
def get_sim_means(sims_for_mean_1, sims_for_mean_2, sims_for_mean_3):
    '''
    element-wise mean of 3 ordered lists of numbers.

    the first list decides how many means are computed;
    the other two are indexed at the same positions.

    returns the ordered list of means.

    '''

    return [(first + sims_for_mean_2[position] + sims_for_mean_3[position]) / 3
            for position, first in enumerate(sims_for_mean_1)]
    

def get_mean_cos_sim(sim_list_1, sim_list_2, sim_list_3):
    '''
    averages the cosine similarities of 3 comparison lists.

    the similarity values of each list are extracted with
    list_sims_for_mean() and averaged position by position
    with get_sim_means(). the two terms of every comparison
    are taken from sim_list_1.

    returns a flat list of [term, term, mean cosine similarity]

    '''

    similarity_columns = [list_sims_for_mean(sim_list)
                          for sim_list in (sim_list_1, sim_list_2, sim_list_3)]

    means_list = get_sim_means(*similarity_columns)

    # term pairs, flattened in the same order as the means

    term_pairs = [[comparison[0], comparison[1]]
                  for group in sim_list_1
                  for comparison in group]

    # attach each mean to its pair of terms

    return [pair + [mean] for pair, mean in zip(term_pairs, means_list)]
                

def text_prep_pipeline(subreddit_name, app_name, oa_script, oa_secret, pwd, vocab_number):
    '''
    download and text prep steps, chained:

    * get_subreddit() fetches the top posts
    * make_subreddit_df() turns them into a dataframe
    * join_corpus() builds the raw corpus (titles + bodies)
    * clean_all_docs() builds the training corpus
    * get_vocab() builds the vocabulary from the raw corpus
    * get_top_and_bottom() picks top & bottom vocabulary
      (not defined in the visible part of this notebook)

    returns subreddit dataframe, raw corpus, clean corpus,
    vocabulary dataframe, top, bottom, total words, unique words.
    '''

    top_posts = get_subreddit(subreddit_name, app_name, oa_script, oa_secret, pwd)

    posts_df = make_subreddit_df(top_posts)

    raw_docs = join_corpus(posts_df)

    training_docs = clean_all_docs(raw_docs)

    # the vocabulary is built from the raw docs, NOT the clean ones:
    # get_vocab() applies its own, heavier cleaning

    vocab_stats = get_vocab(raw_docs)

    vocab_df, total_words, unique_words = vocab_stats

    top, bottom = get_top_and_bottom(vocab_df, vocab_number)

    return (posts_df, raw_docs, training_docs, vocab_df,
            top, bottom, total_words, unique_words)


def train_pipeline(docs, term_list_A, term_list_B):
    '''
    trains a word2vec model on docs with train_w2v_model(),
    then compares the terms of term_list_A against the terms
    of term_list_B with comp_cos_sim_lists().

    returns the trained model and the cosine similarity lists.
    '''

    model = train_w2v_model(docs)

    similarities = comp_cos_sim_lists(model, term_list_A, term_list_B)

    return model, similarities


def final_pipeline(subreddit_name, app_name, oa_script, oa_secret, pwd, vocab_number, term_list_A, term_list_B):
    '''
    full run, from download to results:

    * runs text_prep_pipeline()
    * exports subreddit, raw corpus, clean corpus & vocabulary as CSVs
    * trains 3 word2vec models on the clean corpus via train_pipeline()
    * averages their cosine similarities with get_mean_cos_sim()
    * saves the 3 models and the mean similarities to disk
    * prints word counts, top/bottom vocabulary & mean similarities

    every file written is named app_name + a fixed suffix.

    returns model 1, model 2, model 3, the list of mean
    cosine similarities, top vocabulary, bottom vocabulary.
    '''

    (subreddit_df, raw_corpus, clean_corpus, vocab_df,
     top, bottom, total_words, unique_words) = text_prep_pipeline(
        subreddit_name, app_name, oa_script, oa_secret, pwd, vocab_number)

    # the corpora are plain lists, wrap them in DataFrames for the export

    raw_corpus_df = pd.DataFrame(raw_corpus)

    clean_corpus_df = pd.DataFrame(clean_corpus)

    # export to CSVs, file names made unique through the app_name prefix

    csv_exports = ((subreddit_df, '_subreddit.csv'),
                   (raw_corpus_df, '_raw_corpus.csv'),
                   (clean_corpus_df, '_clean_corpus.csv'),
                   (vocab_df, '_vocab.csv'))

    for frame, suffix in csv_exports:

        frame.to_csv(app_name + suffix)

    # train 3 models on the same corpus, keep model & similarities of each run

    runs = [train_pipeline(clean_corpus, term_list_A, term_list_B)
            for _ in range(3)]

    models = [model for model, _ in runs]

    cos_sims = [sims for _, sims in runs]

    final_sims_list = get_mean_cos_sim(cos_sims[0], cos_sims[1], cos_sims[2])

    final_sims_df = pd.DataFrame(final_sims_list)

    # save models & final cosine similarities to disk

    model_suffixes = ('_w2v_model_1.model',
                      '_w2v_model_2.model',
                      '_w2v_model_3.model')

    for model, suffix in zip(models, model_suffixes):

        model.save(app_name + suffix)

    final_sims_df.to_csv(app_name + '_final_sims_list.csv')

    # display results

    report = (('number of words used in training: \n', total_words),
              ('total unique vocabulary words: \n', unique_words),
              ('top vocabulary: \n', top),
              ('bottom vocabulary: \n', bottom))

    for heading, value in report:

        print(heading)

        print(value, '\n')

    print('mean cosine similarities: \n')

    print(final_sims_df)

    return models[0], models[1], models[2], final_sims_list, top, bottom
