In [19]:
# imports
import pandas as pd 
import numpy as np

In [20]:
# initialize Reddit Bot

from db import DbService, SaveJsonToFileStrategy
from bot import RedditBot
from services import ApiClient, Service
from analysis import BertComparitor, PostComparisonProvider

bot = RedditBot(
            Service(
                ApiClient = ApiClient(),
                DbService = DbService(
                         SaveJsonToFileStrategy())
                    , analyzerService = PostComparisonProvider(BertComparitor('bert-base-nli-mean-tokens'))))



In [21]:
df = bot.run()



posts[1]:  ['Help with XML button', 'https://www.reddit.com/r/ProgrammingBuddies/comments/w6skn0/help_with_xml_button/', "Hi how are you?\n\nI'm having trouble making a button that when I press it, I download a table with the information in XML format. I am using jquery, AJAX calls and a base in SQL to bring the information, how would you proceed to do it? \n\nps: the system architecture is MVC.\n\n Thanks!", 1, 1658658605.0, 'w6skn0']


In [22]:
df

Unnamed: 0,title,url,body,score,created,id
0,Help with XML button,https://www.reddit.com/r/ProgrammingBuddies/co...,Hi how are you?\n\nI'm having trouble making a...,1,1658658605.0,w6skn0
1,Android Development with Kotlin,https://www.reddit.com/r/ProgrammingBuddies/co...,Hello. I am a beginner to Android Development ...,2,1658648653.0,w6q635
2,Programming Python trading bot,https://www.reddit.com/r/ProgrammingBuddies/co...,I am currently working on a python trading bot...,2,1658601696.0,w6b00j
3,Looking for people to study game engine progra...,https://www.reddit.com/r/ProgrammingBuddies/co...,"Hi, add me on discord if interested : Rack Smi...",2,1658596562.0,w692v4
4,Need help getting out of a Support role and ge...,https://www.reddit.com/r/ProgrammingBuddies/co...,Hello! So I have currently been working in sup...,3,1658551989.0,w5v9ea
...,...,...,...,...,...,...
144,more web devs for projects,https://www.reddit.com/r/ProgrammingBuddies/co...,Looking for more members to join discord serve...,2,1656183202.0,vklpfo
145,I'm looking for a programming buddy or couple ...,https://www.reddit.com/r/ProgrammingBuddies/co...,I'm just starting to learn C ++ and I think it...,13,1656123127.0,vk4gvk
146,Hey guys. So I am looking for someone who can ...,https://www.reddit.com/r/ProgrammingBuddies/co...,,6,1656073569.0,vjn9v2
147,Beginner looking for someone to work on DS/ML ...,https://www.reddit.com/r/ProgrammingBuddies/co...,"Hello!\n\nI am 17 and a senior in HS, took a p...",2,1656027417.0,vjacu6


In [32]:
import re
import html
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/galzafar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/galzafar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/galzafar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/galzafar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [24]:
def simple_cleanup(text):
  # Convert to lowercase.
  text = text.lower()     

  # Remove everything but letters and spaces.
  text = re.sub(r'[^a-z\s]', ' ', text)

  # Remove single letters.
  text = re.sub(r'(^\w\s)|(\s\w\s)|(\s\w$)', ' ', text)

  # Converge multiple spaces into one.
  text = re.sub(r'\s+', ' ', text) 

  # Remove trailing and leading spaces.    
  text = text.strip()

  return text


  # Since we found only 6 rows with emojis we decided to remove them.
def remove_emojis(text):
  return emoji.replace_emoji(text)

def remove_urls(text):
  return re.sub('http(s?)://[^\s]+', ' ', text)

def decode_html_entities(text):
  return html.unescape(text)



def remove_stopwords(text):
  eng_stop_words = stopwords.words('english')
  non_stop_words = [word for word in text.split() if word not in eng_stop_words]
  return ' '.join(non_stop_words)   



wl = WordNetLemmatizer()

# This is a helper function to map NTLK position tags
def get_wordnet_pos(text):
    if text.startswith('J'):
        return wordnet.ADJ
    elif text.startswith('V'):
        return wordnet.VERB
    elif text.startswith('N'):
        return wordnet.NOUN
    elif text.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize(text):
    # Split the text to words and get the part of speach (pos) of each 
    # of the words (i.e. noun, verb, etc.)
    words = word_tokenize(text)
    words_with_pos = nltk.pos_tag(words) 
    
    # Lemmatize each word.
    res = []
    for x in words_with_pos:
      word = x[0]
      pos = x[1]
      res.append(wl.lemmatize(word, get_wordnet_pos(pos)))

    return " ".join(res)




def preprocess(text):
  text = remove_urls(text)  
  text = remove_emojis(text)
  text = decode_html_entities(text)
  text = simple_cleanup(text)
  text = remove_stopwords(text)
  text = lemmatize(text)
  return text
    


In [33]:
df['text'] = df['title'] + ' ' + df['body']
df['text'] = df['text'].apply(preprocess)

In [25]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [34]:

model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

text_embeddings = model.encode(df['text'])




In [35]:
pairwise_similarities=cosine_similarity(text_embeddings)
pairwise_differences=euclidean_distances(text_embeddings)



In [36]:
df_similarity_matrix = pd.DataFrame(pairwise_similarities, columns=range(pairwise_similarities.shape[0]), index=range(pairwise_similarities.shape[1]))

In [None]:
%history

In [37]:
from numpy import float32


s = df_similarity_matrix.unstack()
so = s.sort_values(kind="quicksort", ascending=False)

so

56   56     1.000000
71   71     1.000000
92   92     1.000000
104  104    1.000000
148  148    1.000000
              ...   
146  7      0.168673
96   7      0.149337
7    96     0.149337
128  7      0.118306
7    128    0.118306
Length: 22201, dtype: float32

In [38]:
df_so = pd.DataFrame(so, columns=['similarity'])


In [52]:
import math
df_so = pd.DataFrame(so, columns=['similarity'])

# dataframe drop values with similarity around 1.0
mask = df_so['similarity'].apply(lambda x: not math.isclose(x, 1.0, rel_tol=0.01))
df_so = df_so[mask]
print(df_so.shape)
# drop duplicates based on similarity value
df_so = df_so.drop_duplicates(subset=['similarity'], keep='first')
df_so.shape
#df_so


(22052, 1)


(11016, 1)