In [7]:
# imports
import pandas as pd 
import numpy as np

In [8]:
# initialize Reddit Bot

from db import DbService, SaveJsonToFileStrategy
from bot import RedditBot
from services import ApiClient, Service
from analysis import BertComparitor, PostComparisonProvider

bot = RedditBot(
            Service(
                ApiClient = ApiClient(),
                DbService = DbService(
                         SaveJsonToFileStrategy())
                    , analyzerService = PostComparisonProvider(BertComparitor('bert-base-nli-mean-tokens'))))



In [9]:
# test fetch authors


In [10]:
df = bot.run()



posts[1]:  [Redditor(name='renjitze'), 'Join our team to build your portfolio while contributing in open-source projects. Web dev & Game dev in parallel', 'https://www.reddit.com/r/ProgrammingBuddies/comments/w7x6io/join_our_team_to_build_your_portfolio_while/', 'Hello everyone, we are looking for people who wants to collaborate in open-source projects, the main purpose of the team is to develop things and learn together at the same time so later we have a good portfolio to land an IT job. Everything is well organized and explained in our website and discord server to avoid confusion in the development process.\n\nTechnologies we use:\n\n· Web development: MERN Stack (we are open to use Python too)\n\n· Game development: Godot engine (free and lightweight), GDScript (it\'s a "python clone" for Godot)\n\nWe can work in parallel projects for people who prefers web development. Right now we are learning Godot to create a simple 2D game. If you have ideas and time to code then join us! see

In [11]:
df

Unnamed: 0,author,title,url,body,score,created,id
0,renjitze,Join our team to build your portfolio while co...,https://www.reddit.com/r/ProgrammingBuddies/co...,"Hello everyone, we are looking for people who ...",1,1658775692.0,w7x6io
1,logicallyillogical,Need help with an upcoming interview,https://www.reddit.com/r/ProgrammingBuddies/co...,"Not sure if I can post this, but hoping you gu...",1,1658773678.0,w7wcvi
2,ItsTomDev,Just looking for people to vibe with,https://www.reddit.com/r/ProgrammingBuddies/co...,"Heya, I am a 18 year old guy just looking for ...",2,1658772387.0,w7vtov
3,zootbot,Looking for python / Devops buddy to work on w...,https://www.reddit.com/r/ProgrammingBuddies/co...,I’ve got decent knowledge of aws and python. I...,1,1658770186.0,w7uvvg
4,mhfc1913,"Looking for Study buddy or group, starting Ang...",https://www.reddit.com/r/ProgrammingBuddies/co...,"Hey people!\nI'm a 36 yo dude, looking for peo...",2,1658769660.0,w7uo0y
...,...,...,...,...,...,...,...
144,samgreen400,learn golang with me,https://www.reddit.com/r/ProgrammingBuddies/co...,looking for activity partner to learn and talk...,1,1656257513.0,vl79fw
145,SmellyCow0,Looking for someone to study C and Discrete ma...,https://www.reddit.com/r/ProgrammingBuddies/co...,"Hi, I'm 16 and started to study C and discrete...",6,1656251800.0,vl59yg
146,PDROJACK,Looking for devs to make a dating app,https://www.reddit.com/r/ProgrammingBuddies/co...,Hey\n\nI found some flaws in current dating ap...,0,1656250192.0,vl4rtn
147,leugeneskabs,Looking for a mentor to help me to learn C# (a...,https://www.reddit.com/r/ProgrammingBuddies/co...,"Hi, I'm looking for a mentor to help me to lea...",13,1656247473.0,vl3y0t


In [12]:
import re
import html
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/galzafar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/galzafar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/galzafar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/galzafar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
def simple_cleanup(text):
  # Convert to lowercase.
  text = text.lower()     

  # Remove everything but letters and spaces.
  text = re.sub(r'[^a-z\s]', ' ', text)

  # Remove single letters.
  text = re.sub(r'(^\w\s)|(\s\w\s)|(\s\w$)', ' ', text)

  # Converge multiple spaces into one.
  text = re.sub(r'\s+', ' ', text) 

  # Remove trailing and leading spaces.    
  text = text.strip()

  return text


  # Since we found only 6 rows with emojis we decided to remove them.
def remove_emojis(text):
  return emoji.replace_emoji(text)

def remove_urls(text):
  return re.sub('http(s?)://[^\s]+', ' ', text)

def decode_html_entities(text):
  return html.unescape(text)



def remove_stopwords(text):
  eng_stop_words = stopwords.words('english')
  non_stop_words = [word for word in text.split() if word not in eng_stop_words]
  return ' '.join(non_stop_words)   



wl = WordNetLemmatizer()

# This is a helper function to map NTLK position tags
def get_wordnet_pos(text):
    if text.startswith('J'):
        return wordnet.ADJ
    elif text.startswith('V'):
        return wordnet.VERB
    elif text.startswith('N'):
        return wordnet.NOUN
    elif text.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize(text):
    # Split the text to words and get the part of speach (pos) of each 
    # of the words (i.e. noun, verb, etc.)
    words = word_tokenize(text)
    words_with_pos = nltk.pos_tag(words) 
    
    # Lemmatize each word.
    res = []
    for x in words_with_pos:
      word = x[0]
      pos = x[1]
      res.append(wl.lemmatize(word, get_wordnet_pos(pos)))

    return " ".join(res)




def preprocess(text):
  text = remove_urls(text)  
  text = remove_emojis(text)
  text = decode_html_entities(text)
  text = simple_cleanup(text)
  text = remove_stopwords(text)
  text = lemmatize(text)
  return text
    


In [14]:
df['text'] = df['title'] + ' ' + df['body']
df['text'] = df['text'].apply(preprocess)

In [15]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [16]:

model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

text_embeddings = model.encode(df['text'])




In [17]:
pairwise_similarities=cosine_similarity(text_embeddings)
pairwise_differences=euclidean_distances(text_embeddings)

pairwise_similarities

array([[1.0000001 , 0.6663472 , 0.76760525, ..., 0.68761384, 0.78184503,
        0.6284179 ],
       [0.6663472 , 1.0000002 , 0.7284243 , ..., 0.71485883, 0.69427395,
        0.7164765 ],
       [0.76760525, 0.7284243 , 0.9999999 , ..., 0.7027    , 0.6910368 ,
        0.70399904],
       ...,
       [0.68761384, 0.71485883, 0.7027    , ..., 0.9999999 , 0.63132936,
        0.5807563 ],
       [0.78184503, 0.69427395, 0.6910368 , ..., 0.63132936, 0.99999994,
        0.71065116],
       [0.6284179 , 0.7164765 , 0.70399904, ..., 0.5807563 , 0.71065116,
        0.99999976]], dtype=float32)

In [18]:
df_similarity_matrix = pd.DataFrame(pairwise_similarities, columns=range(pairwise_similarities.shape[0]), index=range(pairwise_similarities.shape[1]))

In [19]:
%history

# imports
import pandas as pd 
import numpy as np
# initialize Reddit Bot

from db import DbService, SaveJsonToFileStrategy
from bot import RedditBot
from services import ApiClient, Service
from analysis import BertComparitor, PostComparisonProvider

bot = RedditBot(
            Service(
                ApiClient = ApiClient(),
                DbService = DbService(
                         SaveJsonToFileStrategy())
                    , analyzerService = PostComparisonProvider(BertComparitor('bert-base-nli-mean-tokens'))))
df = bot.run()
# initialize Reddit Bot

from db import DbService, SaveJsonToFileStrategy
from bot import RedditBot
from services import ApiClient, Service
from analysis import BertComparitor, PostComparisonProvider

bot = RedditBot(
            Service(
                ApiClient = ApiClient(),
                DbService = DbService(
                         SaveJsonToFileStrategy())
                    , analyzerService = PostComparisonProvider(BertComparitor('bert-b

In [20]:
from numpy import float32


s = df_similarity_matrix.unstack()
so = s.sort_values(kind="quicksort", ascending=False)

so

112  112    1.000000
79   79     1.000000
65   65     1.000000
100  100    1.000000
17   17     1.000000
              ...   
66   85     0.174603
16   104    0.149337
104  16     0.149337
16   136    0.118306
136  16     0.118306
Length: 22201, dtype: float32

In [21]:
df_so = pd.DataFrame(so, columns=['similarity'])


In [22]:
import math
df_so = pd.DataFrame(so, columns=['similarity'])

# dataframe drop values with similarity around 1.0
mask = df_so['similarity'].apply(lambda x: not math.isclose(x, 1.0, rel_tol=0.01))
df_so = df_so[mask]

# drop duplicates based on similarity value
df_so = df_so.drop_duplicates(subset=['similarity'], keep='first')

# sort by index
df_so = df_so.sort_values(by='similarity', ascending=False)
df_so.head(50)


Unnamed: 0,Unnamed: 1,similarity
64,100,0.904627
5,57,0.903831
60,120,0.901211
64,85,0.899778
81,15,0.89872
35,40,0.896889
134,60,0.892858
38,120,0.890767
0,39,0.889838
71,14,0.885771


In [23]:
#split index into two columns
# df2 = pd.DataFrame(df_so['b'].tolist(), index=df_so.index)
# result = df_so.head().index.tolist()

# get top 100 results
top_100_results = df_so.head(100)
# reset index inplace
top_100_results.reset_index(inplace=True)
result = df.loc[:,'text']
result
# for each row in dataframe add a new column with the value of 'level_0' as the index for df['text']
# apply to both column
top_100_results['text_level_0'] = top_100_results['level_0'].apply(lambda x: result.loc[x])
top_100_results['text_level_1'] = top_100_results['level_1'].apply(lambda x: result.loc[x])

# top_100_results['text_level_0'] = df['text'].iloc(0)
# top_100_results
# df['text'][100]
id_results = df.loc[:,'id']
top_100_results['id_level_0'] = top_100_results['level_0'].apply(lambda x: id_results.loc[x])
top_100_results['id_level_1'] = top_100_results['level_1'].apply(lambda x: id_results.loc[x])

# set author names
top_100_results['author_level_0'] = top_100_results['level_0'].apply(lambda x: df['author'].loc[x])
top_100_results['author_level_1'] = top_100_results['level_1'].apply(lambda x: df['author'].loc[x])
top_100_results[['text_level_0', 'text_level_1', 'id_level_0', 'id_level_1', 'similarity']].head(10)
# top_100_results['id'] = df['id'].iloc[:]
# top_100_results




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_results['text_level_0'] = top_100_results['level_0'].apply(lambda x: result.loc[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_results['text_level_1'] = top_100_results['level_1'].apply(lambda x: result.loc[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_results['id_l

Unnamed: 0,text_level_0,text_level_1,id_level_0,id_level_1,similarity
0,anybody want study together solve problem hell...,join community st time well year old backend d...,vyve65,vt4e4j,0.904627
1,need javascript mentor please get assignment c...,look mentor helper advisor personal project wo...,w7p3iv,vzl4bh,0.903831
2,look buddy learn javascript scratch know basic...,look study buddy hello new learning code compl...,vz8zs4,vpwtz6,0.901211
3,anybody want study together solve problem hell...,best online boot camp course java python curre...,vyve65,vv94kb,0.899778
4,look program buddy im bore lonely want learn c...,nlp python udemy course hey look somebody inte...,vwapev,w5mxla,0.89872
5,self teach programmer look help others fronten...,get back program pause look mentor buddy help ...,w30wjr,w2s3il,0.896889
6,look buddy cs html java hi look beginner want ...,look buddy learn javascript scratch know basic...,vmu84s,vz8zs4,0.892858
7,look partner learn java complete newbie java k...,look study buddy hello new learning code compl...,w2v6xf,vpwtz6,0.890767
8,join team build portfolio contribute open sour...,want team crud blog another project hello go p...,w7x6io,w2t5wx,0.889838
9,look study buddy hi everyone currently premed ...,need level c skill college hey try learn last ...,vxt8yz,w5rwoa,0.885771
