In [1]:
#Import libraries

# Reddit API
import requests
import pandas as pd

# Clean dataframe

# Remove stopwords
import nltk
from nltk.corpus import stopwords

# Extract names from spacy library
import spacy

# Dataframe numerical manipulation
import numpy as np

# Sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import difflib

# Removing punctuation and data cleaning post sentiment analysis
import string

# Webscraper libraries
from selenium import webdriver
import urllib.request
import os
from time import sleep
import io
from PIL import Image, ImageDraw, ImageOps


### Setup Reddit API

In [2]:
#Setup Reddit API:
client_id = 'ptyAAVrgtcJZCzNytlTtJA'
with open('secret.txt', 'r') as f:
    secret_key = f.read()

auth = requests.auth.HTTPBasicAuth(client_id, secret_key)

with open('pw.txt', 'r') as f:
    pw = f.read()

with open('user.txt', 'r') as f:
    user = f.read()

data = {
    'grant_type': 'password',
    'username': user,
    'password': pw
}

headers = {'user-agent': 'myapi/0.0.1'}

res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

TOKEN = res.json()['access_token']

headers['Authorization'] = f'bearer {TOKEN}'

In [3]:
#Get reddit data
# TODO: add a logging script here
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

res = requests.get('https://oauth.reddit.com/r/bjj/new',
                 headers=headers, params={'limit':'100'})

# Get new posts
for post in res.json()['data']['children']:
    print(post['data']['title'])

#Create an empty dataframe to store the data: -> cols: the subreddit, the title of the post, the selftext of the post
df = pd.DataFrame()
new_rows = [
    {
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext']  # ,
        # 'upvote_ratio': post['data']['upvote_ratio'],
        # 'ups':post['data']['ups'],
        # 'downs':post['data']['downs'],
        # 'score':post['data']['score']
    }
    for post in res.json()['data']['children']
]
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)


What to do when I’m going for collar chokes or sleeve control but accidentally stick my thumb under the rashguard that’s underneath the Gi?
Inspired by another post and this was too long a story to be a comment
Mixing Adderal and training, good idea or bad idea?
Brazilian Jiu Jitsu can now form part of GCSE Curriculum in Physical Education - UKBJJA
Deep dive
safety rules for a massive man?
When do you know what your BJJ ‘game’ is?
Andrew Wiltse coming back to competition soon
White Belt Wednesday
No Holds Barred Q&amp;A: Nicky Ryan-Craig Jones
When did you start competing?
A lot of people on r/bjj would not be able to train at normal/traditional Judo school.
offering reasonably priced Sunday class in Taiwan
Kron is back! Fights Charles Jourdain on May 6th
Judo terms in BJJ
Why BJJ people suck at takedowns
VHTS shorts
What submissions are the easiest to pull on people with no BJJ experience?
Does the pass determine the guard, or vise versa?
Exhausted Arms
Anyone who has trained or curre

## Clean Dataframe

In [4]:
# Remove common English words using nltk stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df['selftext'] = df['selftext'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ikram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Use Spacy to get names from titles and selftext column

nlp = spacy.load('en_core_web_sm')

people_names_title = []
for text in df['title']:
    doc = nlp(text)
    people_names_title.extend(
        entity.text for entity in doc.ents if entity.label_ == 'PERSON'
    )

people_names_selftext = []
for text in df['selftext']:
    doc = nlp(text)
    people_names_selftext.extend(
        entity.text for entity in doc.ents if entity.label_ == 'PERSON'
    )

In [6]:
#Add names from data extracted in the Bjj Extraction repo and convert column to list
names_df = pd.read_csv('practitioner_name.csv')
names_list = names_df['practitioner_name'].tolist()
names_list_lower = list(map(lambda x: x.lower(), names_list))

# filter names_df to extract the names in the names_list csv
names_df_title_df = df[df['title'].str.contains('|'.join(names_list))]
names_df_selftext_df = df[df['selftext'].str.contains('|'.join(names_list))]

# Concatenate two columns of the df into a new column containing names and comments-> merged -> separated by whitespace character ' '.
df['merged'] = df['title'].str.cat(df['selftext'], sep=' ')

# make all words lower case
df['merged'] = df['title'].str.cat(df['selftext'], sep=' ')

### Sentiment analysis using nltk Sentiment Intensity Analyzer and then further clean data

In [7]:
# Initialize the SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

In [8]:
# Check if any names mentioned data frame match up with whats in the names column 
results = []
for text in df['merged']:
    doc = nlp(text)
    names_found = []
    for token in doc:
        token_lower = token.text.lower()
        closest_match = difflib.get_close_matches(token_lower, names_list_lower)
        if token_lower in names_list_lower:
            names_found.append(token.text)
        elif closest_match:
            names_found.append(names_list[names_list_lower.index(closest_match[0])])
    if len(names_found) > 0:
        positive_words = [word for word in text.split() if sia.polarity_scores(word)['pos'] > 0]
        results.append({'names': names_found, 'positive_words': positive_words})
        
df_results = pd.DataFrame(results)

In [9]:
df_results['positive_words']

0                                                    []
1     [Inspired, like, flexibility,, favorite, giggl...
2     [good, better, like, Hoping, enthusiasts, like...
3                                                    []
4                                    [kind, hope, sure]
                            ...                        
78                                         [well, like]
79                    [respect, Like, LIKE, methodical]
80                                                   []
81                             [interested, like, fair]
82                                                   []
Name: positive_words, Length: 83, dtype: object

In [10]:
# From list of words in positive_words, find words that need to be manually removed
common_words = [
    "create",
    "please",
    "help"
]

In [11]:
# convert rows from lists to strings
df_results['names'] = df_results['names'].apply(lambda x: ', '.join(x))
df_results['positive_words'] = df_results['positive_words'].apply(lambda x: ', '.join(x))

# convert positive_words_replaced with positive words and unpacked_names with names
# define a function to remove punctuation except commas from a string
def remove_punctuation_from_string(text):
    return text.translate(str.maketrans('', '', string.punctuation.replace(',', '')))


In [12]:
df_results

Unnamed: 0,names,positive_words
0,"Sam McNally, Maxine Thylin, Enson Inoue",
1,"Daniel Otero, Ryan Gracie, Manuel Pontes, Nyja...","Inspired, like, flexibility,, favorite, giggle..."
2,"Dany Gerard, Kira Sung, Ben Baxter, Andre Motta","good, better, like, Hoping, enthusiasts, like,..."
3,Eduardo Tinoco,
4,"Eliot Marshall, Alex Martins","kind, hope, sure"
...,...,...
78,"Gordon Ryan, Pedro Ramalho, Ryan Hall, Ryan Ha...","well, like"
79,Tiago Alves,"respect, Like, LIKE, methodical"
80,Ryan Gracie,
81,"Amal Easton, Amal Easton","interested, like, fair"


In [13]:
#Remove punctuation, common words and empty rows

# define a function to remove punctuation except commas from a list of strings
def remove_punctuation_from_list(lst):
    return [remove_punctuation_from_string(text) for text in lst]

# create a set of the common bjj words
common_words_set = set(common_words)

# define a function to remove the common bjj words from a list of strings
def remove_common_words(words):
    # split the input string into a list of words
    words_list = words.split()
    
    # remove non-alphanumeric characters from each word in the list
    words_list = [''.join(filter(str.isalnum, word)) for word in words_list]
    
    # remove the common bjj words from the list of words
    words_list = [word for word in words_list if word.lower() not in common_words_set]
    
    # join the remaining words into a string separated by commas and return it
    return ', '.join(words_list)


# apply the remove_common_words function to the 'positive_words' column of the dataframe
df_results['positive_words_replaced'] = df_results['positive_words'].apply(remove_common_words)



In [14]:
df_results

Unnamed: 0,names,positive_words,positive_words_replaced
0,"Sam McNally, Maxine Thylin, Enson Inoue",,
1,"Daniel Otero, Ryan Gracie, Manuel Pontes, Nyja...","Inspired, like, flexibility,, favorite, giggle...","Inspired, like, flexibility, favorite, giggle,..."
2,"Dany Gerard, Kira Sung, Ben Baxter, Andre Motta","good, better, like, Hoping, enthusiasts, like,...","good, better, like, Hoping, enthusiasts, like,..."
3,Eduardo Tinoco,,
4,"Eliot Marshall, Alex Martins","kind, hope, sure","kind, hope, sure"
...,...,...,...
78,"Gordon Ryan, Pedro Ramalho, Ryan Hall, Ryan Ha...","well, like","well, like"
79,Tiago Alves,"respect, Like, LIKE, methodical","respect, Like, LIKE, methodical"
80,Ryan Gracie,,
81,"Amal Easton, Amal Easton","interested, like, fair","interested, like, fair"


In [15]:
df_results = df_results.replace('', np.nan)
df_results.dropna(inplace=True)

In [16]:
# Group rows by name and input into a new dataframe
grouped = df_results[['names', 'positive_words_replaced']].groupby('names')['positive_words_replaced'].apply(list)
df_results = pd.DataFrame({'names': grouped.index, 'positive_words_replaced': grouped.values})

In [17]:
# split strings in names, make lowercase and group data
df_results['names'] = df_results['names'].apply(lambda x: x.split(', '))
df_results = df_results.explode('names')
df_results = df_results.applymap(lambda x: x.lower() if type(x) == str else x)
df_results = df_results.groupby('names', as_index=False).max()

In [18]:
df_results

Unnamed: 0,names,positive_words_replaced
0,ailson brites,"[gained, surprised, abilities, advantage]"
1,alan moraes,"[Inspired, like, flexibility, favorite, giggle..."
2,alan sanchez,"[like, yes, good, hoping, giving, good, safe, ..."
3,alec baulding,"[join, energy, confidence]"
4,alex martins,"[wow, like, thanks, inspired]"
...,...,...
146,vanessa english,"[honestly, love, appreciate, share, Thanks, pe..."
147,vitor henrique,"[strength, fresh, strongest, strength, top, li..."
148,vitor toledo,"[gentle, optimal, recommend, want, yes, enjoyi..."
149,willis nunes,"[gained, surprised, abilities, advantage]"


In [19]:
# define a function to count the number of words in a list of strings
def count_words(words_list):
    count = 0
    for words in words_list:
        for word in words.split():
            count += 1
    return count

# apply the function to the column and create a new column called "word_count"
df_results['word_count'] = df_results['positive_words_replaced'].apply(count_words)
df_results = df_results.sort_values(by=['word_count'], ascending=False).reset_index(drop=True)


In [20]:
df_results['positive_words_replaced'] = df_results['positive_words_replaced'].apply(lambda x: ', '.join(x))
df_results

Unnamed: 0,names,positive_words_replaced,word_count
0,penny thomas,"Inspired, like, flexibility, favorite, giggle,...",108
1,heath pedigo,"Inspired, like, flexibility, favorite, giggle,...",108
2,jeremy jackson,"Inspired, like, flexibility, favorite, giggle,...",108
3,eric phan,"Inspired, like, flexibility, favorite, giggle,...",108
4,claudio mattos,"Inspired, like, flexibility, favorite, giggle,...",108
...,...,...,...
146,nyjah rollins,want,1
147,kit dale,recommend,1
148,rubens charles,shares,1
149,joel gingery,want,1


In [21]:
# get top 5 results, capitalize names and save as csv
top_5_results = df_results.head(5).copy()
top_5_results['names'] = top_5_results['names'].str.title()
top_5_results.loc[:, 'names'] = top_5_results['names'].str.title()
top_5_results.to_csv("top_10_results.csv", index=False)
top_5_results

Unnamed: 0,names,positive_words_replaced,word_count
0,Penny Thomas,"Inspired, like, flexibility, favorite, giggle,...",108
1,Heath Pedigo,"Inspired, like, flexibility, favorite, giggle,...",108
2,Jeremy Jackson,"Inspired, like, flexibility, favorite, giggle,...",108
3,Eric Phan,"Inspired, like, flexibility, favorite, giggle,...",108
4,Claudio Mattos,"Inspired, like, flexibility, favorite, giggle,...",108


### Scrape images from website and save as a circle png

In [22]:
# set up the web driver
driver = webdriver.Chrome()

In [23]:
#Get names to pull from site in a format that works with appending to the dataframe
df_results_top_5_copy = df_results.head(5).copy()
df_results_top_5_copy['search_names'] = df_results_top_5_copy['names'].str.replace(' ', '-')

search_names = []
for name in df_results_top_5_copy['search_names']:
    search_names.append(name)

search_names

['penny-thomas',
 'heath-pedigo',
 'jeremy-jackson',
 'eric-phan',
 'claudio-mattos']

In [24]:
# Site links
main_site = "https://www.bjjheroes.com/bjj-fighters/"
site_links = []
for name in search_names:
    site_links.append(main_site + name)
site_links

['https://www.bjjheroes.com/bjj-fighters/penny-thomas',
 'https://www.bjjheroes.com/bjj-fighters/heath-pedigo',
 'https://www.bjjheroes.com/bjj-fighters/jeremy-jackson',
 'https://www.bjjheroes.com/bjj-fighters/eric-phan',
 'https://www.bjjheroes.com/bjj-fighters/claudio-mattos']

In [25]:
# Loop through list of website links, download the banner image of each site and crop into circle
# output as a png
numbers = [str(i) for i in range(1, 6)]
for i, (link, number) in enumerate(zip(site_links, numbers)):
    # open site
    driver.get(link)
    sleep(3)
    xpath = "/html/body/div[2]/div[1]/div/div[4]/div/div/div[1]/div/div/div[2]"
    
    # Find the image element using the given XPath
    image_element = driver.find_element("xpath", xpath)
    
    # Get the URL of the image from the element's "src" attribute
    image_url = image_element.get_attribute("style")
    try:
        start_index = image_url.index('"') + 1
        end_index = image_url.index('"', start_index)
        url = image_url[start_index:end_index]
    except ValueError:
        print(f"Error: image URL not found for site {link}")
        continue

    # Download the image content
    image_content = requests.get(url).content
    
    # Convert image content to Pillow Image object
    image = Image.open(io.BytesIO(image_content))
    
    # Crop the image into a circle
    width, height = image.size
    circle_radius = min(width, height) // 2
    mask = Image.new('L', (circle_radius*2, circle_radius*2), 0)
    draw = ImageDraw.Draw(mask)
    draw.ellipse((0, 0, circle_radius*2, circle_radius*2), fill=255)
    cropped_image = ImageOps.fit(image, (circle_radius*2, circle_radius*2), centering=(0.5, 0.5))
    
    # Apply mask to cropped image
    rgba_image = Image.new('RGBA', (circle_radius*2, circle_radius*2), (0, 0, 0, 0))
    rgba_image.paste(cropped_image, (0, 0), mask=mask)
    
    # Save the image to the specified file path
    download_path = ""
    file_name = f"{number}.png"
    file_path = download_path + file_name
    rgba_image.save(file_path)

driver.close()
