In [28]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [29]:
# Request headers that make the scraper look like a desktop Chrome browser.
# Built from ordered pairs so the long values can be wrapped for readability;
# adjacent string literals are concatenated by Python at compile time.
headers = dict([
    (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36",
    ),
    ("Accept-Language", "en-US,en;q=0.9"),
    (
        "Accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8",
    ),
    ("Connection", "keep-alive"),
])

In [30]:
url = "https://blackbeltwiki.com/judo-techniques"

# Fetch the techniques index page. requests applies no timeout by default, so
# set one explicitly to keep a stalled connection from hanging the kernel.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on anything other than 200. Merely printing the error would leave
# `soup` undefined (or stale from an earlier run), so the parsing cell would
# either raise a confusing NameError or silently reuse old data.
if response.status_code != 200:
    raise RuntimeError(
        f"Error: Could not load page. Status Code: {response.status_code}"
    )

html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

In [39]:
# Parse the HTML content into one row per linked move.
#
# The loop walks every <p> and <ul> in document order:
#   - a <p> containing <strong> starts a new category and clears the subcategory
#   - any other <p> becomes the subcategory for the lists that follow
#   - each <li> inside a <ul> contributes one row (link text + href)

# Initialize parallel lists to store the data (one entry per extracted move)
categories = []
subcategories = []
move_names = []
move_links = []

# Heading state carried forward while walking the page
current_category = None
current_subcategory = None

for tag in soup.find_all(['p', 'ul']):
    if tag.name == 'p':
        strong_tag = tag.find('strong')
        if strong_tag:
            current_category = strong_tag.get_text(strip=True)
            current_subcategory = None
        else:
            current_subcategory = tag.get_text(strip=True)
    elif tag.name == 'ul' and current_category:
        # Lists that appear before the first category heading are ignored.
        for li_tag in tag.find_all('li'):
            a_tag = li_tag.find('a')
            # find() returns None for a list item with no link; skip it rather
            # than crash with AttributeError halfway through the scrape.
            if a_tag is None:
                continue
            # An anchor without an href has no destination to record;
            # .get() avoids the KeyError that a_tag['href'] would raise.
            move_link = a_tag.get('href')
            if move_link is None:
                continue
            move_name = a_tag.get_text(strip=True)
            categories.append(current_category)
            subcategories.append(current_subcategory)
            move_names.append(move_name)
            move_links.append(move_link)

# Create a DataFrame
df = pd.DataFrame({
    'Category': categories,
    'Subcategory': subcategories,
    'Move_Name': move_names,
    'Move_Link': move_links
})

# Display the DataFrame
print(df)


                             Category  \
0    Ukemui – Judo Breakfalls & Rolls   
1    Ukemui – Judo Breakfalls & Rolls   
2    Ukemui – Judo Breakfalls & Rolls   
3    Ukemui – Judo Breakfalls & Rolls   
4             Nage Waza – Judo Throws   
..                                ...   
287         Make Your Own Home “Dojo”   
288         Make Your Own Home “Dojo”   
289         Make Your Own Home “Dojo”   
290         Make Your Own Home “Dojo”   
291         Make Your Own Home “Dojo”   

                                           Subcategory          Move_Name  \
0                                                 None          Mae Ukemi   
1                                                 None       Ushiro Ukemi   
2                                                 None         Yoko Ukemi   
3                                                 None       Zenpo Kaiten   
4              Te Waza – Judo Throws (Hand Techniques)    Ippon Seoi Nage   
..                                           

In [None]:
# Write the DataFrame to a CSV file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails here with OSError.
output_path = Path('data/judo_techniques.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

In [32]:
# Create a new DataFrame from the CSV file to clean the data
df_judo = pd.read_csv(filepath_or_buffer='data/judo_techniques.csv')

# Preview the first five rows as plain text
print(df_judo.head(n=5))

                           Category                              Subcategory  \
0  Ukemui – Judo Breakfalls & Rolls                                      NaN   
1  Ukemui – Judo Breakfalls & Rolls                                      NaN   
2  Ukemui – Judo Breakfalls & Rolls                                      NaN   
3  Ukemui – Judo Breakfalls & Rolls                                      NaN   
4           Nage Waza – Judo Throws  Te Waza – Judo Throws (Hand Techniques)   

         Move_Name                                      Move_Link  
0        Mae Ukemi        https://www.blackbeltwiki.com/mae-ukemi  
1     Ushiro Ukemi     https://www.blackbeltwiki.com/ushiro-ukemi  
2       Yoko Ukemi       https://www.blackbeltwiki.com/yoko-ukemi  
3     Zenpo Kaiten     https://www.blackbeltwiki.com/zenpo-kaiten  
4  Ippon Seoi Nage  https://www.blackbeltwiki.com/ippon-seoi-nage  


In [33]:
# Column dtypes and non-null counts, followed by the last five rows
df_judo.info()
df_judo.tail(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Category     292 non-null    object
 1   Subcategory  262 non-null    object
 2   Move_Name    292 non-null    object
 3   Move_Link    292 non-null    object
dtypes: object(4)
memory usage: 9.3+ KB


Unnamed: 0,Category,Subcategory,Move_Name,Move_Link
287,Make Your Own Home “Dojo”,Martial Arts Training Equipment– Practice mats...,Strength Training,https://blackbeltwiki.com/strength-training
288,Make Your Own Home “Dojo”,Martial Arts Training Equipment– Practice mats...,Stretching,https://blackbeltwiki.com/stretching
289,Make Your Own Home “Dojo”,Martial Arts Training Equipment– Practice mats...,Warm-Up Exercises,https://blackbeltwiki.com/warm-up-exercises
290,Make Your Own Home “Dojo”,Martial Arts Training Equipment– Practice mats...,Yoga,https://blackbeltwiki.com/yoga-for-martial-arts
291,Make Your Own Home “Dojo”,Martial Arts Training Equipment– Practice mats...,Other Fitness,https://blackbeltwiki.com/martial-arts-physica...


In [34]:
# Clean up data file
#
# Keep only rows whose Category contains neither phrase below (matched
# case-insensitively; a missing Category counts as "no match" and is kept).
# NOTE(review): these categories appear to be site sections rather than judo
# techniques -- confirm against the page before relying on this filter.
df_filtered = df_judo.loc[
    lambda frame: ~(
        frame['Category'].str.contains('Make Your Own Home', case=False, na=False)
        | frame['Category'].str.contains('Martial arts books are', case=False, na=False)
    )
]
df_filtered.tail()

Unnamed: 0,Category,Subcategory,Move_Name,Move_Link
109,Related Pages,,Judo Hip Throws,https://blackbeltwiki.com/judo-throws-hip-tech...
110,Related Pages,,Judo Hand Throws,https://blackbeltwiki.com/judo-throws-hand-tec...
111,Related Pages,,Judo Foot Throws,https://blackbeltwiki.com/judo-throws-foot-tec...
112,Related Pages,,Judo Sacrifice Throws,https://blackbeltwiki.com/judo-throws-sacrific...
113,Related Pages,,Judo Forbidden Techniques,https://blackbeltwiki.com/kinshi-waza


In [35]:
# Persist the filtered rows (without the index column), then reload them from
# disk so df_judo now refers to the filtered data set.
df_filtered.to_csv(path_or_buf='data/judo_techniques_filtered.csv', index=False)
df_judo = pd.read_csv(filepath_or_buffer='data/judo_techniques_filtered.csv')

In [36]:
# Summarize the reloaded frame: row count, column dtypes, and non-null counts
df_judo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Category     114 non-null    object
 1   Subcategory  93 non-null     object
 2   Move_Name    114 non-null    object
 3   Move_Link    114 non-null    object
dtypes: object(4)
memory usage: 3.7+ KB
