In [3]:
import pandas as pd
import re

In [4]:
# Source table of per-player text. The cells below read its "player",
# "wiki_text" and "stats_text" columns; point `input_csv` at a different
# export to clean another file.
input_csv = "player_text_and_stats_columns.csv"
df = pd.read_csv(input_csv)

## Define functions for data cleaning

In [5]:
def clean_wiki_text(text):
    """Strip wiki markup noise from an article body and flatten it to one line.

    Args:
        text: Raw article text, or a missing value (``None`` / ``NaN``).

    Returns:
        The text with ``== ... ==`` style section headers (any level, i.e.
        two or more ``=`` on each side) and http(s) URLs removed, every run
        of whitespace collapsed to a single space, and both ends trimmed.
        Missing input yields ``""``.
    """
    if pd.isna(text):
        return ""
    # Remove section headers such as "== Early life ==" or "=== College ===".
    # Matching a run of two-or-more "=" on each side keeps deeper headings
    # from leaving a stray "=" behind in the cleaned text.
    text = re.sub(r"={2,}.*?={2,}", "", text)
    # Remove any URLs
    text = re.sub(r"https?://\S+", "", text)
    # Normalize all whitespace (including newlines) to single spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# === Clean stats_text ===
def clean_stats_text(text):
    """Collapse a multi-line stats description into a single paragraph.

    Args:
        text: Raw stats text with one entry per line, or a missing value
            (``None`` / ``NaN``).

    Returns:
        The non-blank lines that do not contain "Career season", each
        trimmed of surrounding whitespace and joined with single spaces.
        Missing input yields ``""``.
    """
    if pd.isna(text):
        return ""
    # Trim every line before filtering and joining: this drops the "\r" left
    # behind by Windows line endings and prevents padded lines from
    # producing doubled spaces in the joined paragraph.
    trimmed_lines = (line.strip() for line in text.split("\n"))
    # Remove blank lines and "Career season" summary rows
    kept_lines = [
        line for line in trimmed_lines if line and "Career season" not in line
    ]
    # Join as a single paragraph
    return " ".join(kept_lines)


In [6]:
# Add cleaned versions of both text columns alongside the raw ones.
df["wiki_text_clean"] = df["wiki_text"].apply(clean_wiki_text)
df["stats_text_clean"] = df["stats_text"].apply(clean_stats_text)

# Combine them for chunking: biography first, stats second, separated by a
# blank line. Stripping the result keeps players with no stats (or no
# biography) from carrying a dangling "\n\n" separator.
df["combined_text"] = (
    df["wiki_text_clean"] + "\n\n" + df["stats_text_clean"]
).str.strip()

In [9]:
# Inspect the frame, which now carries the cleaned and combined text
# columns next to the raw ones.
df

Unnamed: 0,player,wiki_text,stats_text,wiki_text_clean,stats_text_clean,combined_text
0,Alaa Abdelnaby,Alaa Abdelnaby (Arabic: علاء عبد النبي; born J...,"In the 1990–91 season, Alaa Abdelnaby played f...",Alaa Abdelnaby (Arabic: علاء عبد النبي; born J...,"In the 1990–91 season, Alaa Abdelnaby played f...",Alaa Abdelnaby (Arabic: علاء عبد النبي; born J...
1,Mahmoud Abdul-Rauf,Mahmoud Abdul-Rauf (born Chris Wayne Jackson; ...,"In the 1990–91 season, Mahmoud Abdul-Rauf play...",Mahmoud Abdul-Rauf (born Chris Wayne Jackson; ...,"In the 1990–91 season, Mahmoud Abdul-Rauf play...",Mahmoud Abdul-Rauf (born Chris Wayne Jackson; ...
2,Tariq Abdul-Wahad,Tariq Abdul-Wahad (born Olivier Michael Saint-...,"In the 1997–98 season, Tariq Abdul-Wahad playe...",Tariq Abdul-Wahad (born Olivier Michael Saint-...,"In the 1997–98 season, Tariq Abdul-Wahad playe...",Tariq Abdul-Wahad (born Olivier Michael Saint-...
3,Shareef Abdur-Rahim,"Julius Shareef Abdur-Rahim (born December 11, ...","In the 1996–97 season, Shareef Abdur-Rahim pla...","Julius Shareef Abdur-Rahim (born December 11, ...","In the 1996–97 season, Shareef Abdur-Rahim pla...","Julius Shareef Abdur-Rahim (born December 11, ..."
4,Alex Abrines,"Alejandro ""Álex"" Abrines Redondo (born 1 Augus...","In the 2016–17 season, Alex Abrines played for...","Alejandro ""Álex"" Abrines Redondo (born 1 Augus...","In the 2016–17 season, Alex Abrines played for...","Alejandro ""Álex"" Abrines Redondo (born 1 Augus..."
...,...,...,...,...,...,...
3382,Jim Zoet,"Jim Zoet (born December 20, 1953) is a Canadia...",,"Jim Zoet (born December 20, 1953) is a Canadia...",,"Jim Zoet (born December 20, 1953) is a Canadia..."
3383,Ivica Zubac,Ivica Zubac ( iv-EET-sa ZOO-bahts; Croatian: [...,"In the 2016–17 season, Ivica Zubac played for ...",Ivica Zubac ( iv-EET-sa ZOO-bahts; Croatian: [...,"In the 2016–17 season, Ivica Zubac played for ...",Ivica Zubac ( iv-EET-sa ZOO-bahts; Croatian: [...
3384,Tristan da Silva,Tristan da Silva (born 15 May 2001) is a Germa...,"In the 2024–25 season, Tristan da Silva played...",Tristan da Silva (born 15 May 2001) is a Germa...,"In the 2024–25 season, Tristan da Silva played...",Tristan da Silva (born 15 May 2001) is a Germa...
3385,Vlatko Čančar,Vlatko Čančar ( CHAHN-char; born 10 April 1997...,,Vlatko Čančar ( CHAHN-char; born 10 April 1997...,,Vlatko Čančar ( CHAHN-char; born 10 April 1997...


In [10]:
# Export only the player name and its combined text; the row index is
# left out of the written file.
export_columns = ["player", "combined_text"]
final_df = df.loc[:, export_columns]
final_df.to_csv("cleaned_player_texts.csv", index=False)