In [1]:
# Importing the necessary libraries to handle the CSV file
import pandas as pd

# Reading the CSV file into a DataFrame
file_path = 'NBA-Shooters-RawData.csv'
df = pd.read_csv(file_path)

# Extracting the contents of the column "Weibo text"
weibo_text_column = df.get('Weibo text', None)

# Check if the "Weibo text" column exists and display its contents
if weibo_text_column is not None:
    weibo_text_contents = weibo_text_column.tolist()
else:
    weibo_text_contents = "The column 'Weibo text' does not exist in the CSV file."

weibo_text_contents[:5], len(weibo_text_contents) if weibo_text_column is not None else 0  # Displaying the first 5 elements and the total length


(['[Three-point shooter] James made long-distance three-pointers during the Lakers era. #NBATucaoConference#O web link',
  "Fee Note: For the 76ers, the NBA trade deadline may be more about reducing the salary cap than adding reinforcements! As long as Embiid stays healthy and Harden continues to pass first and serve as a triple-double threat, they can make progress in the playoffs. Therefore, the 76ers are not expected to make any major lineup changes before the NBA trade deadline at 3 pm on Thursday (4 am on February 10th, Beijing time)! ! ! But they want a backup center, a backup point guard, and a shooter in the trade market. According to a report from Yahoo Sports on Saturday, Jazz big man Vanderbilt is the 76ers' main signing target to replace Embiid. However, the Jazz want a first-round pick to trade the five-year veteran! The 6-foot-9 Vanderbilt is a power forward but has the ability to guard all three frontcourt positions. The Pacers consider him an attractive option. The Jazz

In [2]:
import re

# Lowercasing the text
weibo_text_lower = [text.lower() for text in weibo_text_contents]

# Removing special characters using regular expressions
weibo_text_clean = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in weibo_text_lower]

# Displaying the first 5 cleaned elements and the total length of the cleaned text
weibo_text_clean[:5], len(weibo_text_clean)


(['threepoint shooter james made longdistance threepointers during the lakers era nbatucaoconferenceo web link',
  'fee note for the 76ers the nba trade deadline may be more about reducing the salary cap than adding reinforcements as long as embiid stays healthy and harden continues to pass first and serve as a tripledouble threat they can make progress in the playoffs therefore the 76ers are not expected to make any major lineup changes before the nba trade deadline at 3 pm on thursday 4 am on february 10th beijing time   but they want a backup center a backup point guard and a shooter in the trade market according to a report from yahoo sports on saturday jazz big man vanderbilt is the 76ers main signing target to replace embiid however the jazz want a firstround pick to trade the fiveyear veteran the 6foot9 vanderbilt is a power forward but has the ability to guard all three frontcourt positions the pacers consider him an attractive option the jazz have discussed the possibility of 

In [3]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huanjingheng/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
from nltk.corpus import stopwords

# Tokenizing the cleaned text (splitting the text into words)
weibo_text_tokenized = [text.split() for text in weibo_text_clean]

# Defining a set of English stop words
stop_words = set(stopwords.words('english'))

# Removing stop words from the tokenized text
weibo_text_no_stopwords = [[word for word in tokens if word not in stop_words] for tokens in weibo_text_tokenized]

# Converting the token lists back to strings for further processing or visualization
weibo_text_final = [" ".join(tokens) for tokens in weibo_text_no_stopwords]

# Displaying the first 5 cleaned, tokenized, and stop words removed text entries
weibo_text_final[:5]


['threepoint shooter james made longdistance threepointers lakers era nbatucaoconferenceo web link',
 'fee note 76ers nba trade deadline may reducing salary cap adding reinforcements long embiid stays healthy harden continues pass first serve tripledouble threat make progress playoffs therefore 76ers expected make major lineup changes nba trade deadline 3 pm thursday 4 february 10th beijing time want backup center backup point guard shooter trade market according report yahoo sports saturday jazz big man vanderbilt 76ers main signing target replace embiid however jazz want firstround pick trade fiveyear veteran 6foot9 vanderbilt power forward ability guard three frontcourt positions pacers consider attractive option jazz discussed possibility trading vanderbilt beasley knicks trail blazers hawks package philadelphia may need thirdparty team get involved make trade philadelphia doesnt ton tradable assets 76ers available roster spot might position get player need buyout market according 

In [7]:
# Create a new DataFrame with the cleaned text
cleaned_df = pd.DataFrame({'Cleaned_Weibo_Text': weibo_text_final})

# Save the cleaned DataFrame to a new CSV file
cleaned_file_path = 'CleanedDataOfNBAShooters.csv'
cleaned_df.to_csv(cleaned_file_path, index=False)

cleaned_file_path


'CleanedDataOfNBAShooters.csv'