In [1]:
import pandas as pd
# Load the new version of the CSV file into a DataFrame.
# Note: this reads the whole file (not just a few rows); later cells iterate over every post.
try:
    preview_data_new = pd.read_csv('CleanedDataOfNBAShooters.csv')
except pd.errors.ParserError as e:
    # Fail loudly instead of storing the error text in `preview_data_new`:
    # later cells index it like a DataFrame, so a string placeholder would only
    # surface as a confusing TypeError far away from the real cause.
    raise pd.errors.ParserError(f"Error while reading the file: {str(e)}") from e

# Last expression of the cell: rich (truncated) display of the loaded frame.
preview_data_new


Unnamed: 0,Cleaned_Weibo_Text
0,Threepoint shooter James made longdistance thr...
1,Fee Note For the 76ers the NBA trade deadline ...
2,NBA McCollum made a sudden stop and hit a thre...
3,The Lakers 4for2 trade plan involved Westbrook...
4,NBA legend Vince Carter talked about the Nets ...
...,...
1934,NBADream1TeamThe best team for January 3 Mitch...
1935,Hyde Sports Morning NewsFootball War Report Li...
1936,NBA Zero Distance Game Review Brunson made thr...
1937,happynewyear After the Warriors won their four...


In [2]:
from collections import Counter
# Tokenize every post on whitespace and count the occurrences of each token.
word_counter_new = Counter()
# .dropna() skips missing posts: pandas reads empty CSV cells as NaN (a float),
# which has no .split() and would otherwise raise AttributeError.
for text in preview_data_new['Cleaned_Weibo_Text'].dropna():
    words = text.split()
    word_counter_new.update(words)

# Keep only the title-cased tokens (potential proper nouns).
# NOTE: str.istitle() is False for mixed-case names such as "McCollum" and for
# all-caps tokens such as "NBA", so those tokens are excluded from this tally.
capitalized_word_counter = {word: count for word, count in word_counter_new.items() if word.istitle()}

# The 20 most frequent title-cased tokens, for a quick review.
# sorted() is stable, so tokens with equal counts keep their first-seen order.
most_common_capitalized_words = sorted(capitalized_word_counter.items(), key=lambda x: x[1], reverse=True)[:20]
most_common_capitalized_words


[('The', 3129),
 ('I', 2113),
 ('In', 1400),
 ('He', 1365),
 ('Lakers', 1005),
 ('Warriors', 842),
 ('Curry', 822),
 ('This', 700),
 ('James', 615),
 ('Heat', 595),
 ('If', 590),
 ('It', 548),
 ('After', 531),
 ('But', 519),
 ('Nuggets', 513),
 ('Weibo', 396),
 ('They', 370),
 ('No', 339),
 ('When', 324),
 ('Suns', 321)]

In [3]:

# Title-cased tokens to ignore when ranking capitalized words: common English
# stop words plus basketball/platform vocabulary. Each entry is listed once.
stop_words = {
    # Pronouns, determiners and question words
    "I", "Me", "My", "Myself", "We", "Our", "Ours", "Ourselves",
    "You", "You're", "You've", "You'll", "You'd", "Your", "Yours", "Yourself", "Yourselves",
    "He", "Him", "His", "Himself", "She", "She's", "Her", "Hers", "Herself",
    "It", "It's", "Its", "Itself", "They", "Them", "Their", "Theirs", "Themselves",
    "What", "Which", "Who", "Whom", "This", "That", "That'll", "These", "Those",
    # Auxiliary verbs
    "Am", "Is", "Are", "Was", "Were", "Be", "Been", "Being",
    "Have", "Has", "Had", "Having", "Do", "Does", "Did", "Doing",
    # Articles, conjunctions and prepositions
    "A", "An", "The", "And", "But", "If", "Or", "Because", "As", "Until", "While",
    "Of", "At", "By", "For", "With", "About", "Against", "Between", "Into", "Through",
    "During", "Before", "After", "Above", "Below", "To", "From", "Up", "Down", "In", "Out",
    "On", "Off", "Over", "Under",
    # Adverbs, quantifiers and modals
    "Again", "Further", "Then", "Once", "Here", "There", "When", "Where", "Why", "How",
    "All", "Any", "Both", "Each", "Few", "More", "Most", "Other", "Some", "Such",
    "No", "Nor", "Not", "Only", "Own", "Same", "So", "Than", "Too", "Very",
    "Can", "Will", "Just", "Should", "Should've", "Now",
    # Contractions and contraction fragments
    "S", "T", "D", "LL", "M", "O", "Re", "Ve", "Y", "Ma",
    "Don", "Don't", "Ain", "Aren", "Aren't", "Couldn", "Couldn't", "Didn", "Didn't",
    "Doesn", "Doesn't", "Hadn", "Hadn't", "Hasn", "Hasn't", "Haven", "Haven't",
    "Isn", "Isn't", "Mightn", "Mightn't", "Mustn", "Mustn't", "Needn", "Needn't",
    "Shan", "Shan't", "Shouldn", "Shouldn't", "Wasn", "Wasn't", "Weren", "Weren't",
    "Won", "Won't", "Wouldn", "Wouldn't",
    # Sentence connectives
    "Although", "However",
    # Team names (multi-word names appear as separate tokens, e.g. "Trail", "Blazers")
    "Lakers", "Warriors", "Heat", "Heats", "Nuggets", "Suns", "Celtics", "Rockets", "Clippers",
    "Bucks", "Nets", "Raptors", "Thunder", "Jazz", "Kings", "Pacers", "Pelicans", "Pistons",
    "Timberwolves", "Trail", "Blazers", "Magic", "Mavericks", "Grizzlies", "Hawks", "Hornets",
    "Bulls", "Cavaliers", "Knicks", "Spurs", "76ers", "Wizards",
    # League, competition and platform terms
    "NBA", "Video", "Weibo", "Conference", "League", "Finals", "Basketball", "Cup", "World",
    "Western",
}

# Remove every stop word from the capitalized-word tally; counts are kept as-is.
filtered_word_counter = dict(
    filter(lambda entry: entry[0] not in stop_words, capitalized_word_counter.items())
)

# Rank the remaining words by count, highest first, and keep the top 20 for a quick review.
most_common_filtered_words = sorted(
    filtered_word_counter.items(),
    key=lambda entry: entry[1],
    reverse=True,
)[:20]
most_common_filtered_words


[('Curry', 822),
 ('James', 615),
 ('Jordan', 312),
 ('Jokic', 311),
 ('Green', 271),
 ('Durant', 263),
 ('Harden', 249),
 ('Westbrook', 220),
 ('Kobe', 193),
 ('Irving', 189),
 ('Thompson', 173),
 ('Paul', 172),
 ('Murray', 167),
 ('Russell', 167),
 ('Allen', 155),
 ('Davis', 154),
 ('Butler', 150),
 ('Poole', 144),
 ('Ray', 141),
 ('Lillard', 137)]