In [12]:
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import wordnet as wn

# Ensure every NLTK resource this script relies on is available locally.
# nltk.download() reports already-installed packages as up to date, so
# re-running these calls is harmless.
nltk.download('punkt')                       # models used by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # model used by nltk.pos_tag
# The WordNet corpus backs wn.synsets(), which is used below to keep only
# words that have a WordNet entry; without it the lookup raises LookupError.
nltk.download('wordnet')

# Load the preprocessed corpus; the text of each document is read from its
# 'body' column further down.
preprocessed_data = pd.read_csv('v3_preprocessed_data.csv')

# Define a function to extract adjectives from tokenized words
def extract_adjectives(text):
    """Return the adjectives found in *text*.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Tokens whose part-of-speech tag starts with ``'JJ'``
    and that are longer than two characters are kept, in order of
    appearance and with their original casing.

    Args:
        text: The raw text to analyse. Anything that is not a string
            (for example ``None``, or the NaN that pandas produces for an
            empty CSV cell) and any blank string yields an empty list.

    Returns:
        A list of adjective tokens; empty when there is nothing to tag.
    """
    # Guard against missing values: the tokenizer only accepts strings, and
    # a blank string cannot contain any adjectives anyway.
    if not isinstance(text, str) or not text.strip():
        return []

    words = nltk.word_tokenize(text)  # Tokenize the text directly
    tagged = nltk.pos_tag(words)
    # len(word) > 2 drops very short tokens that were tagged as adjectives.
    return [word for word, pos in tagged if pos.startswith('JJ') and len(word) > 2]

# List of words to exclude
# Entries are compared against word.lower(), so every entry must be written
# in lowercase. Each word is listed exactly once (a duplicated 'person'
# entry was removed).
filter_out = [
    'human', 'person', 'people', 'participant', 'participants', 'individual',
    'individuals', 'persons', 'western', 'finish', 'second', 'third',
    'fourth', 'first', 'minute', 'full', 'mile', 'fifth', 'american', 'sixth',
    'seventh', 'british', 'french', 'female', 'eighth', 'ninth', 'usa', 'moroccan',
    'sweden', 'february', 'u.s.', 'u.k.', 'january', 'italy', 'other', 'top',
    'last', 'more', 'final', 'early', 'few', 'many', 'next', 'much', 'most',
    'overall', 'same', 'own', 'related', 'previous', 'such', 'hearst', 'several',
    'sure', 'clear', 'able', 'local', 'hot', 'right', 'due', 'latest', 'digital',
    'lead', 'additional', 'late', 'least', 'steep', 'past', 'north', 'close',
    'flat', 'real', 'low', 'subscribe', 'whole', 'only', 'international', 'south',
    'total', 'black', 'green', 'free', 'entire', 'warm', 'likely', 'half',
    'vertical', 'trail', 'higher', 'national', 'back', 'eventual', 'main',
    'uphill', 'deep', 'recent', 'live', 'key', 'multiple', 'dry', 'fellow',
    'last-minute', 'further', 'general', '10th', 'common', 'daily', 'rocky',
    'two-time', 'double', 'available', 'croft', 'red', 'open', 'essential',
    'running', 'various', 'prior', 'remote', 'official', 'european', 'guardian',
    'outside', 'massive', 'isn', 'don', 'relative', 'longest', 'automatic', 'ski',
    'particular', 'saturday', 'nick', 'white', 'lake', 'italien', 'average',
    'unable', 'october', 'certain', 'tom', 'xavier', 'outdoor', 'april',
    'following', 'specific', 'shoe', 'chinese', 'northern', 'later', 'added',
    'weekly', 'david', 'mds', 'sarah', 'spanish', 'patrick', 'complete', 'west',
    'opposite', 'italian', 'mere', 'podium', 'appalachian', 'gear', 'anna',
    'swiss', 'separate', 'fewer', 'capable', 'earlier', 'pole', 'sunday',
    'michigan', 'actual', 'ida', 'dark', 'standard', 'spain', 'english', 'leg',
    'cumulative', 'familiar', 'chase', 'monthly', 'muddy', 'lifelong', 'retail',
    'severe', 'hidden', 'olympic', 'useful', 'aware', 'constant', 'loose',
    'initial', 'australia', 'germany', 'hillary', 'tenth', 'glucose', 'closer',
    'rugged', 'diet', 'olympian', 'snow', 'japanese', 'dusty', 'latter', 'friday',
    'uganda', 'dong', 'frozen', 'adam', 'like', 'future', 'portugal', 'czech',
    'liza', 'cross', 'table', 'august', 'new',
]


# Collect every adjective that survives filtering, across all documents.
all_adjectives = []

# Build the exclusion set once: each membership test is then a hash lookup
# instead of a scan over the whole filter_out list for every candidate word.
excluded_words = frozenset(filter_out)

# Iterate over the 'body' column directly; only that column is needed, so
# there is no reason to materialise a full row object per document.
for body in preprocessed_data['body']:
    # Empty CSV cells are loaded as NaN (a float), which cannot be tokenized.
    if not isinstance(body, str):
        continue

    adjectives = extract_adjectives(body)  # Extract adjectives from the text

    # Keep words that are not excluded (compared case-insensitively) and that
    # have at least one WordNet entry. The cheap set test runs first so the
    # WordNet lookup is skipped for excluded words.
    all_adjectives.extend(
        word for word in adjectives
        if word.lower() not in excluded_words and wn.synsets(word)
    )

# Count occurrences of each adjective.
# NOTE(review): counting is case-sensitive while the exclusion check is not,
# so 'Fast' and 'fast' are tallied separately unless the corpus is already
# lowercased - confirm against the preprocessing step.
adjective_count = Counter(all_adjectives)

# Specify the number of most common words to keep
words_to_include = 200
adjectives_to_include = [word for word, _ in adjective_count.most_common(words_to_include)]

# Create a single-column DataFrame of the selected adjectives (most frequent
# first) and save it to a CSV file without the index column.
adjectives_df = pd.DataFrame(adjectives_to_include, columns=['adjective'])
adjectives_df.to_csv('adjectives_from_corpus_filtered.csv', index=False)


[nltk_data] Downloading package punkt to C:\Users\Lenovo
[nltk_data]     Thinkpad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo Thinkpad\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
