In [59]:
import csv
import datetime
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer


# Source extract to read and filtered extract to write.
input_file_path = r'C:\Users\valhk\Documents\MMA\Text analytics\Team project\Reddit_extracts\walmart_comments.csv'
output_file_path = r'C:\Users\valhk\Documents\MMA\Text analytics\Team project\Reddit_extracts\walmart_blackfriday.csv'

# Label for this extraction; not referenced by the visible cells.
subname = 'blackfriday'

# Fetch the NLTK resources in the same order as before, then build the
# English stop-word set used when normalising comment bodies.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\valhk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\valhk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
# Tokenizer built on the regex `\w+`: each token is a maximal run of regex
# "word" characters, so punctuation and whitespace never appear in a token.
tokenizer = RegexpTokenizer(r'\w+')
# Function to check whether a date falls in the extended Black Friday period
def is_within_extended_period(date):
    """Return True when `date` lies in the extended Black Friday window.

    The window is computed for the calendar year of `date` and runs from
    3 days before Black Friday through 4 days after Black Friday (the day
    after Cyber Monday), both end days included in full.

    Args:
        date: A `datetime.datetime` or `datetime.date`. Only the calendar
            day is used, so the time of day never excludes a comment made
            on the last day of the window.

    Returns:
        bool: True if the day is inside the window, False otherwise.
    """
    # Compare calendar days; comparing against a midnight datetime would
    # drop everything posted after 00:00 on the final day.
    day = date.date() if isinstance(date, datetime.datetime) else date

    # Thanksgiving is the fourth Thursday of November: find the first
    # Thursday (weekday() == 3) and add three weeks.
    november_first = datetime.date(day.year, 11, 1)
    days_to_first_thursday = (3 - november_first.weekday()) % 7
    thanksgiving = november_first + datetime.timedelta(days=days_to_first_thursday + 21)

    # Black Friday is the day after Thanksgiving; the window is anchored on it.
    black_friday = thanksgiving + datetime.timedelta(days=1)
    start_period = black_friday - datetime.timedelta(days=3)
    end_period = black_friday + datetime.timedelta(days=4)
    return start_period <= day <= end_period

# Keyword test used to keep comments that mention the sales events by name
def contains_relevant_keywords(text):
    """Return True if `text` mentions Black Friday or Cyber Monday.

    The match is a case-insensitive substring search for the spaced and
    unspaced spellings of both event names.
    """
    haystack = text.lower()
    for phrase in ('black friday', 'cyber monday', 'blackfriday', 'cybermonday'):
        if phrase in haystack:
            return True
    return False

# Normalised bodies of the comments already written; used to drop duplicates.
seen_bodies = set()

with open(input_file_path, mode='r', newline='', encoding='utf-8') as infile, \
        open(output_file_path, mode='w', newline='', encoding='utf-8') as outfile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames

    # The output keeps the input header, so every written row must use
    # exactly these columns.
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # DictReader stores surplus cells under the key None. Such a row
        # cannot be written back with the input header, so skip it
        # explicitly instead of relying on a swallowed writer error.
        if None in row:
            continue

        # Skip rows with any empty/missing field and moderated comments.
        if not all(row.values()) or row['body'] in ('[deleted]', '[removed]'):
            continue

        # Only the timestamp parse is best-effort: a non-numeric or
        # out-of-range value skips the row. Writer errors are deliberately
        # NOT caught here, so a schema problem surfaces instead of silently
        # producing an empty output file.
        # NOTE(review): 'retrieved_on' is presumably the scrape time rather
        # than the posting time - confirm it is the right column for the
        # date window.
        try:
            epoch_seconds = int(row['retrieved_on'])
            # Naive UTC datetime, same value utcfromtimestamp() produced.
            timestamp = datetime.datetime.fromtimestamp(
                epoch_seconds, tz=datetime.timezone.utc
            ).replace(tzinfo=None)
        except (ValueError, OverflowError, OSError):
            continue

        # Keep 2018-2022 comments that fall in the extended period or
        # mention the events by name.
        if not 2018 <= timestamp.year < 2023:
            continue
        if not (is_within_extended_period(timestamp) or contains_relevant_keywords(row['body'])):
            continue

        # Normalise the body (lower-case, alphabetic tokens, no stop words)
        # to detect duplicates regardless of punctuation or casing.
        word_tokens = tokenizer.tokenize(row['body'].lower())
        filtered_sentence = [word for word in word_tokens if word.isalpha() and word not in stop_words]
        clean_body = ' '.join(filtered_sentence)

        # Register a body only once its row is actually kept, so a rejected
        # comment can never suppress a later qualifying one with the same text.
        # NOTE(review): bodies made only of stop words/digits all normalise
        # to '' and are therefore treated as duplicates of each other.
        if clean_body in seen_bodies:
            continue
        seen_bodies.add(clean_body)

        # The original body is exported; assign row['body'] = clean_body here
        # to export the cleaned text instead.
        row['subreddit_id'] = 'walmart'
        row['retrieved_on'] = timestamp.strftime("%Y-%m-%d")
        writer.writerow(row)