## DATA PREPARATION

This notebook is intended to be run only once, to generate `cleaned_data.csv` and `subj_polarity_data.csv`. These datasets are then used by the other notebooks for further processing and for training the models.

In [None]:
import pandas as pd
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
# Make sure the 'punkt' and 'stopwords' NLTK packages are available locally
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for word_tokenize — confirm against the installed version
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# Read the raw dataset and discard every row whose 'body' is missing (NaN)
df = pd.read_csv('../data/data.csv').dropna(subset=['body'])

# Cache for the stopword set so the NLTK corpus is read once, not once per token
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them from NLTK on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


# Data cleaning function: removes symbols, non-alphabetic tokens and stopwords
def clean_text(text):
    """Normalize raw text into a space-separated string of lowercase alphabetic tokens.

    Steps: strip the symbols ``*``, ``$`` and ``~``; tokenize with NLTK's
    ``word_tokenize``; keep only tokens for which ``str.isalpha()`` is true
    (dropping numbers and punctuation); lowercase them; remove English stopwords.

    Args:
        text: The raw text. Any non-string value is treated as missing.

    Returns:
        The cleaned text, or an empty string when ``text`` is not a string.
    """
    # Non-string inputs yield a blank string instead of raising
    if not isinstance(text, str):
        return ""
    # Remove symbols like *, $, ~ before tokenizing
    text = re.sub(r'[\*\$~]', '', text)
    # Lowercase and keep only alphabetic words
    tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
    # Set membership is O(1); the stopword set is loaded once and reused across calls
    stop_words = _english_stopwords()
    return ' '.join(word for word in tokens if word not in stop_words)

# Run the cleaning function over every entry of the 'body' column
cleaned_bodies = df['body'].map(clean_text)
df['cleaned_text'] = cleaned_bodies

# Preview the first rows, now including 'cleaned_text'
print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Neel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Neel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Unnamed: 0                                              title  score  \
0           0                      Are we in a recession or not?      2   
1           1    What Are Your Moves Tomorrow, February 26, 2024      1   
2           2                             Thank you Nancy Pelosi    113   
3           3  Imitator's Reckoning: $AI Anticipating a Downf...     17   
4           4                  What are the AI derivative plays?      9   

        id       subreddit                                                url  \
0  1azytar  wallstreetbets               https://i.redd.it/8xp7l2atpskc1.jpeg   
1  1azyssf  wallstreetbets  https://www.reddit.com/r/wallstreetbets/commen...   
2  1azxsze  wallstreetbets               https://i.redd.it/symyx8lniskc1.jpeg   
3  1azx26v  wallstreetbets  https://www.reddit.com/r/wallstreetbets/commen...   
4  1azvuxx  wallstreetbets  https://www.reddit.com/r/wallstreetbets/commen...   

   num_comments                                               

In [16]:
# Write the DataFrame to a new CSV file; index=False keeps the DataFrame index
# from being saved as a separate column
cleaned_csv_path = '../data/cleaned_data.csv'
df.to_csv(cleaned_csv_path, index=False)

print("Dataset saved")

Dataset saved


In [17]:
# Subjectivity and Polarity Detection
def detect_sentiment(text):
    """Score a text with TextBlob and return its subjectivity and polarity.

    Args:
        text: The string to analyse.

    Returns:
        A 2-tuple ``(subjectivity, polarity)`` — subjectivity comes first.
    """
    # Take both scores from the single `sentiment` result: reading
    # `blob.subjectivity` and `blob.polarity` separately runs the analysis twice.
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity, sentiment.polarity

# Apply sentiment detection to every cleaned text.
# The (subjectivity, polarity) pairs are collected into an index-aligned frame
# rather than unpacked with zip(*...), which raises ValueError when df has no rows.
sentiment_scores = pd.DataFrame(
    df['cleaned_text'].apply(detect_sentiment).tolist(),
    index=df.index,
    columns=['subjectivity', 'polarity'],
)
df['subjectivity'] = sentiment_scores['subjectivity']
df['polarity'] = sentiment_scores['polarity']

# Display the dataframe
print(df.head())

   Unnamed: 0                                              title  score  \
0           0                      Are we in a recession or not?      2   
1           1    What Are Your Moves Tomorrow, February 26, 2024      1   
2           2                             Thank you Nancy Pelosi    113   
3           3  Imitator's Reckoning: $AI Anticipating a Downf...     17   
4           4                  What are the AI derivative plays?      9   

        id       subreddit                                                url  \
0  1azytar  wallstreetbets               https://i.redd.it/8xp7l2atpskc1.jpeg   
1  1azyssf  wallstreetbets  https://www.reddit.com/r/wallstreetbets/commen...   
2  1azxsze  wallstreetbets               https://i.redd.it/symyx8lniskc1.jpeg   
3  1azx26v  wallstreetbets  https://www.reddit.com/r/wallstreetbets/commen...   
4  1azvuxx  wallstreetbets  https://www.reddit.com/r/wallstreetbets/commen...   

   num_comments                                               

In [18]:

# Save with a path relative to the notebook, like the other datasets, instead of
# an absolute path under one user's profile, so the notebook runs on any machine.
# NOTE(review): assumes '../data' resolves to the repository's data directory, as it does for data.csv — confirm
# index=False avoids saving the DataFrame index as a separate column
df.to_csv('../data/subj_polarity_data.csv', index=False)

print("Dataset with initial subjectivity and polarity saved")

Dataset with initial subjectivity and polarity saved
