# Sentiment Analysis and Text Preprocessing

### Imports and Setup

In [1]:
import string
import logging
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
from nltk.sentiment.util import mark_negation

## Step 1: Download Resources

In [3]:
# Download resources
# The root logger defaults to WARNING, so logging.info() calls are silently
# dropped unless the level is lowered. basicConfig() installs a handler when
# none exists; setLevel() covers the case where one was already installed.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

logging.info("Downloading Resources...")
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on NLTK >= 3.9;
# 'punkt' is kept for older NLTK releases.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# spaCy small English model, used for lemmatization.
# Install it once with: "python -m spacy download en_core_web_sm"
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Step 2: Load Data

In [10]:
# Read the review CSV without treating its first row as a header;
# the loaded column is labelled 'raw_reviews'.
df = pd.read_csv(
    'spotify_reviews_lightweight.csv',
    names=['raw_reviews'],
    header=None,
)

## Step 3: Define Preprocessing Function

In [11]:
# Preprocessing function
_NEG_SUFFIX = '_NEG'  # marker mark_negation appends to tokens inside a negation scope
_PUNCTUATION = frozenset(string.punctuation)
_STOP_WORDS = None  # English stop words, loaded lazily on first use


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _is_noise(token, stop_words):
    """Tell whether a (possibly negation-marked) token is a stop word or pure punctuation."""
    # Judge the underlying word: 'the_NEG' and ',_NEG' are still noise.
    base = token[:-len(_NEG_SUFFIX)] if token.endswith(_NEG_SUFFIX) else token
    # The NLTK stop-word list is lowercase, so compare case-insensitively.
    if base.lower() in stop_words:
        return True
    # Per-character check so multi-character tokens such as '...' are caught;
    # an empty base also counts as noise.
    return all(char in _PUNCTUATION for char in base)


def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmas.

    Steps: NLTK word tokenization, negation marking (tokens inside a negation
    scope receive a ``_NEG`` suffix), removal of stop words and punctuation
    tokens, then spaCy lemmatization of the surviving tokens.

    Args:
        text: Raw review text. Non-string values (``None``, or the NaN pandas
            yields for an empty CSV cell) and blank strings are treated as an
            empty review instead of raising inside the tokenizer.

    Returns:
        str: The lemmas joined by single spaces, or ``''`` when nothing
        survives filtering.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    # Negation must be marked before stop-word removal, because words such as
    # "not" are themselves stop words and would otherwise vanish first.
    tokens = mark_negation(word_tokenize(text))

    stop_words = _english_stop_words()
    kept_tokens = [t for t in tokens if not _is_noise(t, stop_words)]
    if not kept_tokens:
        return ''

    # spaCy re-tokenizes the joined string, so its token boundaries may differ
    # from NLTK's.
    lemmatized_tokens = [token.lemma_ for token in nlp(' '.join(kept_tokens))]

    return ' '.join(lemmatized_tokens)


## Step 4: Apply Preprocessing to Dataset

In [13]:
# Run every raw review through preprocess_text and store the cleaned text
# alongside the original in a new column.
df['processed_reviews'] = df['raw_reviews'].map(preprocess_text)

None


## Step 5: Save Processed Data

In [14]:
# Write the frame (raw and processed columns) to disk without the index column,
# then log where it went.
output_file = 'reviews_preprocessed.csv'
df.to_csv(path_or_buf=output_file, index=False)
logging.info(f"Processed data saved to {output_file}.")

## Step 6: Preview Preprocessing Results

In [16]:
# Show the first 50 characters of the first review, before and after preprocessing
for heading, column in (
    ("Unprocessed Text (first few words):", 'raw_reviews'),
    ("\nProcessed Text (first few words):", 'processed_reviews'),
):
    print(heading)
    print(df[column].iloc[0][:50])

Unprocessed Text (first few words):
Great music service, the audio is high quality and

Processed Text (first few words):
great music service audio high quality app easy us
