<a href="https://colab.research.google.com/github/NhaLe411/251MI71_Group05_FinalProject_SourceCode/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from collections import Counter
import io
from datetime import datetime
from google.colab import files

# Silence every warning category so library notices do not clutter the
# notebook output.
warnings.filterwarnings("ignore")

# NLTK setup: download the resources the pipeline needs.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# make word_tokenize look up 'punkt_tab'; with only 'punkt' installed the
# tokenizer raises LookupError on a fresh runtime.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet',
                       'omw-1.4', 'vader_lexicon'):
    nltk.download(_nltk_resource)

# ==== Detect relevant columns ====
def detect_columns(df):
    """Guess which columns of *df* hold the review text, rating and date.

    Column labels are compared case-insensitively against keyword lists; the
    first matching column (in ``df.columns`` order) wins for each role. A
    single column may be selected for more than one role.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).

    Returns:
        dict with keys ``'text'``, ``'rating'`` and ``'date'``; each value is
        the original column label, or ``None`` when nothing matched.
    """
    text_keys = ('review', 'comment', 'text', 'content')
    rating_keys = ('rating', 'score', 'stars')
    date_exact = ('traveldate', 'season_tag')
    date_keys = ('date', 'time', 'created')

    col_map = {'text': None, 'rating': None, 'date': None}
    for col in df.columns:
        # Labels are not always strings (headerless CSVs give ints, Excel
        # headers can be numbers/dates), so coerce before lower-casing.
        col_l = str(col).lower()
        if col_map['text'] is None and any(k in col_l for k in text_keys):
            col_map['text'] = col
        if col_map['rating'] is None and any(k in col_l for k in rating_keys):
            col_map['rating'] = col
        if col_map['date'] is None and (
            col_l in date_exact or any(k in col_l for k in date_keys)
        ):
            col_map['date'] = col
    return col_map

# ==== Text cleaning, tokenizing ====
def clean_text(text):
    """Normalise a raw review string for tokenising and sentiment scoring.

    Steps, in order: strip HTML tags, URLs, ``www.`` links and e-mail
    addresses; replace characters other than word characters, whitespace and
    ``. , ! ? ; : ' -`` with a space; drop numbers unless they are followed by
    "star" or "rating"; collapse whitespace; lower-case and trim.

    Args:
        text: Any value; missing values (``None``/NaN) yield ``""``.

    Returns:
        The cleaned, lower-cased string.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'<[^>]+>', ' ', text)          # HTML tags
    text = re.sub(r'http[s]?://\S+', ' ', text)   # http(s) URLs
    text = re.sub(r'www\.\S+', ' ', text)         # bare www links
    text = re.sub(r'\S+@\S+', ' ', text)          # e-mail addresses
    text = re.sub(r'[^\w\s.,!?;:\'-]', ' ', text)
    # Remove numbers except those that introduce a rating ("5 stars").
    # - The leading ``\d`` alternative in the lookahead stops the regex from
    #   backtracking to a digit prefix, which used to turn "10 stars" into
    #   "0 stars".
    # - IGNORECASE is needed because lower-casing only happens at the end, so
    #   "5 Stars" would otherwise lose its number.
    # NOTE(review): the ``/\d`` alternative cannot fire because "/" has
    # already been replaced by the previous substitution - confirm whether
    # "4/5"-style ratings were meant to survive.
    text = re.sub(r'\d+(?!\d|\s*(?:star|rating|/\d))', ' ', text,
                  flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# English stop words plus domain words that appear in almost every hotel
# review and therefore carry no signal.
stop_words = set(stopwords.words('english')) | {
    'hotel', 'room', 'stay', 'place', 'time', 'day', 'night', 'would', 'could',
}


def tokenize(text):
    """Split *text* into word tokens and keep only the informative ones.

    A token is kept when it is purely alphabetic, longer than two characters
    and not present in ``stop_words``.
    """
    kept = []
    for token in word_tokenize(text):
        if token.isalpha() and len(token) > 2 and token not in stop_words:
            kept.append(token)
    return kept

lemmatizer = WordNetLemmatizer()


def lemmatize(tokens):
    """Return a list with each token in *tokens* run through the WordNet lemmatizer."""
    return list(map(lemmatizer.lemmatize, tokens))

# ==== Sentiment & NPS ====
analyzer = SentimentIntensityAnalyzer()


def analyze_sentiment(text):
    """Score *text* with a weighted blend of VADER and TextBlob.

    The blended score is 70% VADER compound score plus 30% TextBlob polarity.
    Scores of at least 0.1 are labelled 'positive', scores of at most -0.1
    'negative', everything in between 'neutral'.

    Returns:
        pandas Series of ``[label, score, abs(score)]``; the absolute value
        is used as the confidence.
    """
    compound = analyzer.polarity_scores(text)['compound']
    polarity = TextBlob(text).sentiment.polarity
    blended = compound * 0.7 + polarity * 0.3

    if blended >= 0.1:
        verdict = 'positive'
    elif blended <= -0.1:
        verdict = 'negative'
    else:
        verdict = 'neutral'
    return pd.Series([verdict, blended, abs(blended)])

def estimate_rating(label, score, conf):
    """Derive a rating clamped to [1, 10] from a sentiment result.

    Args:
        label: 'positive', 'negative', or anything else (treated as neutral).
        score: Sentiment score used to shift the base rating.
        conf: Confidence; the base rating is scaled by ``0.7 + conf * 0.3``.

    Returns:
        The scaled rating, limited to the range 1..10.
    """
    if label == 'negative':
        base = 3.0 + (score + 1.0) * 2
    else:
        anchor = 8.0 if label == 'positive' else 6.0
        base = anchor + (score * 2)

    weighted = base * (0.7 + conf * 0.3)
    return max(1, min(10, weighted))

def get_nps_category(rating):
    """Map a rating to its NPS bucket.

    Ratings of 9 or more are 'Promoter', 7 or more 'Passive', and everything
    else (including values that fail both comparisons) 'Detractor'.
    """
    for floor, category in ((9, 'Promoter'), (7, 'Passive')):
        if rating >= floor:
            return category
    return 'Detractor'

def get_season(date):
    """Return the season name for *date* based on its ``month`` attribute.

    Missing values (as judged by ``pd.isna``) give 'Unknown'. December to
    February is 'Winter', March to May 'Spring', June to August 'Summer' and
    September to November 'Fall'.
    """
    if pd.isna(date):
        return 'Unknown'

    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }
    return season_by_month.get(date.month)

# ==== Upload & Run ====
# Ask the user for a file through the Colab upload widget; the result maps
# each uploaded file name to its bytes.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty/cancelled upload fails with a bare IndexError.
    raise ValueError("No file was uploaded; please select a CSV or Excel file.")

# Only the first uploaded file is processed.
filename = list(uploaded.keys())[0]
# Compare the extension case-insensitively so names such as 'DATA.CSV' are
# still parsed as CSV instead of being handed to the Excel reader.
if filename.lower().endswith('.csv'):
    df = pd.read_csv(io.BytesIO(uploaded[filename]))
else:
    df = pd.read_excel(io.BytesIO(uploaded[filename]))

# Detect columns
col_map = detect_columns(df)
text_col = col_map['text']
rating_col = col_map['rating']
date_col = col_map['date']

# Every later step needs the review text; fail with an actionable message
# instead of the obscure ``KeyError: None`` that ``df[None]`` would raise.
if text_col is None:
    raise ValueError(
        "Could not find a review text column; expected a column name containing "
        "'review', 'comment', 'text' or 'content'. Columns found: "
        f"{list(df.columns)}"
    )

# Keep only rows whose review text is present and not an empty string.
df = df[df[text_col].notna() & (df[text_col] != '')].copy()

# Preprocess: each step reads the column produced by the previous one
# (raw text -> cleaned text -> tokens -> lemmas -> space-joined lemmas).
for _target, _source, _step in (
    ('cleaned_text', text_col, clean_text),
    ('tokens', 'cleaned_text', tokenize),
    ('lemmatized', 'tokens', lemmatize),
    ('processed_text', 'lemmatized', lambda x: ' '.join(x)),
):
    df[_target] = df[_source].apply(_step)

# Sentiment: analyze_sentiment returns three values per row, which are spread
# over the three sentiment columns.
sentiment_columns = ['sentiment_label', 'sentiment_score', 'sentiment_confidence']
df[sentiment_columns] = df['cleaned_text'].apply(analyze_sentiment)

# Rating: use the dataset's own rating column when one was detected,
# otherwise estimate a rating from the sentiment result.
if rating_col:
    # Coerce to numbers first: ratings stored as text would otherwise make the
    # numeric comparisons in get_nps_category raise TypeError. Values that
    # cannot be parsed become NaN.
    # NOTE(review): get_nps_category uses thresholds 9 and 7, so a rating
    # column on a 1-5 scale would be classified entirely as 'Detractor' -
    # confirm the scale of the input data.
    df['final_rating'] = pd.to_numeric(df[rating_col], errors='coerce')
else:
    df['final_rating'] = df.apply(
        lambda row: estimate_rating(
            row['sentiment_label'],
            row['sentiment_score'],
            row['sentiment_confidence'],
        ),
        axis=1,
    )

# NPS
df['nps_category'] = df['final_rating'].apply(get_nps_category)

# Season (use the season_tag column if it exists)
if 'season_tag' in df.columns:
    season_tags = df['season_tag']
    # astype(str) would turn missing tags into the literal season 'Nan';
    # label them 'Unknown' instead, matching what get_season returns for
    # missing dates.
    df['season'] = (
        season_tags.astype(str).str.strip().str.title()
        .where(season_tags.notna(), 'Unknown')
    )
elif date_col:
    # Unparseable dates become NaT, which get_season maps to 'Unknown'.
    df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
    df['season'] = df['parsed_date'].apply(get_season)
else:
    df['season'] = 'Unknown'

# Keywords: count every lemma across all reviews and store the 15 most
# frequent ones, comma-separated, as the same value on every row.
all_tokens = []
for _row_lemmas in df['lemmatized']:
    all_tokens.extend(_row_lemmas)

freq = Counter(all_tokens)
common_keywords = ', '.join(word for word, _count in freq.most_common(15))
df['top_keywords'] = common_keywords

# Analyze seasonal trends: one summary row per season with the mean rating,
# the share of positive/negative reviews and the NPS
# ((promoters - detractors) / reviews, as a percentage).
print("üìä Seasonal Rating Summary:")
season_stats = []

for season_name, group in df.groupby('season'):
    total = len(group)
    sentiment = group['sentiment_label']
    nps_labels = group['nps_category']

    avg_rating = group['final_rating'].mean()
    pos_pct = sentiment.eq('positive').sum() / total * 100
    neg_pct = sentiment.eq('negative').sum() / total * 100
    promoters = nps_labels.eq('Promoter').sum()
    detractors = nps_labels.eq('Detractor').sum()
    nps = ((promoters - detractors) / total) * 100

    season_stats.append([
        season_name,
        round(avg_rating, 2),
        f"{pos_pct:.1f}%",
        f"{neg_pct:.1f}%",
        round(nps),
    ])

season_df = pd.DataFrame(
    season_stats,
    columns=['Season', 'Avg Rating', '% Positive', '% Negative', 'NPS'],
)
print(season_df.to_string(index=False))

# Export: write the selected columns to a timestamped CSV and hand it to the
# browser for download. The date column, when one was detected, goes right
# after the original text column.
_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_filename = f"hotel_review_processed_{_timestamp}.csv"

columns_to_export = [text_col]
if date_col:
    columns_to_export.append(date_col)
columns_to_export += [
    'cleaned_text', 'processed_text', 'sentiment_label',
    'sentiment_score', 'sentiment_confidence', 'final_rating',
    'nps_category', 'season',
]

df[columns_to_export].to_csv(output_filename, index=False)
files.download(output_filename)


ModuleNotFoundError: No module named 'vaderSentiment'

In [None]:
# Install the vaderSentiment package. The first cell imports it, so this cell
# must be run before that one (the ModuleNotFoundError recorded above is what
# happens when the import runs first).
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 5.5 MB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
