In [None]:
import pandas as pd
import numpy as np
import re
import emoji


# === INPUT ===
input_path = "../datasets/scraped_data/scraped_tripadvisor_ethiopian_airlines_reviews.csv"

# === Step 1: Load raw dataset ===
# Read the scraped reviews CSV into a DataFrame.
df = pd.read_csv(input_path)

# Step 2: Drop unwanted columns
# errors='ignore' skips any listed column that the file does not contain.
df.drop(columns=['Cleanliness', 'Check-in and boarding'], errors='ignore', inplace=True)

# Step 3: Fix 'Date of Travel' format
def fix_date(val):
    """Normalize a 'Mon-YY' travel date (e.g. 'Jan-23') to 'Mon-YYYY'.

    Args:
        val: Raw cell value, expected to be a string such as 'Jan-23'.

    Returns:
        The date formatted as '%b-%Y' when ``val`` parses with the '%b-%y'
        format; otherwise ``val`` unchanged, so later steps can decide how to
        treat unparseable entries.
    """
    try:
        parsed = pd.to_datetime(val, format='%b-%y')
    except (ValueError, TypeError):
        # Only parsing failures are tolerated; unrelated exceptions
        # (e.g. KeyboardInterrupt) are allowed to propagate.
        return val
    # Missing-like input (None, and possibly '' or 'nan') may come back as
    # None/NaT instead of raising; neither can be formatted.
    if parsed is None or parsed is pd.NaT:
        return val
    return parsed.strftime('%b-%Y')

if 'Date of Travel' in df.columns:
    # Convert every cell to trimmed text first so fix_date always receives a string.
    travel_dates = df['Date of Travel'].astype(str).str.strip()
    df['Date of Travel'] = travel_dates.apply(fix_date)

# Step 4: Reorder important columns
# Priority columns that exist come first (in the order listed); every other
# column follows in its current order.
priority_cols = ['Date of Travel', 'Reviewer Name', 'Review Title', 'Review Location', 'Comment']
leading_cols = [col for col in priority_cols if col in df.columns]
trailing_cols = [col for col in df.columns if col not in priority_cols]
df = df[leading_cols + trailing_cols]

# Step 5: Create Seat_Comfort column from Legroom & Seat comfort
# Row-wise mean of the two ratings. DataFrame.mean skips NaN by default, so a
# row with only one rating keeps that rating and a row with neither stays NaN.
# The vectorized call replaces a row-wise apply, which can return a DataFrame
# (not a Series) when the frame has no rows and then fails on assignment.
if all(col in df.columns for col in ['Legroom', 'Seat comfort']):
    df['Seat_Comfort'] = df[['Legroom', 'Seat comfort']].mean(axis=1)

# Step 6: Drop unwanted columns and round Seat_Comfort
df.drop(columns=[col for col in ['Unnamed: 14', 'Legroom', 'Seat comfort'] if col in df.columns], inplace=True)
if 'Seat_Comfort' in df.columns:
    # NOTE(review): Python's round() sends halves to the nearest even integer
    # (2.5 -> 2, 3.5 -> 4); confirm that this is the intended rating rule.
    df['Seat_Comfort'] = df['Seat_Comfort'].apply(lambda x: round(x) if pd.notnull(x) else np.nan)

# Step 7: Fill NaNs in rating columns using rounded median
rating_cols = ['Food and Beverage', 'In-flight Entertainment', 'Value for money', 'Seat_Comfort', 'Customer service']
for col in rating_cols:
    # Nothing to do for absent columns or columns without gaps.
    if col not in df.columns or not df[col].isnull().any():
        continue
    col_median = df[col].median()
    # A column with no ratings at all has a NaN median, and rounding NaN can
    # raise ValueError; leave such a column untouched instead of crashing.
    if pd.isnull(col_median):
        continue
    df[col] = df[col].fillna(round(col_median))

# === ✅ Option 2: Fix 'Date of Travel' safely before splitting
# Empty, missing, or malformed values become the 'unknown-unknown' placeholder.
# A well-formed value has exactly one '-' (Month-Year); anything else would
# not split into exactly two parts, so it is treated as malformed.
df['Date of Travel'] = df['Date of Travel'].replace('', 'unknown-unknown')  # fix empty string
df['Date of Travel'] = df['Date of Travel'].fillna('unknown-unknown')       # fix NaN
df['Date of Travel'] = df['Date of Travel'].apply(
    lambda x: x if str(x).count('-') == 1 else 'unknown-unknown'
)  # fix malformed

# Step 8: Split 'Date of Travel' into Month and Year
# n=1 caps the split at two pieces so the result always fits the two target columns.
df[['Month', 'Year']] = df['Date of Travel'].str.split('-', n=1, expand=True)
df['Year'] = df['Year'].replace('', 'unknown')
# Expand a two-digit year such as '23' to '2023'; other values pass through.
df['Year'] = df['Year'].apply(lambda x: '20' + x if pd.notnull(x) and str(x).isdigit() and len(str(x)) == 2 else x)

# Replace 'Date of Travel' with 'Month' and 'Year' in correct position
date_index = df.columns.get_loc('Date of Travel')
df.drop(columns='Date of Travel', inplace=True)
# Year is inserted first and Month second at the same index, which leaves
# the final order as Month, Year.
df.insert(date_index, 'Year', df.pop('Year'))
df.insert(date_index, 'Month', df.pop('Month'))

# Step 9: Split 'Review Location' into From and Destination
df['Review Location'] = df['Review Location'].fillna('unknown - unknown')
# Split on the first ' - ' only, so a destination containing ' - ' stays whole.
split_loc = df['Review Location'].str.split(' - ', n=1, expand=True)
df['From'] = split_loc[0]
# When no row contains ' - ', the expanded split has a single column; fall
# back to NaN instead of raising KeyError on the missing second column.
df['Destination'] = split_loc[1] if 1 in split_loc.columns else np.nan

# Replace 'Review Location' with 'From' and 'Destination' in correct position
loc_index = df.columns.get_loc('Review Location')
df.drop(columns='Review Location', inplace=True)
# Destination is inserted first and From second at the same index, which
# leaves the final order as From, Destination.
df.insert(loc_index, 'Destination', df.pop('Destination'))
df.insert(loc_index, 'From', df.pop('From'))

# Step 10: Drop columns 'Reviewer Name', 'General Rating'
# errors='ignore' skips whichever of the two is not present.
df.drop(columns=['Reviewer Name', 'General Rating'], errors='ignore', inplace=True)

# Step 11: Normalize column names
# Trim, lower-case, then turn spaces and hyphens into underscores.
normalized_names = df.columns.str.strip().str.lower()
normalized_names = normalized_names.str.replace(' ', '_')
normalized_names = normalized_names.str.replace('-', '_')
df.columns = normalized_names

# Give selected columns their final names; keys that are absent are ignored by rename.
final_names = {
    'from': 'departure_city',
    'destination': 'arrival_city',
    'comment': 'review_comment',
    'in_flight_entertainment': 'inflight_entertainment',
    'food_and_beverage': 'food_and_beverages',
}
df.rename(columns=final_names, inplace=True)

# Step 12: Drop invalid rows
# Remove rows whose month is the 'unknown' placeholder, then rows whose
# review title is missing or blank.
month_is_known = df['month'].astype(str).str.strip().str.lower() != 'unknown'
df = df[month_is_known]
title_is_blank = df['review_title'].isnull() | (df['review_title'].astype(str).str.strip() == '')
df = df[~title_is_blank]


# Step 12.1: Drop duplicate rows
# The index is reset only after every row-dropping step so that the final
# index is contiguous; resetting before de-duplication would leave gaps.
df = df.drop_duplicates().reset_index(drop=True)


# Step 12.2: Clean review text fields
def clean_text(text):
    """Reduce a text field to ASCII letters, digits, '/' and single spaces.

    Processing order: strip HTML-like tags, URLs and emoji; turn '&' into
    'and' and hyphens/dashes into spaces; drop every remaining character that
    is not an ASCII letter, digit, whitespace or '/'; finally collapse
    whitespace runs and trim both ends.

    Args:
        text: Raw cell value; null values (None/NaN) are accepted.

    Returns:
        The cleaned string, or "" for a null input.
    """
    if pd.isnull(text):
        return ""
    text = str(text)
    text = re.sub(r'<.*?>', '', text)                     # HTML-like tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)   # URLs
    text = emoji.replace_emoji(text, replace='')
    text = text.replace('&', 'and')
    # One pass covers the ASCII hyphen as well as en/em dashes.
    text = re.sub(r'[-–—]', ' ', text)
    # Single allow-list pass: underscores, non-ASCII letters and all other
    # symbols are removed.
    # NOTE(review): sentence punctuation (. , ? !) is removed too; confirm
    # that this is intended.
    text = re.sub(r'[^A-Za-z0-9\s/]', '', text)
    # Collapse whitespace last: removing characters above can leave adjacent
    # or edge spaces, which would survive if this ran any earlier.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

columns_to_clean = ['review_title', 'departure_city', 'arrival_city', 'review_comment']
# Run clean_text over each listed text column; columns that are absent are skipped.
for col in columns_to_clean:
    if col not in df.columns:
        continue
    cleaned_values = df[col].apply(clean_text)
    df[col] = cleaned_values



# Step 13: Add 'source' column
# Tag every row with the same constant source label.
df = df.assign(source='TripAdvisor')

# === Final OUTPUT ===
output_path = "../datasets/cleaned_data/cleaned_tripadvisor_ethiopian_airlines_reviews.csv"

# Write the cleaned frame as CSV without the index column.
df.to_csv(path_or_buf=output_path, index=False)

# Report completion and where the file was written.
print("✅ One-cell script completed.")
print("📄 Final cleaned and tagged file saved to:\n", output_path)


✅ One-cell script completed.
📄 Final cleaned and tagged file saved to:
 C:\Users\abro27\OneDrive\Desktop\Mak\Education\3.Data_Analytics\Final Project\Capstone Projects\Datas\TripAdvisory\cleaned_tripadvisor_ethiopian_airlines_reviews.csv
