In [None]:
# ===============================
# British Airways Reviews Pipeline (Collection + Cleaning)
# ===============================

# ---- Imports ----
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import os
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# ---- Step 1: Data Collection ----
# Four parallel lists, one entry per review. Every field is read from the
# same review container so that index i always describes one single review.
all_reviews, all_ratings, all_dates, all_countries = [], [], [], []

# Collect reviews from Skytrax (35 pages, 100 per page)
for page_no in range(1, 36):
    try:
        response = requests.get(
            f"https://www.airlinequality.com/airline-reviews/british-airways/page/{page_no}/?sortby=post_date%3ADesc&pagesize=100",
            timeout=30,  # do not let a stalled connection hang the notebook
        )
    except requests.RequestException as exc:
        print(f"Request failed on page {page_no}: {exc}")
        continue
    if not response.ok:
        # An error page would otherwise be parsed as if it contained reviews.
        print(f"Skipping page {page_no}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # NOTE(review): assumes each review is wrapped in its own
    # <article itemprop="review"> element -- confirm against the live markup.
    review_articles = soup.find_all("article", itemprop="review")
    if not review_articles:
        print(f"No reviews found on page {page_no}")

    for article in review_articles:
        # Review text is the anchor field: without it the entry is skipped.
        text_block = article.find("div", class_="text_content")
        if text_block is None:
            continue
        all_reviews.append(text_block.text)

        # Rating: the "None" placeholder is what the cleaning step filters on.
        rating_block = article.find("div", class_="rating-10")
        rating_span = rating_block.span if rating_block is not None else None
        if rating_span is not None:
            all_ratings.append(rating_span.text)
        else:
            print(f"Rating missing on page {page_no}")
            all_ratings.append("None")

        # Review date (left as text here; parsed during cleaning)
        time_tag = article.find("time")
        all_dates.append(time_tag.text if time_tag is not None else None)

        # Reviewer country: the node right after the first <span> inside the
        # review's <h3>, with surrounding spaces and parentheses stripped.
        header = article.find("h3")
        author_span = header.span if header is not None else None
        country_node = author_span.next_sibling if author_span is not None else None
        all_countries.append(
            country_node.text.strip(" ()") if country_node is not None else None
        )

# Create raw dataframe (all four lists have the same length by construction)
raw_df = pd.DataFrame({
    "review_text": all_reviews,
    "rating": all_ratings,
    "review_date": all_dates,
    "reviewer_country": all_countries
})

print("📌 Raw Data Shape:", raw_df.shape)
display(raw_df.head())


# ---- Step 2: Data Cleaning ----
clean_df = raw_df.copy()

# Mark verified reviews (must run before the banner is stripped from the text)
clean_df["is_verified"] = clean_df.review_text.str.contains("Trip Verified")

# Clean textual reviews
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Verification banner that prefixes a review body. Both variants are removed
# so that "verified"/"trip" do not leak into the cleaned tokens.
verification_banner = re.compile(r"(?:✅\s*)?(?:Trip Verified|Not Verified)\s*\|")

cleaned_texts = []
for txt in clean_df.review_text:
    txt = verification_banner.sub("", txt)
    # Keep letters only, lower-case, then tokenize on whitespace
    txt = re.sub("[^a-zA-Z]", " ", txt).lower()
    tokens = [lemmatizer.lemmatize(word) for word in txt.split() if word not in stop_words]
    cleaned_texts.append(" ".join(tokens))

clean_df["cleaned_review"] = cleaned_texts

# Convert date to datetime (unparseable values become NaT)
clean_df["review_date"] = pd.to_datetime(clean_df["review_date"], errors="coerce")

# Clean ratings: strip whitespace and turn anything non-numeric (including
# the "None" placeholder) into NaN so it can be dropped below.
clean_df["rating"] = pd.to_numeric(
    clean_df["rating"].astype(str).str.strip(), errors="coerce"
)

# Drop rows without a usable rating or country, store ratings as integers,
# and rebuild a contiguous index.
clean_df = (
    clean_df
    .dropna(subset=["rating", "reviewer_country"])
    .astype({"rating": int})
    .reset_index(drop=True)
)

print("📌 Cleaned Data Shape:", clean_df.shape)
display(clean_df.head())


# ---- Step 3: Save Clean Data ----
output_path = os.path.join(os.getcwd(), "cleaned_ba_reviews.csv")
clean_df.to_csv(output_path, index=False)
print("✅ Cleaned dataset exported to:", output_path)


In [None]:
# ===============================
# Exploratory Data Analysis (EDA)
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# `text` is the submodule (used for ENGLISH_STOP_WORDS); TfidfVectorizer is
# defined inside that submodule, not in the `feature_extraction` package.
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Load the cleaned dataset written by the cleaning step
eda_df = pd.read_csv("cleaned_ba_reviews.csv")
eda_df = eda_df.reset_index(drop=True)

print("📊 Dataset Shape:", eda_df.shape)
display(eda_df.head())


# ---- Ratings Analysis ----
mean_rating = eda_df["rating"].astype(int).mean()
print("⭐ Average Rating:", mean_rating)

# Number of reviews per star value, ordered by star value
rating_counts = eda_df["rating"].value_counts().sort_index()
plt.figure(figsize=(8,5))
sns.barplot(x=rating_counts.index, y=rating_counts.values, palette="viridis")
plt.title("Distribution of Ratings")
plt.xlabel("Rating (Stars)")
plt.ylabel("Number of Reviews")
plt.show()


# ---- Country Insights ----
unique_country_total = eda_df['reviewer_country'].nunique()
print(f"🌍 Reviews from {unique_country_total} unique countries")

# Ten countries contributing the most reviews
reviews_per_country = eda_df["reviewer_country"].value_counts()
country_review_counts = reviews_per_country.head(10)
plt.figure(figsize=(10,5))
sns.barplot(x=country_review_counts.index, y=country_review_counts.values, palette="magma")
plt.title("Top 10 Countries by Review Count")
plt.xticks(rotation=30)
plt.show()

# Average ratings per country (top 12)
mean_rating_by_country = eda_df.groupby("reviewer_country")["rating"].mean()
country_avg_ratings = mean_rating_by_country.sort_values(ascending=False).head(12)
plt.figure(figsize=(12,5))
sns.barplot(x=country_avg_ratings.index, y=country_avg_ratings.values, palette="coolwarm")
plt.title("Top 12 Countries with Highest Avg Ratings")
plt.xticks(rotation=30)
plt.show()


# ---- Time Series of Ratings ----
# Dates come back from the CSV as text; unparseable values become NaT.
eda_df["review_date"] = pd.to_datetime(eda_df["review_date"], errors="coerce")
rating_as_int = eda_df["rating"].astype(int)
fig = px.line(eda_df, x="review_date", y=rating_as_int, title="Ratings Over Time")
fig.update_xaxes(rangeslider_visible=True)
fig.show()


# ---- WordCloud ----
# A review whose cleaned text is empty is read back from the CSV as NaN;
# coerce to strings so the join below cannot fail on a float.
review_texts = eda_df["cleaned_review"].fillna("").astype(str)
all_text = " ".join(review_texts)

# NLTK English stop words plus airline-specific words that carry no signal
custom_stopwords = set(stopwords.words("english"))
custom_stopwords.update([
    "ba", "flight", "british", "airway", "airline", "plane", "london", "heathrow",
    "passenger", "aircraft", "would", "could"
])

plt.figure(figsize=(15,8))
wc = WordCloud(width=1000, height=600, stopwords=custom_stopwords, max_words=300, background_color="white").generate(all_text)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud of Customer Reviews")
plt.show()


# ---- Word Frequency ----
# Tokens of the whole corpus minus scikit-learn's English stop-word list
words_list = [w for w in all_text.split() if w not in text.ENGLISH_STOP_WORDS]
freq_dist = FreqDist(words_list).most_common(20)

freq_df = pd.DataFrame(freq_dist, columns=["Word", "Frequency"])
plt.figure(figsize=(12,6))
sns.barplot(x="Word", y="Frequency", data=freq_df, palette="crest")
plt.xticks(rotation=30)
plt.title("Top 20 Frequent Words in Reviews")
plt.show()


# ---- N-grams Function ----
from nltk import ngrams

def plot_ngrams(words, n=2, top_k=15):
    """Draw a bar chart of the most frequent n-grams in a token sequence.

    Args:
        words: Sequence of tokens, in reading order.
        n: Number of consecutive tokens per n-gram.
        top_k: How many of the most common n-grams to plot.
    """
    top_ngrams = FreqDist(ngrams(words, n)).most_common(top_k)

    # Each n-gram tuple is shown as a single underscore-joined label.
    counts_by_label = {}
    for gram, count in top_ngrams:
        counts_by_label["_".join(gram)] = count
    labels = list(counts_by_label.keys())
    counts = list(counts_by_label.values())

    plt.figure(figsize=(10,6))
    sns.barplot(x=labels, y=counts, palette="flare")
    plt.title(f"Top {top_k} {n}-grams")
    plt.xticks(rotation=45)
    plt.show()

# Example: Show top bigrams
plot_ngrams(words_list, n=2)


# ---- Sentiment Analysis ----
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Empty cleaned reviews come back from the CSV as NaN; score them as "".
sentiment_texts = eda_df["cleaned_review"].fillna("").astype(str)

# Polarity with TextBlob
eda_df["polarity"] = sentiment_texts.apply(lambda x: TextBlob(x).sentiment.polarity)

# Vader sentiment labels: score each review once, then bucket the compound
# score into 1 (> 0.2), -1 (< -0.2) or 0 (in between).
sid = SentimentIntensityAnalyzer()
compound_scores = sentiment_texts.apply(lambda x: sid.polarity_scores(x)["compound"])
eda_df["sentiment_label"] = np.select(
    [compound_scores > 0.2, compound_scores < -0.2],
    [1, -1],
    default=0,
)

print("🔍 Sentiment Label Distribution:")
print(eda_df["sentiment_label"].value_counts())


In [None]:
# ===============================
# Customer Booking Data Analysis (EDA & Cleaning)
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load dataset from the current working directory
cwd = os.getcwd()
booking_df = pd.read_csv(os.path.join(cwd, "customer_booking.csv"), encoding="ISO-8859-1")
booking_df = booking_df.reset_index(drop=True)

print("📌 Booking Dataset Shape:", booking_df.shape)
display(booking_df.head())
booking_df.info()
# describe() only renders when it is the last expression of a cell;
# display it explicitly so the summary is not silently discarded.
display(booking_df.describe())


# ---- Sales Channel Analysis ----
# Bookings per sales channel with each channel's percentage of the total
sales_counts = booking_df["sales_channel"].value_counts()
total_sales = sales_counts.sum()
for channel_name, channel_count in sales_counts.items():
    channel_pct = round(channel_count/total_sales*100,2)
    print(f"Bookings via {channel_name}: {channel_count} ({channel_pct}%)")


# ---- Trip Type Distribution ----
# Percentage of bookings per trip type
trip_counts = booking_df["trip_type"].value_counts()
total_trips = trip_counts.sum()
for trip_name, trip_count in trip_counts.items():
    trip_pct = round(trip_count/total_trips*100,2)
    print(f"Trip type '{trip_name}': {trip_pct}% of bookings")


# ---- Purchase Lead Analysis ----
purchase_lead_values = booking_df["purchase_lead"]
plt.figure(figsize=(15,5))
sns.histplot(purchase_lead_values, bins=40, kde=True, color="skyblue")
plt.xlabel("Purchase Lead (Days)")
plt.ylabel("Number of Bookings")
plt.title("Distribution of Purchase Lead (Days in Advance)")
plt.show()

# Remove extreme outliers (>600 days)
within_lead_limit = booking_df["purchase_lead"].le(600)
booking_df = booking_df[within_lead_limit]


# ---- Length of Stay Analysis ----
# Plotted after the purchase-lead filter, so those rows are already gone.
stay_values = booking_df["length_of_stay"]
plt.figure(figsize=(15,5))
sns.histplot(stay_values, bins=40, kde=True, color="salmon")
plt.xlabel("Length of Stay (Days)")
plt.ylabel("Number of Bookings")
plt.title("Distribution of Length of Stay")
plt.show()

# Remove extreme outliers (>500 days)
within_stay_limit = booking_df["length_of_stay"].le(500)
booking_df = booking_df[within_stay_limit]


# ---- Flight Day Mapping ----
# Three-letter day names mapped to 1 (Mon) .. 7 (Sun); unknown labels give NaN.
day_map = {"Mon":1, "Tue":2, "Wed":3, "Thu":4, "Fri":5, "Sat":6, "Sun":7}

# booking_df is a boolean-filtered selection at this point, so adding a column
# with item assignment can raise SettingWithCopyWarning. assign() builds a new
# frame instead of writing into the selection.
booking_df = booking_df.assign(flight_day_num=booking_df["flight_day"].map(day_map))

plt.figure(figsize=(10,4))
sns.countplot(x="flight_day_num", data=booking_df, palette="viridis")
plt.title("Booking Count by Flight Day")
plt.xlabel("Day of Week (1=Mon, 7=Sun)")
plt.ylabel("Number of Bookings")
plt.show()


# ---- Booking Origin ----
# Twenty most common booking origins across all booking requests
origin_counts = booking_df["booking_origin"].value_counts()
top_origins = origin_counts.head(20)
plt.figure(figsize=(15,5))
sns.barplot(x=top_origins.index, y=top_origins.values, palette="magma")
plt.title("Top 20 Countries by Booking Requests")
plt.xlabel("Country")
plt.ylabel("Number of Bookings")
plt.xticks(rotation=45)
plt.show()

# Successful bookings by country
completed_mask = booking_df["booking_complete"].eq(1)
completed_origins = booking_df.loc[completed_mask, "booking_origin"]
top_complete_origins = completed_origins.value_counts().head(20)
plt.figure(figsize=(15,5))
sns.barplot(x=top_complete_origins.index, y=top_complete_origins.values, palette="coolwarm")
plt.title("Top 20 Countries by Completed Bookings")
plt.xlabel("Country")
plt.ylabel("Number of Completed Bookings")
plt.xticks(rotation=45)
plt.show()


# ---- Booking Completion Rate ----
# Mean of the completion flag expressed as a percentage
completion_rate = booking_df["booking_complete"].mean() * 100
incomplete_rate = 100-completion_rate
print(f"✅ Booking Completion Rate: {round(completion_rate,2)}%")
print(f"❌ Incomplete Bookings: {round(incomplete_rate,2)}%")


# ---- Save Cleaned/Filtered Dataset ----
output_file = os.path.join(cwd, "filtered_customer_booking.csv")
booking_df.to_csv(output_file, index=False)
print("✅ Filtered dataset saved as:", output_file)


In [None]:
# ===============================
# Predictive Modeling: Customer Booking Completion
# ===============================

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({'font.size': 14})

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from yellowbrick.classifier import ConfusionMatrix

# Load cleaned customer booking dataset
cwd = os.getcwd()
booking_df = pd.read_csv(os.path.join(cwd, "filtered_customer_booking.csv"))
booking_df = booking_df.reset_index(drop=True)
display(booking_df.head())

# -------------------------------
# One-Hot Encoding for categorical variables
# -------------------------------
# The dense array is obtained with .toarray() rather than `sparse=False`,
# because that keyword is not accepted by recent scikit-learn releases.
encoder = OneHotEncoder(handle_unknown='ignore')

# Column labels are taken from encoder.categories_ instead of a hand-written
# list: the encoder emits categories in sorted order, so a fixed list can
# attach a label to the wrong indicator column (or have the wrong length).

# Sales Channel
sales_encoded = pd.DataFrame(
    encoder.fit_transform(booking_df[['sales_channel']]).toarray(),
    columns=list(encoder.categories_[0]),
    index=booking_df.index,
)
# Trip Type (the same encoder is refit, so categories_ now describes trip_type)
trip_encoded = pd.DataFrame(
    encoder.fit_transform(booking_df[['trip_type']]).toarray(),
    columns=list(encoder.categories_[0]),
    index=booking_df.index,
)

# Combine encoded columns and drop the raw categorical sources
booking_processed = booking_df.join([sales_encoded, trip_encoded])
booking_processed = booking_processed.drop(
    ['sales_channel','trip_type','booking_origin','route'], axis=1
)

# Any text column still present (for example the original `flight_day`
# strings kept next to `flight_day_num`) cannot be standardized downstream,
# so it is removed here and reported.
leftover_text_cols = list(
    booking_processed.select_dtypes(exclude=["number", "bool"]).columns
)
if leftover_text_cols:
    print("Dropping non-numeric columns:", leftover_text_cols)
    booking_processed = booking_processed.drop(columns=leftover_text_cols)

# Target label
y = booking_processed.pop('booking_complete')
X = booking_processed.copy()

# -------------------------------
# Feature Scaling
# -------------------------------
# Standardize every feature column, then re-attach the target as `label`.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)
X_scaled['label'] = y

# -------------------------------
# Correlation Matrix (optional visual check)
# -------------------------------
correlation_matrix = X_scaled.corr()
plt.figure(figsize=(10,7))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()

# -------------------------------
# Train-Test Split
# -------------------------------
# 80/20 split of the scaled features against the label, as NumPy arrays
feature_matrix = X_scaled.drop('label', axis=1).to_numpy()
label_vector = X_scaled['label'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    label_vector,
    test_size=0.2,
    random_state=42
)

# -------------------------------
# Helper functions for modeling
# -------------------------------
def fit_predict(model, X_tr, y_tr, X_val):
    """Fit `model` on the training data and return its predictions for `X_val`.

    Args:
        model: Object exposing `fit(X, y)` and `predict(X)`.
        X_tr: Training features passed to `fit`.
        y_tr: Training labels passed to `fit`.
        X_val: Features passed to `predict`.

    Returns:
        Whatever `model.predict(X_val)` returns.
    """
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_val)
    return predictions

def evaluate_model(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1", each
        rounded to two decimal places.
    """
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {
        metric_name: round(scorer(y_true, y_pred), 2)
        for metric_name, scorer in scorers
    }

# -------------------------------
# Random Forest Classifier
# -------------------------------
rf_clf = RandomForestClassifier(n_estimators=50, max_depth=50, min_samples_split=5, random_state=0)

# Training Evaluation (this call is the one that fits the forest)
y_train_pred = fit_predict(rf_clf, X_train, y_train, X_train)
train_metrics = evaluate_model(y_train, y_train_pred)
print("✅ Training Metrics:", train_metrics)

# Confusion Matrix - Training
cm_train = ConfusionMatrix(rf_clf, classes=[0,1])
cm_train.fit(X_train, y_train)
cm_train.score(X_train, y_train)
# Render this matrix now so the next visualizer does not draw onto the
# same current axes.
cm_train.show()

# Testing Evaluation: reuse the forest fitted above instead of training an
# identical model a second time on the same data and seed.
y_test_pred = rf_clf.predict(X_test)
test_metrics = evaluate_model(y_test, y_test_pred)
print("📊 Testing Metrics:", test_metrics)

# Confusion Matrix - Testing
cm_test = ConfusionMatrix(rf_clf, classes=[0,1])
cm_test.fit(X_train, y_train)
cm_test.score(X_test, y_test)
cm_test.show()

# Feature Importance Plot (least important at the bottom of the sort order)
plt.figure(figsize=(10,8))
feature_importances = rf_clf.feature_importances_
sorted_idx = feature_importances.argsort()
plt.barh(X_scaled.drop('label', axis=1).columns[sorted_idx], feature_importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
plt.title("Feature Importance")
plt.show()

# -------------------------------
# Handling Imbalanced Dataset
# -------------------------------
print("Original label distribution:\n", X_scaled['label'].value_counts())

# Downsample majority class (label=0) to balance.
# The target size is capped at the number of available rows: sampling more
# rows than exist (without replacement) raises a ValueError.
MAJORITY_SAMPLE_SIZE = 8000
majority_pool = X_scaled[X_scaled['label']==0]
majority_df = majority_pool.sample(
    n=min(MAJORITY_SAMPLE_SIZE, len(majority_pool)), random_state=42
)
minority_df = X_scaled[X_scaled['label']==1]
# Concatenate both classes and shuffle the rows
balanced_df = pd.concat([majority_df, minority_df], ignore_index=True).sample(frac=1, random_state=42)

X_bal = balanced_df.drop('label', axis=1)
y_bal = balanced_df['label']

# Split balanced data
# NOTE(review): the test split is taken from the downsampled data, so these
# metrics describe a balanced class mix, not the original distribution.
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(
    X_bal.to_numpy(), y_bal.to_numpy(), test_size=0.2, random_state=42
)

# Train on balanced data
rf_bal = RandomForestClassifier(n_estimators=50, max_depth=50, min_samples_split=5, random_state=0)
y_test_pred_bal = fit_predict(rf_bal, X_train_bal, y_train_bal, X_test_bal)

# Evaluate balanced model
balanced_metrics = evaluate_model(y_test_bal, y_test_pred_bal)
print("✅ Balanced Dataset Testing Metrics:", balanced_metrics)

# Confusion Matrix - Balanced Data
cm_bal = ConfusionMatrix(rf_bal, classes=[0,1])
cm_bal.fit(X_train_bal, y_train_bal)
cm_bal.score(X_test_bal, y_test_bal)
# Render the matrix on its own figure before the next plot is created
cm_bal.show()

# Feature Importance - Balanced Data
plt.figure(figsize=(10,8))
feat_imp_bal = rf_bal.feature_importances_
sorted_idx_bal = feat_imp_bal.argsort()
plt.barh(X_bal.columns[sorted_idx_bal], feat_imp_bal[sorted_idx_bal])
plt.xlabel("Random Forest Feature Importance")
plt.title("Balanced Data Feature Importance")
plt.show()
