# **BRITISH AIRWAYS REVIEWS ANALYSIS**
##### **by Lucila Aldana Quiñonez | Marketing Data Analyst**
# **Keywords Analysis**

In [5]:
import pandas as pd
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

In [6]:
# Load the sentiment dataset
sentiment_df = pd.read_csv(
    r"C:\Users\lucil\Documents\Education\Data Analysis\Tableau\Projects Tableau\British Airways Customer Reviews Analysis - by Lucila Aldana Quiñonez _ Marketing Data Analyst\reviews_with_sentiment.csv"
)

# Ensure date column is of datetime type
sentiment_df['date'] = pd.to_datetime(
    sentiment_df['date'],
    errors='coerce',
    dayfirst=True
)

# Overview the data
sentiment_df.info()
sentiment_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324 entries, 0 to 1323
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   full_review      1324 non-null   object        
 1   recommended      1324 non-null   object        
 2   date             484 non-null    datetime64[ns]
 3   sentiment_score  1324 non-null   float64       
 4   sentiment        1324 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 51.8+ KB


Unnamed: 0,full_review,recommended,date,sentiment_score,sentiment
0,service was mediocre at best. Just returned f...,no,2023-03-10,0.8933,Positive
1,BA standards continue to decline. BA standar...,no,2023-02-10,-0.6428,Negative
2,"won the race to the bottom"" . Awful. Busines...",no,2023-02-10,-0.4997,Negative
3,Not a reliable airline. Not a reliable airli...,no,2023-02-10,-0.6281,Negative
4,Very disappointed. The airplanes and the loun...,no,NaT,-0.9319,Negative


In [7]:
# Keep only non-recommending reviews
df_no_rec = sentiment_df[sentiment_df["recommended"] == "no"]

neg_no = df_no_rec[df_no_rec["sentiment"] == "Negative"]
neu_no = df_no_rec[df_no_rec["sentiment"] == "Neutral"]
pos_no = df_no_rec[df_no_rec["sentiment"] == "Positive"]

In [8]:
# Clean the text
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

# Add airline-specific stopwords
airline_noise = {
    "ba", "british", "airways", "flight", "flights",
    "airline", "plane", "airport", "london", "heathrow"
}

stop_words = stop_words.union(airline_noise)

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = [word for word in text.split() if word not in stop_words and len(word) > 2]
    return " ".join(tokens)

neg_no["clean_review"] = neg_no["full_review"].apply(clean_text)
neu_no["clean_review"] = neu_no["full_review"].apply(clean_text)
pos_no["clean_review"] = pos_no["full_review"].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_no["clean_review"] = neg_no["full_review"].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neu_no["clean_review"] = neu_no["full_review"].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

In [9]:
# Extract top keywords per segment

def top_keywords(series, n=20):
    words = " ".join(series).split()
    return Counter(words).most_common(n)

top_neg = top_keywords(neg_no["clean_review"])
top_neu = top_keywords(neu_no["clean_review"])
top_pos = top_keywords(pos_no["clean_review"])

top_neg, top_neu, top_pos

([('service', 416),
  ('class', 366),
  ('seat', 359),
  ('food', 316),
  ('business', 311),
  ('seats', 297),
  ('cabin', 279),
  ('crew', 274),
  ('staff', 240),
  ('one', 223),
  ('time', 215),
  ('would', 202),
  ('economy', 200),
  ('first', 200),
  ('passengers', 184),
  ('poor', 179),
  ('hours', 173),
  ('get', 162),
  ('even', 157),
  ('club', 143)],
 [('service', 20),
  ('seats', 11),
  ('served', 9),
  ('crew', 9),
  ('seat', 9),
  ('cabin', 9),
  ('food', 9),
  ('business', 8),
  ('class', 8),
  ('meal', 8),
  ('time', 8),
  ('would', 8),
  ('good', 8),
  ('could', 7),
  ('return', 7),
  ('staff', 6),
  ('get', 6),
  ('one', 6),
  ('drinks', 6),
  ('flying', 6)],
 [('service', 270),
  ('seat', 267),
  ('class', 240),
  ('crew', 226),
  ('food', 225),
  ('business', 204),
  ('seats', 193),
  ('time', 183),
  ('cabin', 183),
  ('one', 180),
  ('economy', 165),
  ('would', 153),
  ('get', 152),
  ('staff', 143),
  ('good', 139),
  ('first', 119),
  ('meal', 113),
  ('lounge', 

In [10]:
# Convert keyword lists to DataFrames
df_neg = pd.DataFrame(top_neg, columns=["keyword", "count"])
df_neg["sentiment"] = "Negative"

df_neu = pd.DataFrame(top_neu, columns=["keyword", "count"])
df_neu["sentiment"] = "Neutral"

df_pos = pd.DataFrame(top_pos, columns=["keyword", "count"])
df_pos["sentiment"] = "Positive"

# Combine all sentiments
keywords_df = pd.concat([df_neg, df_neu, df_pos], ignore_index=True)

# Segment label for future reuse
keywords_df["segment"] = "Not Recommended"

# Order sentiment properly
sentiment_order = ["Negative", "Neutral", "Positive"]
keywords_df["sentiment"] = pd.Categorical(
    keywords_df["sentiment"],
    categories=sentiment_order,
    ordered=True
)

# Sort by sentiment, then count
keywords_df = keywords_df.sort_values(
    by=["sentiment", "count"],
    ascending=[True, False]
)

# Export to CSV
keywords_df.to_csv("non_recomm_keywords.csv", index=False)