In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import seaborn as sns

In [None]:
# Scrape configuration: the paginated review listing to read, how many pages
# to request, and how many reviews to ask for per page.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100

# Collects the raw text of every review found across all pages.
reviews = []

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page. The timeout keeps a stalled connection
    # from hanging the notebook, and raise_for_status() stops the run on an
    # HTTP error instead of silently parsing an error page into zero reviews.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse content: every review body sits in a <div class="text_content">
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())

    print(f"   ---> {len(reviews)} total reviews")

In [None]:
# Put the scraped review texts into a one-column DataFrame and preview it.
df = pd.DataFrame().assign(reviews=reviews)
df.head()

In [None]:
# Persist the reviews to disk. The row index is written as well (pandas'
# default), so it comes back as an extra unnamed column when the file is re-read.
df.to_csv("BA_reviews.csv")

In [None]:
# Reload the reviews from the CSV file, replacing the in-memory frame.
df = pd.read_csv('BA_reviews.csv')

In [None]:
# Show the column labels of the reloaded frame.
df.columns

In [None]:
# Give the unnamed column read back from the CSV a proper name.
df = df.rename(columns={'Unnamed: 0': 'ID'})

In [None]:
# Display the frame after the column rename.
df

In [None]:
# Remove the "Trip Verified" badge text from every review (plain substring match).
df['reviews'] = df['reviews'].str.replace(pat='Trip Verified', repl='', regex=False)


In [None]:
# Remove the "Not Verified" badge text from every review (plain substring match).
df['reviews'] = df['reviews'].str.replace(pat='Not Verified', repl='', regex=False)


In [None]:
# Remove the check-mark emoji from every review (plain substring match).
df['reviews'] = df['reviews'].str.replace(pat='✅', repl='', regex=False)


In [None]:
# Remove the "|" separator from every review. "|" is the regex alternation
# operator, so request a literal match explicitly rather than relying on the
# pandas version's default for the `regex` argument.
df['reviews'] = df['reviews'].str.replace('|', '', regex=False)


In [None]:
# Set 'ID' to consecutive 1-based row numbers.
df['ID'] = range(1, len(df) + 1)

In [None]:
from textblob import TextBlob

# Function to get sentiment polarity
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    Args:
        text: Review text. Non-string values (for example the NaN that pandas
            produces for an empty CSV cell) are treated as carrying no
            sentiment.

    Returns:
        float: ``TextBlob(text).sentiment.polarity`` for a string, ``0.0``
        for anything else.
    """
    # TextBlob rejects non-string input with a TypeError; fall back to a
    # neutral score, matching how the VADER scoring loop treats such rows.
    if not isinstance(text, str):
        return 0.0
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score each review with get_sentiment and store the result in 'sentiment'
df['sentiment'] = df['reviews'].map(get_sentiment)

In [None]:
# Display the frame, now including the 'sentiment' column.
df

In [None]:
# Pick one review at random for a closer look. The bound is taken from the
# frame's length so that every row (first and last included) can be drawn and
# the lookup cannot run past the end when fewer reviews were scraped.
index = np.random.randint(0, len(df))
check = df['reviews'].iloc[index]
print(check)

print("\n The index is {}".format(index))

In [None]:
# Split the sampled review into word tokens and display them.
token = nltk.word_tokenize(check)
token

In [None]:
# Show only the first ten tokens.
token[:10]

In [None]:
# Attach a part-of-speech tag to each token.
tagged = nltk.pos_tag(token)

In [None]:
# Display the tagged tokens.
tagged

In [None]:
# Run NLTK's named-entity chunker over the tagged tokens and display the result.
nltk.chunk.ne_chunk(tagged)

In [None]:
# Keep the chunker output and pretty-print it.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

In [None]:
# Create the NLTK sentiment analyzer reused by the scoring cells below.
sia = SentimentIntensityAnalyzer()
sia

In [None]:
# Try the analyzer on a short, clearly positive sentence.
sia.polarity_scores('I am so happy')

In [None]:
# Score one randomly chosen review with the analyzer. The bound is taken from
# the frame's length so that any row can be drawn and the lookup stays in
# range regardless of how many reviews were scraped.
index = np.random.randint(0, len(df))
checkk = df['reviews'].iloc[index]
print(checkk)
sia.polarity_scores(checkk)

In [None]:
# Score every review with the analyzer, keyed by the review's ID.
res = {
    review_id: (
        sia.polarity_scores(review_text)
        if isinstance(review_text, str)
        # Non-string entries receive an all-zero score instead of being analysed.
        else {'compound': 0, 'pos': 0, 'neu': 0, 'neg': 0}
    )
    for review_id, review_text in zip(df['ID'], df['reviews'])
}

In [None]:
# Turn the {ID: scores} mapping into a frame with one row per ID
# (the constructor puts IDs in the columns, hence the transpose).
vaders = pd.DataFrame(res).T

In [None]:
# Keep only the first occurrence of any repeated column label, then display.
vaders = vaders.loc[:, ~vaders.columns.duplicated()]
vaders

In [None]:
# The score frame carries the review ID only in its index, so it shares no
# column with df and a key-less merge would raise MergeError. Expose the index
# as an 'ID' column first (skipped when the column already exists, so the
# cell can be re-run).
if 'ID' not in vaders.columns:
    vaders = vaders.rename_axis('ID').reset_index()

# Attach the review text and TextBlob sentiment to each row of scores.
vaders = vaders.merge(df, how='left')

# Print the merged DataFrame
print(vaders)

In [None]:
# Display the merged scores-and-reviews frame.
vaders

In [None]:
import matplotlib.pyplot as plt

# Count the number of positive, neutral, and negative sentiments.
# 'sentiment' is a continuous polarity score, so counting its raw values would
# give roughly one bar per review; bucket each score by its sign first.
sentiment_labels = np.sign(vaders['sentiment']).map(
    {-1.0: 'Negative', 0.0: 'Neutral', 1.0: 'Positive'}
)
# Fixed category order, with empty categories shown as zero.
sentiment_counts = (
    sentiment_labels.value_counts()
    .reindex(['Negative', 'Neutral', 'Positive'], fill_value=0)
)

# Plot the sentiment distribution
plt.figure(figsize=(8, 6))
plt.bar(sentiment_counts.index, sentiment_counts.values)
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.title('Sentiment Distribution')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Histogram of the compound scores, drawn through the Axes interface
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(vaders['compound'], bins=10, edgecolor='black')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
ax.set_title('Sentiment Score Distribution')
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join every non-missing review into one string and build a word cloud from it
text = ' '.join(vaders['reviews'].dropna())
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Frequent Words in Reviews')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Scatter of each review's positive score against its negative score
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(vaders['pos'], vaders['neg'], alpha=0.5)
ax.set_xlabel('Positive Sentiment Score')
ax.set_ylabel('Negative Sentiment Score')
ax.set_title('Positive vs Negative Sentiment Scores')
plt.show()



In [None]:
import matplotlib.pyplot as plt

# Box plot summarising the spread of the compound scores
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(vaders['compound'])
ax.set_ylabel('Sentiment Score')
ax.set_title('Distribution of Sentiment Scores')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Create a distribution plot to visualize the sentiment scores.
# `fill=True` shades the area under the curve; it replaces the `shade`
# argument, which seaborn has deprecated.
plt.figure(figsize=(8, 6))
sns.kdeplot(vaders['compound'], fill=True)
plt.xlabel('Sentiment Score')
plt.ylabel('Density')
plt.title('Distribution of Sentiment Scores')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the four analyzer score columns
score_columns = ['neg', 'neu', 'pos', 'compound']
correlation = vaders[score_columns].corr()

# Annotated heatmap of that correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Sentiment Scores')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Scatter of the TextBlob 'sentiment' score against the analyzer's compound score
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(vaders['sentiment'], vaders['compound'], alpha=0.5)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Compound Score')
ax.set_title('Sentiment vs. Compound Score')
plt.show()
