In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import altair as alt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
import random

# Download NLTK resources if not already downloaded
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')
alt.data_transformers.enable("vegafusion")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\joeyj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joeyj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\joeyj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\joeyj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


DataTransformerRegistry.enable('vegafusion')

In [3]:
filepath = "Airline_Reviews.csv"

df_data = pd.read_csv(filepath)
df_data = df_data.drop(columns = ['Route', 'Inflight Entertainment', 'Aircraft', 'Wifi & Connectivity', 'Recommended', 'Review Date',])

# Convert all values in the collumn 'Overall_Rating' to numeric vlaues
# Any non-numeric numbers changed to NaN
df_data['Overall_Rating'] = pd.to_numeric(df_data['Overall_Rating'], errors='coerce')

# Calculate the mean
means_rating = df_data.groupby("Airline Name")['Overall_Rating'].mean().astype(int)
df_data = df_data.dropna(subset=['Overall_Rating']) # Drop all NaN Values

df_data.tail()

Unnamed: 0.1,Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Verified,Review,Type Of Traveller,Seat Type,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Value For Money
23166,23166,ZIPAIR,1.0,"""customer service is terrible""",False,Bangkok to Tokyo. I’ve flown many low cost ai...,Couple Leisure,Economy Class,June 2022,2.0,1.0,,1.0,1.0
23167,23167,ZIPAIR,1.0,"""Avoid at all costs""",True,Avoid at all costs. I booked flights to go f...,Solo Leisure,Economy Class,June 2022,,,,,1.0
23168,23168,ZIPAIR,3.0,"""Will not recommend to anyone""",True,Flight was leaving at 23.15 and after an hou...,Business,Economy Class,May 2022,2.0,4.0,2.0,1.0,2.0
23169,23169,ZIPAIR,6.0,"""It was immaculately clean""",True,Zipair is JAL’s budget airline. They don’t ha...,Business,Business Class,May 2022,3.0,4.0,3.0,1.0,5.0
23170,23170,ZIPAIR,1.0,"""lost all of our money with no refund""",True,They lied about connection in Narita and we ...,Solo Leisure,Economy Class,May 2022,,,,1.0,1.0


In [4]:
df_data['Review']

0          Moroni to Moheli. Turned out to be a pretty ...
1         Moroni to Anjouan. It is a very small airline...
2          Anjouan to Dzaoudzi. A very small airline an...
3          Please do a favor yourself and do not fly wi...
4         Do not book a flight with this airline! My fr...
                               ...                        
23166     Bangkok to Tokyo. I’ve flown many low cost ai...
23167      Avoid at all costs. I booked flights to go f...
23168      Flight was leaving at 23.15 and after an hou...
23169     Zipair is JAL’s budget airline. They don’t ha...
23170      They lied about connection in Narita and we ...
Name: Review, Length: 22329, dtype: object

In [5]:
# Function to remove commas and parentheses from text
def remove_punctuation(text):
    if isinstance(text, str):  # Ensure text is a string
        return re.sub(r'[,()\'?\.]', '', text)
    return text

# Remove numeric characters
df_data['Review'] = df_data['Review'].str.replace(r'\d+', '', regex=True)

# Change all text to lowercase
df_data['Review'] = df_data['Review'].str.lower()

# Split the text in the 'Review' column into individual words
df_data['Review'] = df_data['Review'].str.split()

# Define the stop words from the nltk library
stops = set(stopwords.words('english'))

# Define a function to remove the stop words from the text
def remove_stopwords(words):
    if isinstance(words, list): 
        return [word for word in words if word not in stops]
    return words

# Apply the remove_stopwords function to the 'Review' column
df_data['Review'] = df_data['Review'].apply(remove_stopwords)

# Apply the remove_punctuation function to each word in the 'Review' column
df_data['Review'] = df_data['Review'].apply(lambda words: [remove_punctuation(word) for word in words] if isinstance(words, list) else words)

# Join the words back into sentences
df_data['Review'] = df_data['Review'].apply(lambda words: ' '.join(words) if isinstance(words, list) else words)

# Print the modified reviews
reviews = df_data['Review']

df_data['Review'] = pd.Series(df_data['Review'], dtype="string")

In [6]:
def analyze_lyrics(review):
    # Tokenize the review
    tokens = word_tokenize(review)  # Tokenize the single string
    tokens = remove_stopwords(tokens)  # Remove stopwords
    
    freq_dist = FreqDist(tokens)  # Frequency distribution of tokens
    unique_words = set(tokens)  # Unique words
    ttr = len(unique_words) / len(tokens) if len(tokens) > 0 else 0  # Type-token ratio
    
    sentiment_analyzer = SentimentIntensityAnalyzer()
    sentiment_score = sentiment_analyzer.polarity_scores(' '.join(tokens))  # Sentiment score
    
    return freq_dist, ttr, sentiment_score, tokens

In [7]:
reviews_analysis = reviews.apply(analyze_lyrics)

In [8]:
# Iterate through each row of review_analysis
for idx, row in enumerate(reviews_analysis):
    if idx >= len(df_data): # Breaks if contines after the length of reviews_analysis
        break
    
    # Iterate over the second element of each row and pull the pos, neu, and neg values
    if len(row) > 2 and isinstance(row[2], dict): 
        sentiment_dict = row[2]
        
        # Extract sentiment values
        neg = sentiment_dict.get('neg', None)
        neu = sentiment_dict.get('neu', None)
        pos = sentiment_dict.get('pos', None)
        compound = sentiment_dict.get('compound', None)

        # Assign values to the DataFrame
        df_data.at[idx, 'Sentiment_Neg'] = neg
        df_data.at[idx, 'Sentiment_Neu'] = neu
        df_data.at[idx, 'Sentiment_Pos'] = pos
        df_data.at[idx, 'Sentiment_Compound'] = compound

In [10]:
df_data.to_csv('sentiment.csv')

In [40]:
# Get top 50 airlines by count
airline_counts = df_data['Airline Name'].value_counts().nlargest(10)  

# Filter the data to include only the rows with the top 50 airlines
top_airlines = df_data[df_data['Airline Name'].isin(airline_counts.index)]

# Convert 'Date Flown' to datetime for proper sorting and analysis
df_data['Date Flown'] = pd.to_datetime(df_data['Date Flown'])

# Calculate sentiment changes for each airline
top_airlines_sorted = top_airlines.sort_values(by=['Airline Name', 'Date Flown'])
top_airlines_sorted['Sentiment_Change'] = top_airlines_sorted.groupby('Airline Name')['Sentiment_Compound'].diff()

# Altair Selection for Highlighting
highlight = alt.selection_single(
    fields=['Airline Name'], 
    on='click'
)

# Line Chart: Sentiment Over Time for Drastic Changes
line_chart = alt.Chart(top_airlines).mark_line().encode(
    x=alt.X('Date Flown:T', scale=alt.Scale(domain=[df_data['Date Flown'].min(), df_data['Date Flown'].max()])),
    y='Sentiment_Compound:Q',
    color=alt.condition(
        highlight, 'Airline Name:N', alt.value('lightgray')
    ),
    tooltip=['Airline Name', 'Date Flown', 'Sentiment_Compound', 'Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos']
).add_selection(
    highlight
).interactive().properties(
    title='Sentiment Compound Values Over Time for Airlines with Drastic Changes',
    width=800,
    height=400
)

date_selection = alt.selection_interval()

# Scatter Plot: Sentiment Compound Scores for Airlines
scatter = alt.Chart(top_airlines_sorted).mark_circle(size=60).encode(
    x=alt.X('Date Flown:T', scale=alt.Scale(domain=[df_data['Date Flown'].min(), df_data['Date Flown'].max()])),
    y='Sentiment_Compound:Q',
    color=alt.condition(
        highlight, 'Airline Name:N', alt.value('lightgray')
    ),
    tooltip=['Airline Name', 'Sentiment_Compound', 'Date Flown', 'Type Of Traveller', 'Overall_Rating']
).add_selection(
    highlight
).add_selection(
    date_selection
).properties(
    title='Scatter Plot of Compound Sentiment Scores for Airlines',
    width=800,
    height=400
)

filtered_plot = scatter.transform_filter(
    date_selection
).properties(
    title = ""
)

# Display both charts
line_chart.display()

combined_chart = alt.vconcat(scatter, filtered_plot)
combined_chart.display()




  highlight = alt.selection_single(
  ).add_selection(
  ).add_selection(
  ).add_selection(
