In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import ssl
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
# Read the review export into a DataFrame and preview its first five rows
# (five is the default row count for head()).
df = pd.read_csv(filepath_or_buffer='reviewslabelling.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,reviews,date,place,Unnamed: 5,Unnamed: 6
0,0,"""no boarding drinks provided""",First time flying with British Airways busines...,25th June 2023,United Kingdom,,
1,1,"""WiFi didn't work""",Not You can buy sandwiches and crisps but don'...,24th June 2023,United Kingdom,,
2,2,"""stick with economy""",This is a two-for-one review covering economy ...,24th June 2023,United Kingdom,,
3,3,"""Communication is terrible""",Absolutely horrible airline. Communication is ...,23rd June 2023,United States,,
4,4,"""delays and cancellations""",Having experienced delays and cancellations de...,22nd June 2023,United States,,


In [9]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so only appends a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2930 non-null   int64  
 1   title       2930 non-null   object 
 2   reviews     2930 non-null   object 
 3   date        2930 non-null   object 
 4   place       2930 non-null   object 
 5   Unnamed: 5  0 non-null      float64
 6   Unnamed: 6  0 non-null      float64
dtypes: float64(2), int64(1), object(4)
memory usage: 160.4+ KB
None
        Unnamed: 0  Unnamed: 5  Unnamed: 6
count  2930.000000         0.0         0.0
mean      4.500000         NaN         NaN
std       2.872772         NaN         NaN
min       0.000000         NaN         NaN
25%       2.000000         NaN         NaN
50%       4.500000         NaN         NaN
75%       7.000000         NaN         NaN
max       9.000000         NaN         NaN
Unnamed: 0       0
title            0
reviews          0
date        

In [11]:
# NOTE(security): swapping in an unverified HTTPS context disables TLS
# certificate checks. It is kept here only as a workaround for environments
# whose Python lacks usable root certificates, and it is scoped to the NLTK
# downloads: the original context factory is restored afterwards so that no
# other HTTPS request made from this kernel runs unverified.
_verified_https_context = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
try:
    # Build the full list first so every resource is attempted even if an
    # earlier download reports failure (nltk.download returns a bool).
    nltk_download_ok = all(
        [nltk.download(resource) for resource in ('punkt', 'stopwords', 'wordnet')]
    )
finally:
    ssl._create_default_https_context = _verified_https_context

# True only when every resource is available.
nltk_download_ok

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nolimitmide/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nolimitmide/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nolimitmide/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# Text preprocessing with lemmatization
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text_lemmatization(text):
    """Tokenize, lowercase, drop stop words and lemmatize a piece of text.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example a missing
        value coming from pandas) is treated as empty text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # The lemmatizer and the stop-word set do not depend on the input, so they
    # are built once and reused instead of being rebuilt for every row.
    lemmatizer, stop_words = _get_preprocess_resources()

    kept = []
    for token in word_tokenize(text):
        # Purely alphabetic tokens only: punctuation, numbers and contraction
        # fragments such as "n't" are discarded.
        if not token.isalpha():
            continue
        word = token.lower()
        # Filter stop words BEFORE lemmatizing: the default (noun) lemmatizer
        # strips a trailing "s", turning e.g. "was" into "wa" and "has" into
        # "ha", which would then no longer match the stop-word list.
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Second check keeps the original behaviour of dropping tokens whose
        # lemma is itself a stop word.
        if lemma not in stop_words:
            kept.append(lemma)

    return ' '.join(kept)


# Clean every review title and keep the result in its own column.
df['preprocessed_title'] = df['title'].map(preprocess_text_lemmatization)

# Preview the raw titles next to their lemmatized, stop-word-free versions
print(df.loc[:, ['title', 'preprocessed_title']].head())


                           title       preprocessed_title
0  "no boarding drinks provided"  boarding drink provided
1             "WiFi didn't work"                wifi work
2           "stick with economy"            stick economy
3    "Communication is terrible"   communication terrible
4     "delays and cancellations"       delay cancellation


In [19]:
from textblob import TextBlob

In [20]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [26]:
def get_sentiment_polarity_textblob(text):
    """Return the TextBlob sentiment polarity score for ``text``."""
    return TextBlob(text).sentiment.polarity

_VADER_ANALYZER_CACHE = []


def get_sentiment_polarity_vader(text):
    """Return the VADER ``compound`` sentiment score for ``text``.

    The analyzer is created on the first call and reused afterwards:
    constructing a ``SentimentIntensityAnalyzer`` loads its lexicon, which is
    wasteful to repeat for every row when this function is used with
    ``Series.apply``.
    """
    if not _VADER_ANALYZER_CACHE:
        _VADER_ANALYZER_CACHE.append(SentimentIntensityAnalyzer())
    return _VADER_ANALYZER_CACHE[0].polarity_scores(text)['compound']

def categorize_sentiment(polarity, threshold=0.0):
    """Map a numeric polarity score to a sentiment label.

    Parameters
    ----------
    polarity : float
        Sentiment score; larger means more positive.
    threshold : float, optional
        Half-width of the neutral band around zero. Scores strictly above
        ``threshold`` are 'Positive', scores strictly below ``-threshold``
        are 'Negative', everything else (including NaN) is 'Neutral'.
        The default of 0.0 classifies purely by sign.

    Returns
    -------
    str
        One of 'Positive', 'Negative' or 'Neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative, which would make the positive and
        negative bands overlap.
    """
    if threshold < 0:
        raise ValueError('threshold must be non-negative')
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'


# Score every cleaned title with both analysers, one column per analyser.
df['sentiment_polarity_textblob'] = df['preprocessed_title'].map(get_sentiment_polarity_textblob)
df['sentiment_polarity_vader'] = df['preprocessed_title'].map(get_sentiment_polarity_vader)

def voting_ensemble(textblob_score, vader_score):
    """Combine the TextBlob and VADER scores into a single sentiment label.

    Each score casts one vote via ``categorize_sentiment``. When the two
    analysers agree, or when one of them is neutral, the result is the label
    chosen by plain vote counting (ties resolved in the order Positive,
    Negative, Neutral). When they directly contradict each other (one
    'Positive', one 'Negative'), the label is decided by the mean of the two
    scores instead of defaulting to 'Positive' purely because of dictionary
    ordering.

    Parameters
    ----------
    textblob_score : float
        Polarity score produced by TextBlob.
    vader_score : float
        Compound score produced by VADER.

    Returns
    -------
    str
        One of 'Positive', 'Negative' or 'Neutral'.
    """
    scores = (textblob_score, vader_score)
    sentiment_votes = {'Positive': 0, 'Negative': 0, 'Neutral': 0}
    for score in scores:
        sentiment_votes[categorize_sentiment(score)] += 1

    top_count = max(sentiment_votes.values())
    positive_negative_tie = (
        sentiment_votes['Positive'] == top_count
        and sentiment_votes['Negative'] == top_count
    )
    if positive_negative_tie:
        # Both polarities claim the win: let the stronger opinion decide.
        # Equal-magnitude opposite scores average to zero and yield 'Neutral'.
        # NOTE(review): this treats the two scores as being on a comparable
        # scale -- confirm that is acceptable for this analysis.
        return categorize_sentiment(sum(scores) / len(scores))

    # max() returns the first key holding the highest count, so the remaining
    # ties (a polar label versus 'Neutral') favour the polar label.
    return max(sentiment_votes, key=sentiment_votes.get)


# Feed each row's (TextBlob, VADER) score pair to the voting ensemble; the two
# columns are selected in the same order as voting_ensemble's parameters.
df['ensemble_sentiment'] = df[
    ['sentiment_polarity_textblob', 'sentiment_polarity_vader']
].apply(lambda score_pair: voting_ensemble(*score_pair), axis=1)

# Preview the cleaned titles next to their ensemble label.
print(df.loc[:, ['preprocessed_title', 'ensemble_sentiment']].head())


        preprocessed_title ensemble_sentiment
0  boarding drink provided            Neutral
1                wifi work            Neutral
2            stick economy            Neutral
3   communication terrible           Negative
4       delay cancellation           Negative
