In [16]:
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split

In [2]:
# Location of the raw airline-review CSV; change this to wherever your copy lives.
dataset_path = "all_reviews_fulldataset.csv"

# Read the full dataset and preview its first five rows.
data_df = pd.read_csv(filepath_or_buffer=dataset_path)
data_df.head(n=5)

Unnamed: 0,Airline,Review ID,Date Published,Overall Rating,Passenger Country,Trip Verified,Review Title,Review,Aircraft,Type Of Traveller,...,Layover,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Inflight Entertainment
0,scoot,891669,26/3/2024,3.0,Australia,Not Verified,"""not recommend flying Scoot""",My flight to Singapore was uneventful as usual...,Boeing 787,Solo Leisure,...,,Feb-24,1.0,3.0,1.0,3.0,2.0,3.0,no,
1,scoot,891535,24/3/2024,1.0,Malaysia,Trip Verified,"""whole cabin is like sauna""",Using super old plane. Aircon was blowing warm...,,Solo Leisure,...,,Mar-24,1.0,3.0,2.0,3.0,,2.0,no,1.0
2,scoot,891527,24/3/2024,9.0,Singapore,Trip Verified,"""professional, helpful and friendly""","I travelled with my sister, my elderly parent ...",,Family Leisure,...,,Mar-24,4.0,5.0,4.0,5.0,,5.0,yes,
3,scoot,891411,22/3/2024,7.0,United States,Trip Verified,"""seat pitch is generous and comfortable""",Was assigned last two row at seat 39F with the...,A321 NEO,Solo Leisure,...,,Mar-24,5.0,3.0,,2.0,2.0,5.0,yes,
4,scoot,891340,21/3/2024,4.0,Australia,Trip Verified,"""Very rude male flight attendant""",Very rude male flight attendant. Accessed the ...,A321,Family Leisure,...,,Mar-24,3.0,1.0,,4.0,,4.0,no,


In [3]:
# Keep only the review text and the recommendation flag; copy so that later
# column additions do not touch data_df.
filtered_data_df = data_df.loc[:, ['Review', 'Recommended']].copy()
filtered_data_df

Unnamed: 0,Review,Recommended
0,My flight to Singapore was uneventful as usual...,no
1,Using super old plane. Aircon was blowing warm...,no
2,"I travelled with my sister, my elderly parent ...",yes
3,Was assigned last two row at seat 39F with the...,yes
4,Very rude male flight attendant. Accessed the ...,no
...,...,...
4112,SYD-OOL. Arrived at the airport on time. Fligh...,no
4113,Cairns-Sydney-Phuket in Business class. Was no...,no
4114,Had the misfortune of flying Business class fr...,no
4115,I have just flown from Melbourne to Sydney and...,no


In [4]:
def clean_text(text, replace_char=''):
    """
    Replace every non-ASCII character in a single text entry.

    Parameters
    ----------
    text : str, None or float NaN
        The text to clean. ``None`` and float ``NaN`` are treated as empty
        text; any other non-string value is converted with ``str()`` first.
    replace_char : str, optional
        Substituted for each non-ASCII character. The default ``''`` simply
        drops those characters.

    Returns
    -------
    str
        ``text`` with each non-ASCII character replaced by ``replace_char``.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    return ''.join(char if char.isascii() else replace_char for char in text)

In [5]:
# Run clean_text over every review and store the result in a new column.
filtered_data_df['Cleaned_Review'] = filtered_data_df['Review'].map(clean_text)

In [7]:
# Show the first 100 rows to compare raw and cleaned text side by side.
filtered_data_df.head(n=100)

Unnamed: 0,Review,Recommended,Cleaned_Review
0,My flight to Singapore was uneventful as usual...,no,My flight to Singapore was uneventful as usual...
1,Using super old plane. Aircon was blowing warm...,no,Using super old plane. Aircon was blowing warm...
2,"I travelled with my sister, my elderly parent ...",yes,"I travelled with my sister, my elderly parent ..."
3,Was assigned last two row at seat 39F with the...,yes,Was assigned last two row at seat 39F with the...
4,Very rude male flight attendant. Accessed the ...,no,Very rude male flight attendant. Accessed the ...
...,...,...,...
95,I asked the counter staff to assign windows se...,no,I asked the counter staff to assign windows se...
96,They overcharged me for my luggage even though...,no,They overcharged me for my luggage even though...
97,Clark to Heathrow via Singapore. I just had th...,no,Clark to Heathrow via Singapore. I just had th...
98,1 hour 20 mins stood up in line to to check in...,no,1 hour 20 mins stood up in line to to check in...


In [10]:
# List the distinct labels present in the 'Recommended' column.
distinct_values = pd.unique(filtered_data_df['Recommended'])
distinct_values

array(['no', 'yes'], dtype=object)

In [11]:
# Encode the recommendation as a binary target: 'no' -> 1, 'yes' -> 0.
label_by_answer = {'yes': 0, 'no': 1}
filtered_data_df['is_negative_sentiment'] = filtered_data_df['Recommended'].map(label_by_answer)
filtered_data_df

Unnamed: 0,Review,Recommended,Cleaned_Review,is_negative_sentiment
0,My flight to Singapore was uneventful as usual...,no,My flight to Singapore was uneventful as usual...,1
1,Using super old plane. Aircon was blowing warm...,no,Using super old plane. Aircon was blowing warm...,1
2,"I travelled with my sister, my elderly parent ...",yes,"I travelled with my sister, my elderly parent ...",0
3,Was assigned last two row at seat 39F with the...,yes,Was assigned last two row at seat 39F with the...,0
4,Very rude male flight attendant. Accessed the ...,no,Very rude male flight attendant. Accessed the ...,1
...,...,...,...,...
4112,SYD-OOL. Arrived at the airport on time. Fligh...,no,SYD-OOL. Arrived at the airport on time. Fligh...,1
4113,Cairns-Sydney-Phuket in Business class. Was no...,no,Cairns-Sydney-Phuket in Business class. Was no...,1
4114,Had the misfortune of flying Business class fr...,no,Had the misfortune of flying Business class fr...,1
4115,I have just flown from Melbourne to Sydney and...,no,I have just flown from Melbourne to Sydney and...,1


In [12]:
# Keep only the cleaned text and its binary label as an independent frame.
train_test_df = filtered_data_df.loc[:, ['Cleaned_Review', 'is_negative_sentiment']].copy()
train_test_df

Unnamed: 0,Cleaned_Review,is_negative_sentiment
0,My flight to Singapore was uneventful as usual...,1
1,Using super old plane. Aircon was blowing warm...,1
2,"I travelled with my sister, my elderly parent ...",0
3,Was assigned last two row at seat 39F with the...,0
4,Very rude male flight attendant. Accessed the ...,1
...,...,...
4112,SYD-OOL. Arrived at the airport on time. Fligh...,1
4113,Cairns-Sydney-Phuket in Business class. Was no...,1
4114,Had the misfortune of flying Business class fr...,1
4115,I have just flown from Melbourne to Sydney and...,1


In [14]:
# The whole frame (text column plus label column) is split as one unit, so
# each resulting row keeps its label next to its text.
X = train_test_df

# Fraction of rows held out for testing (0.2 -> 20%).
test_size = 0.2

# Fixed random_state so the same split is produced on every run.
X_train, X_test = train_test_split(
    X,
    test_size=test_size,
    random_state=42,
)

In [15]:
# Write each split to its own CSV file, leaving out the DataFrame index.
for split_df, out_name in ((X_train, 'train.csv'), (X_test, 'test.csv')):
    split_df.to_csv(out_name, index=False)