In [2]:
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

In [3]:
# Folder that holds both CSV files. Kept in one place so that moving the
# dataset only requires editing this single line.
DATA_DIR = Path(r"E:\Ruhuna\Sem 07 10_02_2025\Advanced AI\Project\Dataset")

# Load the two CSV files shipped with the dataset.
df_train = pd.read_csv(DATA_DIR / "train.csv")
df_test = pd.read_csv(DATA_DIR / "test.csv")

print(df_train.head())


   Unnamed: 0      id  Gender      Customer Type  Age   Type of Travel  \
0           0   70172    Male     Loyal Customer   13  Personal Travel   
1           1    5047    Male  disloyal Customer   25  Business travel   
2           2  110028  Female     Loyal Customer   26  Business travel   
3           3   24026  Female     Loyal Customer   25  Business travel   
4           4  119299    Male     Loyal Customer   61  Business travel   

      Class  Flight Distance  Inflight wifi service  \
0  Eco Plus              460                      3   
1  Business              235                      3   
2  Business             1142                      2   
3  Business              562                      2   
4  Business              214                      3   

   Departure/Arrival time convenient  ...  Inflight entertainment  \
0                                  4  ...                       5   
1                                  2  ...                       1   
2                

In [4]:
# Remove the two non-feature columns. errors="ignore" keeps this cell
# re-runnable: once the columns are gone, running it again is a no-op
# instead of raising KeyError.
df_train.drop(columns=["Unnamed: 0", "id"], inplace=True, errors="ignore")

In [5]:
# Count the missing entries in each column
missing_per_column = df_train.isna().sum()
print(missing_per_column)

Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
dtype: int64


In [6]:
# Discard every row that contains at least one missing value (in place)
df_train.dropna(axis=0, how="any", inplace=True)


In [7]:
# Preview the first five rows of the frame after the missing rows were removed
print(df_train.head(n=5))

   Gender      Customer Type  Age   Type of Travel     Class  Flight Distance  \
0    Male     Loyal Customer   13  Personal Travel  Eco Plus              460   
1    Male  disloyal Customer   25  Business travel  Business              235   
2  Female     Loyal Customer   26  Business travel  Business             1142   
3  Female     Loyal Customer   25  Business travel  Business              562   
4    Male     Loyal Customer   61  Business travel  Business              214   

   Inflight wifi service  Departure/Arrival time convenient  \
0                      3                                  4   
1                      3                                  2   
2                      2                                  2   
3                      2                                  5   
4                      3                                  3   

   Ease of Online booking  Gate location  ...  Inflight entertainment  \
0                       3              1  ...                

Encode the categorical variables

In [8]:
# List of columns to encode
label_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class','satisfaction']

# Every fitted encoder is kept, keyed by column name, so the exact same
# string -> integer mapping can later be applied to other data (for example
# df_test) with .transform(), or reversed with .inverse_transform().
label_encoders = {}

# Encode each column
for col in label_cols:
    # A fresh encoder per column: each column has its own set of categories.
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    label_encoders[col] = le

print(df_train.head(5))
    

   Gender  Customer Type  Age  Type of Travel  Class  Flight Distance  \
0       1              0   13               1      2              460   
1       1              1   25               0      0              235   
2       0              0   26               0      0             1142   
3       0              0   25               0      0              562   
4       1              0   61               0      0              214   

   Inflight wifi service  Departure/Arrival time convenient  \
0                      3                                  4   
1                      3                                  2   
2                      2                                  2   
3                      2                                  5   
4                      3                                  3   

   Ease of Online booking  Gate location  ...  Inflight entertainment  \
0                       3              1  ...                       5   
1                       3           

In [9]:
# Generate synthetic complaint text

# (rating column, complaint phrase) pairs. The order here is the order in
# which the phrases appear in the generated text.
_COMPLAINT_RULES = (
    ('Inflight wifi service', "wifi was slow"),
    ('Food and drink', "food was bad"),
    ('Departure/Arrival time convenient', "departure and arrival time was inconvenient"),
    ('Ease of Online booking', "online booking was difficult"),
    ('Seat comfort', "seat was uncomfortable"),
    ('Gate location', "gate location was bad"),
    ('Inflight entertainment', "inflight entertainment was bad"),
    ('On-board service', "onboard service was poor"),
    ('Leg room service', "leg room service was poor"),
    ('Baggage handling', "baggage handling was bad"),
    ('Checkin service', "check-in service was poor"),
    ('Inflight service', "inflight service was unhelpful"),
    ('Cleanliness', "plane was not clean"),
)


def generate_complaint(row, threshold=2):
    """Build a synthetic complaint sentence from one row of service ratings.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (a DataFrame row passed by
        ``df.apply(..., axis=1)``, or a plain dict) that contains every
        rating column listed in ``_COMPLAINT_RULES``.
    threshold : int, default 2
        A rating less than or equal to this value produces a complaint
        phrase for that column.

    Returns
    -------
    str
        The triggered phrases joined with " and ", in the order of
        ``_COMPLAINT_RULES``; "no complaint" when nothing is triggered.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the rating columns.
    """
    # NOTE(review): a rating of 0 also satisfies `<= threshold`. If 0 encodes
    # "not applicable" in this dataset, such rows are reported as complaints
    # -- TODO confirm against the dataset description.
    complaint = [
        phrase
        for column, phrase in _COMPLAINT_RULES
        if row[column] <= threshold
    ]
    return " and ".join(complaint) if complaint else "no complaint"

# Run generate_complaint once per row and store the resulting sentences
complaint_sentences = df_train.apply(generate_complaint, axis="columns")
df_train['Complaint_Text'] = complaint_sentences


In [10]:
# Show the column labels now present in the training frame
column_index = df_train.columns
print(column_index)

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction', 'Complaint_Text'],
      dtype='object')


In [11]:
# Fetch the NLTK corpora needed for stop-word removal and lemmatization
for corpus_name in ('stopwords', 'wordnet'):
    download_ok = nltk.download(corpus_name)
# Trailing expression so the cell still displays the result of the last download
download_ok


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Lazily-filled caches for the default NLTK resources. Loading the stop-word
# list and building the lemmatizer is loop-invariant, so it is done once
# instead of once per row.
_DEFAULT_STOP_WORDS = None
_DEFAULT_LEMMATIZER = None


def clean_text(text, stop_words=None, lemmatizer=None):
    """Normalise a piece of text for downstream text processing.

    Steps: lower-case the text, replace every character that is not an
    ASCII letter with a space, split on whitespace, drop stop words, and
    pass each remaining word through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, NaN, numbers) is treated
        as empty text.
    stop_words : collection of str, optional
        Words to discard. Defaults to NLTK's English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a shared ``WordNetLemmatizer`` instance.

    Returns
    -------
    str
        The surviving lemmatized words joined by single spaces; "" when
        nothing survives.
    """
    global _DEFAULT_STOP_WORDS, _DEFAULT_LEMMATIZER

    # Non-string cells (e.g. NaN from a missing value) have no .lower();
    # return empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    words = re.sub(r"[^a-zA-Z]", " ", text.lower()).split()
    if not words:
        return ""

    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _DEFAULT_STOP_WORDS
    if lemmatizer is None:
        if _DEFAULT_LEMMATIZER is None:
            _DEFAULT_LEMMATIZER = WordNetLemmatizer()
        lemmatizer = _DEFAULT_LEMMATIZER

    cleaned = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(cleaned)

# Clean every generated complaint sentence and keep the result in its own column
cleaned_sentences = df_train["Complaint_Text"].apply(clean_text)
df_train["Cleaned_Text"] = cleaned_sentences

In [13]:
# Separate the target column from the predictor columns
# NOTE(review): X still carries the free-text columns 'Complaint_Text' and
# 'Cleaned_Text'; presumably they are vectorised or dropped before fitting
# an estimator -- confirm in the model-training cells.
y = df_train['satisfaction']
X = df_train.drop('satisfaction', axis=1)

In [14]:
# Split the data into training and testing

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report