In [1]:
import pandas as pd
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
df = pd.read_csv("merged_tickets_with_weather.csv")

In [6]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def clean_text(text):

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

text_columns = ['Origin', 'Airline', 'Flight Type', 'Price Type', 'Season', 'Rain']
for col in text_columns:
    df[col] = df[col].apply(clean_text)
print(df.head())

    Departure      Return     Airline Duration  Transit   Price  \
0  2025-04-02  2025-04-09  air france  17h 30m        1  1393.0   
1  2025-04-02  2025-04-09  air france  20h 25m        1   992.0   
2  2025-04-02  2025-04-09  air france  35h 05m        1  1513.0   
3  2025-04-02  2025-04-09  air france  36h 20m        1  1151.0   
4  2025-04-02  2025-04-09  air france  17h 30m        1  1114.0   

   Competitor Price  Duration in Minutes Flight Type  Price Type  Season  \
0           1435.84                 1050       short  affordable  spring   
1           1034.84                 1225      medium  affordable  spring   
2           1555.84                 2105        long   expensive  spring   
3           1193.84                 2180        long  affordable  spring   
4           1156.84                 1050       short  affordable  spring   

   Price per Hour Origin  Average Temp (°C)  Precipitation (mm) Rain  
0           79.60     la               13.6                 0.2  yes 

In [11]:
df['cleaned']=df['Origin']+' '+df['Airline']+' '+df['Flight Type']+' '+df['Price Type']+' '+df['Season']+' '+df['Rain']

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['cleaned'])

tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
word_scores = tfidf_df.sum().sort_values(ascending=False)

top_keywords = word_scores.head(10)
print(top_keywords)

yes           1074.396321
affordable    1059.457979
short          929.533569
medium         926.896783
riyadh         897.149558
autumn         892.737590
paris          879.895190
mea            854.845419
summer         827.531452
airline        791.246885
dtype: float64


In [13]:
!pip install -U spacy
!python -m spacy download en_core_web_sm # english model
!pip install textblob #sentiment analysis

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
import spacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')

sample_titles = df['cleaned'].sample(10, random_state=1).tolist()

for title in sample_titles:
    doc = nlp(title)
    print(f"Title: {title}")
    for token in doc:
        print(f"{token.text} - {token.pos_}")
    print("--------------")

Title: riyadh etihad airway long affordable autumn yes
riyadh - PROPN
etihad - VERB
airway - VERB
long - ADV
affordable - ADJ
autumn - NOUN
yes - INTJ
--------------
Title: la air france long affordable autumn 
la - ADP
air - PROPN
france - PROPN
long - PROPN
affordable - ADJ
autumn - NOUN
--------------
Title: paris mea short cheap autumn yes
paris - PROPN
mea - PROPN
short - PROPN
cheap - PROPN
autumn - NOUN
yes - INTJ
--------------
Title: la turkish airline short affordable winter yes
la - INTJ
turkish - VERB
airline - NOUN
short - ADJ
affordable - ADJ
winter - NOUN
yes - INTJ
--------------
Title: riyadh qatar airway medium cheap autumn 
riyadh - PROPN
qatar - PROPN
airway - NOUN
medium - NOUN
cheap - ADJ
autumn - NOUN
--------------
Title: la turkish airline medium affordable summer 
la - INTJ
turkish - VERB
airline - NOUN
medium - NOUN
affordable - ADJ
summer - NOUN
--------------
Title: riyadh qatar airway medium cheap autumn 
riyadh - PROPN
qatar - PROPN
airway - NOUN
medium -

In [16]:
from sklearn.cluster import KMeans
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['cleaned'])

kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
kmeans.fit(X)
df['cluster'] = kmeans.labels_

In [17]:
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("Top 5 words per cluster:\n")
for i in range(4):
    print(f"Cluster {i}:")
    top_words = [terms[ind] for ind in order_centroids[i, :5]]
    print(", ".join(top_words))
    print("--------")


Top 5 words per cluster:

Cluster 0:
paris, short, mea, yes, affordable
--------
Cluster 1:
airway, qatar, riyadh, medium, affordable
--------
Cluster 2:
airline, turkish, la, medium, yes
--------
Cluster 3:
riyadh, mea, short, affordable, autumn
--------


In [23]:
from sklearn.metrics.pairwise import cosine_similarity
tfidf_matrix = vectorizer.fit_transform(df['cleaned'])

def search_products(query, top_n=3):
    query_vec = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = similarity_scores.argsort()[::-1][:top_n]
    results = df.iloc[top_indices][['Price','Departure','Return','Season','Airline']]
    return results

location = input("Enter the location: ")
season = input("Enter the season: ")
query = location + ' ' + season
results = search_products(query)
print("Top 3 matching product titles:")
print(results)

Enter the location: riyad
Enter the season: Fall
Top 3 matching product titles:
     Price   Departure      Return  Season          Airline
16  1163.0  2025-04-02  2025-04-09  spring  turkish airline
17  1127.0  2025-04-02  2025-04-09  spring     qatar airway
18  1327.0  2025-04-02  2025-04-09  spring     qatar airway
