In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
import requests
import json
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
import pytz
from astral import LocationInfo
from astral.sun import sun
import random


file_id = "1f2JaIxdXlEypFlKDHwla5kC2_usslB43"
url = f"https://drive.google.com/uc?export=download&id={file_id}"

df = pd.read_csv(url)

In [17]:
df.head()

Unnamed: 0,Trip ID,Source,Destination,Trip Type,Start Date,End Date,Duration (Days),Budget (INR),Preferred Transportation,Accommodation Type,...,Shopping Interests,Event Participation,Safety Level,Travel Insurance Taken,Currency Exchange Needed,Language Barrier Level,Internet Availability,Peak Season or Off-Season,Tour Guide Required,Health & Fitness Facilities
0,1,Chandigarh,Ladakh,Beach,2023-10-23,2023-11-04,12,5685,Flight,Hostel,...,Street Markets,Sports Event,Very Safe,Yes,INR (Indian Rupee),Regional Language,Poor,Peak Season,No,Yoga Sessions
1,2,Dadra and Nagar Haveli and Daman and Diu,Odisha,Eco-Tourism,2023-03-26,2023-04-08,13,309348,Train,Hostel,...,Souvenirs,Food Festival,Moderately Safe,No,INR (Indian Rupee),Regional Language,Moderate,Peak Season,No,Gym Available
2,3,Karnataka,Himachal Pradesh,Adventure,2022-11-23,2022-12-05,12,172164,Flight,Hotel,...,Jewelry,Religious Event,Caution Required,Yes,INR (Indian Rupee),English,Excellent,Peak Season,Yes,Yoga Sessions
3,4,Puducherry,Maharashtra,Beach,2024-01-24,2024-02-01,8,391925,Train,Hostel,...,Handicrafts,Religious Event,Moderately Safe,Yes,INR (Indian Rupee),Hindi,Good,Peak Season,No,
4,5,Odisha,Sikkim,Heritage,2022-07-04,2022-07-23,19,381956,Flight,Resort,...,Luxury Brands,Film Screening,Unsafe at Night,Yes,INR (Indian Rupee),Hindi,Poor,Off-Season,Yes,Yoga Sessions


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 30 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Trip ID                      100000 non-null  int64 
 1   Source                       100000 non-null  object
 2   Destination                  100000 non-null  object
 3   Trip Type                    100000 non-null  object
 4   Start Date                   100000 non-null  object
 5   End Date                     100000 non-null  object
 6   Duration (Days)              100000 non-null  int64 
 7   Budget (INR)                 100000 non-null  int64 
 8   Preferred Transportation     100000 non-null  object
 9   Accommodation Type           100000 non-null  object
 10  Hotel Rating                 100000 non-null  int64 
 11  Planned Activities           100000 non-null  object
 12  Weather Condition            100000 non-null  object
 13  Cuisine Prefere

In [18]:
df["Special Requirements"] = df["Special Requirements"].fillna(df["Special Requirements"].mode()[0])
df["Health & Fitness Facilities"] = df["Health & Fitness Facilities"].fillna(df["Health & Fitness Facilities"].mode()[0])

In [None]:
def preprocess_data(df):
    df['features'] = (df['Destination'] + ' ' + df['Trip Type'] + ' ' + 
                      df['Planned Activities'] + ' ' + df['Cuisine Preference'] + ' ' + 
                      df['Travel Companions'] + ' ' + df['Special Requirements'])
    return df

df = preprocess_data(df)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

def get_sun_times(city, date):
    try:
        geolocator = Nominatim(user_agent="itinerary_planner")
        location = geolocator.geocode(city)
        
        if not location:
            return None, None
        
        tf = TimezoneFinder()
        timezone_str = tf.timezone_at(lng=location.longitude, lat=location.latitude)
        timezone = pytz.timezone(timezone_str)
        
        loc = LocationInfo(city, "India", timezone_str, location.latitude, location.longitude)
        s = sun(loc.observer, date=date, tzinfo=timezone)
        
        return s['sunrise'].strftime('%H:%M'), s['sunset'].strftime('%H:%M')
    except:
        return None, None

def get_points_of_interest(city, interest_type):
    mock_data = {
        'sunrise': ['Sunrise Point', 'Hilltop View', 'Beachfront'],
        'sunset': ['Sunset Point', 'Cliffside', 'Lakeside'],
        'breakfast': ['Local Cafe', 'Hotel Restaurant', 'Street Food'],
        'lunch': ['Traditional Restaurant', 'Fine Dining', 'Quick Bites'],
        'dinner': ['Rooftop Restaurant', 'Local Diner', 'Seafood Place'],
        'activity': ['Historical Site', 'Nature Walk', 'Museum', 'Shopping District']
    }
    return mock_data.get(interest_type, [])

def generate_itinerary(source, destination, days, start_date, budget, transport, accommodation, dietary):
    start_date = pd.to_datetime(start_date).date()
    end_date = start_date + timedelta(days=days-1)
    
    query = f"{destination} {budget} {transport} {accommodation} {dietary}"
    query_vec = tfidf.transform([query])
    
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    similar_indices = similarity_scores.argsort()[-5:][::-1]
    similar_trips = df.iloc[similar_indices]
    avg_similarity = similarity_scores[similar_indices].mean()
    
    common_activities = set()
    for activities in similar_trips['Planned Activities']:
        common_activities.update(activities.split(','))
    common_activities = list(common_activities)[:days*2]  # max 2 activities per day
    
    itinerary = []
    current_date = start_date
    
    for day in range(1, days+1):
        day_plan = {"Day": day, "Date": current_date, "Activities": []}
        sunrise_time, sunset_time = get_sun_times(destination, current_date)
        
        # sunrise point if available
        if sunrise_time:
            sunrise_points = get_points_of_interest(destination, 'sunrise')
            if sunrise_points:
                day_plan["Activities"].append({
                    "time": f"Before {sunrise_time}",
                    "activity": f"Sunrise at {random.choice(sunrise_points)}"
                })
        
        # breakfast
        breakfast_places = get_points_of_interest(destination, 'breakfast')
        day_plan["Activities"].append({
            "time": "08:00 - 09:00",
            "activity": f"Breakfast at {random.choice(breakfast_places) if breakfast_places else 'local restaurant'}"
        })
        
        # 1st activity
        if common_activities:
            day_plan["Activities"].append({
                "time": "09:30 - 12:30",
                "activity": f"{common_activities.pop(0)}"
            })
        
        # lunch
        lunch_places = get_points_of_interest(destination, 'lunch')
        day_plan["Activities"].append({
            "time": "13:00 - 14:00",
            "activity": f"Lunch at {random.choice(lunch_places) if lunch_places else 'local restaurant'}"
        })
        
        #2nd activity
        if common_activities:
            day_plan["Activities"].append({
                "time": "14:30 - 19:00",
                "activity": f"{common_activities.pop(0)}"
            })
        
        # sunset point if available
        if sunset_time:
            sunset_points = get_points_of_interest(destination, 'sunset')
            if sunset_points:
                day_plan["Activities"].append({
                    "time": f"Before {sunset_time}",
                    "activity": f"Sunset at {random.choice(sunset_points)}"
                })
        
        # dinner
        dinner_places = get_points_of_interest(destination, 'dinner')
        day_plan["Activities"].append({
            "time": "19:30 - 20:30",
            "activity": f"Dinner at {random.choice(dinner_places) if dinner_places else 'local restaurant'}"
        })
        
        itinerary.append(day_plan)
        current_date += timedelta(days=1)
    
    return itinerary, avg_similarity

# testing the function with sample inputs
if __name__ == "__main__":
    # example user inputs from frontend
    user_input = {
        "Source": "Delhi",
        "Destination": "Goa",
        "Days": 3,
        "Start Date": "2023-12-15",
        "Budget": 20000,
        "Preferred Transportation": "Flight",
        "Accommodation Type": "Hotel",
        "Dietary Restrictions": "Vegetarian"
    }
    
    # generating itinerary
    itinerary, accuracy = generate_itinerary(
        source=user_input["Source"],
        destination=user_input["Destination"],
        days=user_input["Days"],
        start_date=user_input["Start Date"],
        budget=user_input["Budget"],
        transport=user_input["Preferred Transportation"],
        accommodation=user_input["Accommodation Type"],
        dietary=user_input["Dietary Restrictions"]
    )
    
    # displaying itinerary
    print(f"\nGenerated Itinerary (Accuracy: {accuracy:.2%})")
    print("="*50)
    for day in itinerary:
        print(f"\nDay {day['Day']} ({day['Date'].strftime('%Y-%m-%d')}):")
        print("-"*30)
        for activity in day["Activities"]:
            print(f"{activity['time']}: {activity['activity']}")


Generated Itinerary (Accuracy: 54.32%)

Day 1 (2023-12-15):
------------------------------
Before 06:51: Sunrise at Beachfront
08:00 - 09:00: Breakfast at Local Cafe
09:30 - 12:30:  Museums
13:00 - 14:00: Lunch at Fine Dining
14:30 - 19:00:  Shopping
Before 18:05: Sunset at Sunset Point
19:30 - 20:30: Dinner at Local Diner

Day 2 (2023-12-16):
------------------------------
Before 06:52: Sunrise at Sunrise Point
08:00 - 09:00: Breakfast at Local Cafe
09:30 - 12:30: Trekking
13:00 - 14:00: Lunch at Traditional Restaurant
14:30 - 19:00:  Hiking
Before 18:05: Sunset at Lakeside
19:30 - 20:30: Dinner at Seafood Place

Day 3 (2023-12-17):
------------------------------
Before 06:52: Sunrise at Sunrise Point
08:00 - 09:00: Breakfast at Street Food
09:30 - 12:30: Hiking
13:00 - 14:00: Lunch at Traditional Restaurant
14:30 - 19:00:  Fishing
Before 18:06: Sunset at Lakeside
19:30 - 20:30: Dinner at Local Diner
