## Note that the if the user has booked the flight and hotel then only he/she will get recommendation else will get the output unknown user.

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus

# Load database credentials from environment variables
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = 3306 # MySQL default port is 3306
DB_NAME = "makemytrip"

# Check if any variable is missing
if None in [DB_USER, DB_PASS, DB_HOST, DB_PORT, DB_NAME]:
    raise ValueError("One or more environment variables are not set. Please check your .env file or system settings.")

# Encode password to handle special characters
DB_PASS = quote_plus(DB_PASS)

# Create MySQL connection string
DATABASE_URL = f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DATABASE_URL)

# Fetch car rental data
query = "SELECT * FROM carrental;"
df_carrental = pd.read_sql(query, engine)

# Display the first few rows
df_carrental.head()

Unnamed: 0,travelCode,User_ID,Check_in,pickupLocation,dropoffLocation,carType,rentalAgency,rentalDuration,Car_total_distance,fuelPolicy,Car_bookingStatus,total_rent_price
0,22,0,2020-02-27,Johntown,Port Brian,Sedan,Sixt,3,285,Prepaid,Pending,3410.0
1,45,0,2020-08-06,Owensland,Ruizfort,Sedan,Enterprise,3,412,Full-to-Full,Pending,4644.0
2,117,2,2020-02-27,Edwardview,Katiefort,Sedan,Hertz,1,433,Partial,Confirmed,4740.0
3,119,2,2020-03-12,Moranborough,Lake Stephanie,Hatchback,Budget,3,421,Full-to-Full,Confirmed,3744.0
4,153,2,2020-11-05,Port Kathrynstad,East Ronnieberg,Sedan,Enterprise,5,100,Prepaid,Cancelled,1694.0


In [2]:
# import numpy as np
# from scipy.sparse import coo_matrix
# from sklearn.preprocessing import LabelEncoder

# # Copy dataframe to avoid modifying original
# df = df_carrental.copy()

# # Filter only confirmed bookings (optional)
# df = df[df["Car_bookingStatus"] == "Confirmed"]

# # Ensure carType and rentalAgency are strings before concatenation
# df["carType"] = df["carType"].astype(str)
# df["rentalAgency"] = df["rentalAgency"].astype(str)

# # Encode user and car rental information
# user_encoder = LabelEncoder()
# car_encoder = LabelEncoder()

# df["user_id"] = user_encoder.fit_transform(df["User_ID"])
# df["car_id"] = car_encoder.fit_transform(df["carType"] + " - " + df["rentalAgency"])  # Unique car-agency pair

# # Create interaction matrix (Users x Cars) with rental count as implicit feedback
# interaction_matrix = coo_matrix(
#     (np.ones(len(df)), (df["user_id"], df["car_id"])),
#     shape=(df["user_id"].nunique(), df["car_id"].nunique())
# )

# # Check matrix shape
# interaction_matrix.shape
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.preprocessing import LabelEncoder
import implicit
from implicit.evaluation import precision_at_k, mean_average_precision_at_k

# Copy dataframe to avoid modifying the original
df = df_carrental.copy()

# Filter only confirmed bookings (optional)
df = df[df["Car_bookingStatus"] == "Confirmed"]

# Ensure carType and rentalAgency are strings before concatenation
df["carType"] = df["carType"].astype(str)
df["rentalAgency"] = df["rentalAgency"].astype(str)

# Encode user and car rental information
user_encoder = LabelEncoder()
car_encoder = LabelEncoder()

df["user_id"] = user_encoder.fit_transform(df["User_ID"])
df["car_id"] = car_encoder.fit_transform(df["carType"] + " - " + df["rentalAgency"])  # Unique car-agency pair

# Check if 'Rental_Count' exists, if not, assume count is 1 per row
if "Rental_Count" not in df.columns:
    df["Rental_Count"] = 1

# Create interaction matrix with rental count as implicit feedback
interaction_matrix = coo_matrix(
    (df["Rental_Count"], (df["user_id"], df["car_id"])),  
    shape=(df["user_id"].nunique(), df["car_id"].nunique())
)

# Convert to CSR format (required for ALS)
interaction_matrix_csr = interaction_matrix.tocsr()
interaction_matrix_csr.shape


  from .autonotebook import tqdm as notebook_tqdm


(723, 20)

In [5]:
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import LabelEncoder
import implicit
from implicit.evaluation import precision_at_k, mean_average_precision_at_k

# Copy dataframe
df = df_carrental.copy()

# Filter only confirmed bookings
df = df[df["Car_bookingStatus"] == "Confirmed"]

# Encode user and car rental information
user_encoder = LabelEncoder()
car_encoder = LabelEncoder()

df["user_id"] = user_encoder.fit_transform(df["User_ID"])
df["car_id"] = car_encoder.fit_transform(df["carType"].astype(str) + " - " + df["rentalAgency"].astype(str))

# Ensure 'Rental_Count' exists
if "Rental_Count" not in df.columns:
    df["Rental_Count"] = 1

# ✅ Create interaction matrix (Users x Cars)
interaction_matrix = coo_matrix(
    (df["Rental_Count"], (df["user_id"], df["car_id"])),  
    shape=(df["user_id"].nunique(), df["car_id"].nunique())
).tocsr()

# ✅ Correct Train/Test Split
train_percentage = 0.8
num_interactions = interaction_matrix.nnz
train_size = int(train_percentage * num_interactions)

interaction_indices = np.arange(num_interactions)
np.random.shuffle(interaction_indices)

train_indices = interaction_indices[:train_size]
test_indices = interaction_indices[train_size:]

train_interactions = interaction_matrix.copy()  # ✅ Ensure sparse format
test_interactions = interaction_matrix.copy()   # ✅ Ensure sparse format

print(f"Train Interactions Type: {type(train_interactions)}")
print(f"Test Interactions Type: {type(test_interactions)}")

# ✅ Train ALS model
model = implicit.als.AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05)
model.fit(train_interactions)

print("✅ Model training completed!")


precision_scores = precision_at_k(model, train_interactions, test_interactions, K=10)
recall_scores = mean_average_precision_at_k(model, train_interactions, test_interactions, K=10)

# ✅ Convert to a NumPy array before taking mean()
precision_scores = np.asarray(precision_scores).mean()
recall_scores = np.asarray(recall_scores).mean()

print(f"✅ Updated Model Performance:")
print(f"Precision@10: {precision_scores:.4f}")
print(f"Recall@10: {recall_scores:.4f}")


Train Interactions Type: <class 'scipy.sparse._csr.csr_matrix'>
Test Interactions Type: <class 'scipy.sparse._csr.csr_matrix'>


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:00<00:00, 333.27it/s]


✅ Model training completed!


  0%|          | 0/723 [00:00<?, ?it/s]


AttributeError: 'implicit.evaluation._memoryviewslice' object has no attribute 'dtype'

checking if implicit is already installed.:

In [None]:
import implicit

# Convert interaction matrix to CSR format (required for Implicit library)
interaction_matrix_csr = interaction_matrix.tocsr()

# Train ALS model
# model = implicit.als.AlternatingLeastSquares(factors=10, iterations=20, regularization=0.1) this model gave poor performance. because of certain reasons.
model = implicit.als.AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05)

model.fit(interaction_matrix_csr)

print("Model training completed!")

100%|██████████| 50/50 [00:00<00:00, 364.95it/s]

Model training completed!





printing the performance of the model.

below approach did not work so I have defined a function to print the performance.

In [None]:
import implicit
from implicit.evaluation import mean_average_precision_at_k, ndcg_at_k, precision_at_k
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix

# Ensure interaction matrix is in CSR format
interaction_matrix_csr = interaction_matrix.tocsr()

# Get row indices (users) and column indices (items) from sparse matrix
user_indices, item_indices = interaction_matrix_csr.nonzero()

# Split into train and test (80% train, 20% test)
train_users, test_users, train_items, test_items = train_test_split(
    user_indices, item_indices, test_size=0.2, random_state=42
)

# Create new train and test sparse matrices
train_matrix = csr_matrix(
    (np.ones_like(train_users), (train_users, train_items)), 
    shape=interaction_matrix_csr.shape
)

test_matrix = csr_matrix(
    (np.ones_like(test_users), (test_users, test_items)), 
    shape=interaction_matrix_csr.shape
)

# Train ALS model
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=20, regularization=0.1)
model.fit(train_matrix)

# ✅ Convert test_matrix properly before evaluation
test_matrix = test_matrix.tocsr()

# Evaluate model performance
map_at_10 = mean_average_precision_at_k(model, train_matrix, test_matrix, K=10)
ndcg_at_10 = ndcg_at_k(model, train_matrix, test_matrix, K=10)
precision_at_10 = precision_at_k(model, train_matrix, test_matrix, K=10)

print("Model Performance Metrics:")
print(f"MAP@10: {map_at_10:.4f}")
print(f"NDCG@10: {ndcg_at_10:.4f}")
print(f"Precision@10: {precision_at_10:.4f}")


In [None]:
import numpy as np

def evaluate_als_model(model, train_matrix, test_matrix, K=10):
    """
    Evaluate the ALS model using Precision@K and Recall@K.
    """
    precision_list = []
    recall_list = []

    num_users, _ = train_matrix.shape

    for user_id in range(num_users):
        # Get test items for the user
        test_items = test_matrix[user_id].indices

        if len(test_items) == 0:
            continue  # Skip users with no test data

        # Get top-K recommendations for the user
        recommended_items = model.recommend(user_id, train_matrix[user_id], N=K, filter_already_liked_items=True)
        recommended_items = [item[0] for item in recommended_items]

        # Compute Precision@K
        hits = len(set(recommended_items) & set(test_items))
        precision = hits / K
        recall = hits / len(test_items)

        precision_list.append(precision)
        recall_list.append(recall)

    # Compute final metrics
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)

    return avg_precision, avg_recall

# Run Evaluation
precision_at_10, recall_at_10 = evaluate_als_model(model, interaction_matrix_csr, interaction_matrix_csr, K=10)

print("✅ Model Performance Metrics:")
print(f"Precision@10: {precision_at_10:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")


✅ Model Performance Metrics:
Precision@10: 0.0000
Recall@10: 0.0000


Code to get the mapping for carType.

In [None]:
# Create a mapping of encoded car_id to original carType
car_id_mapping = dict(zip(df["car_id"], df["carType"]))

# Display unique mappings
print("Car Type Mapping (Encoded → Original):")
for enc_id, orig_car in car_id_mapping.items():
    print(f"Encoded ID: {enc_id} → Original Car Type: {orig_car}")

Car Type Mapping (Encoded → Original):
Encoded ID: 18 → Original Car Type: Sedan
Encoded ID: 1 → Original Car Type: Hatchback
Encoded ID: 17 → Original Car Type: Sedan
Encoded ID: 13 → Original Car Type: SUV
Encoded ID: 4 → Original Car Type: Hatchback
Encoded ID: 7 → Original Car Type: Luxury
Encoded ID: 0 → Original Car Type: Hatchback
Encoded ID: 3 → Original Car Type: Hatchback
Encoded ID: 8 → Original Car Type: Luxury
Encoded ID: 9 → Original Car Type: Luxury
Encoded ID: 16 → Original Car Type: Sedan
Encoded ID: 6 → Original Car Type: Luxury
Encoded ID: 12 → Original Car Type: SUV
Encoded ID: 15 → Original Car Type: Sedan
Encoded ID: 19 → Original Car Type: Sedan
Encoded ID: 2 → Original Car Type: Hatchback
Encoded ID: 14 → Original Car Type: SUV
Encoded ID: 11 → Original Car Type: SUV
Encoded ID: 10 → Original Car Type: SUV
Encoded ID: 5 → Original Car Type: Luxury


In [None]:
# Sort the dictionary by encoded ID (keys)
sorted_car_id_mapping = dict(sorted(car_id_mapping.items()))

# Print the sorted mapping
print("Car Type Mapping (Encoded → Original) [Sorted]:")
for enc_id, orig_car in sorted_car_id_mapping.items():
    print(f"Encoded ID: {enc_id} → Original Car Type: {orig_car}")

Car Type Mapping (Encoded → Original) [Sorted]:
Encoded ID: 0 → Original Car Type: Hatchback
Encoded ID: 1 → Original Car Type: Hatchback
Encoded ID: 2 → Original Car Type: Hatchback
Encoded ID: 3 → Original Car Type: Hatchback
Encoded ID: 4 → Original Car Type: Hatchback
Encoded ID: 5 → Original Car Type: Luxury
Encoded ID: 6 → Original Car Type: Luxury
Encoded ID: 7 → Original Car Type: Luxury
Encoded ID: 8 → Original Car Type: Luxury
Encoded ID: 9 → Original Car Type: Luxury
Encoded ID: 10 → Original Car Type: SUV
Encoded ID: 11 → Original Car Type: SUV
Encoded ID: 12 → Original Car Type: SUV
Encoded ID: 13 → Original Car Type: SUV
Encoded ID: 14 → Original Car Type: SUV
Encoded ID: 15 → Original Car Type: Sedan
Encoded ID: 16 → Original Car Type: Sedan
Encoded ID: 17 → Original Car Type: Sedan
Encoded ID: 18 → Original Car Type: Sedan
Encoded ID: 19 → Original Car Type: Sedan


⬆️this mapping indicates that for every rental agency there are five cars.

this is to check for the other users.

In [None]:
def recommend_cars(user_id, model, user_encoder, car_encoder, df, num_recommendations=5):
    """
    Recommend cars for a given user.
    """
    # Ensure the user exists in the dataset before encoding
    if user_id not in df["User_ID"].unique():
        print(f"User {user_id} not found in the dataset!")
        return

    # Convert User_ID to encoded user_id
    encoded_user_id = user_encoder.transform([user_id])[0]

    # Get recommended car indices
    recommended_car_ids, _ = model.recommend(encoded_user_id, interaction_matrix_csr[encoded_user_id], N=num_recommendations)

    # Decode car IDs to get carType and rentalAgency
    recommended_cars = car_encoder.inverse_transform(recommended_car_ids)

    print(f"Top {num_recommendations} recommended cars for User {user_id}:")
    # print("Car Type\tRental Agency")
    for car in recommended_cars:
        car_type, agency = car.split(" - ")
        print(f"Car Type {car_type}, Rental Agency {agency}")
        # print(f"{car_type}\t\t{agency}")

# Example: Run recommendations for User_ID = 5
recommend_cars(user_id=44, model=model, user_encoder=user_encoder, car_encoder=car_encoder, df=df_carrental)

Top 5 recommended cars for User 44:
Car Type SUV, Rental Agency Hertz
Car Type SUV, Rental Agency Avis
Car Type SUV, Rental Agency Budget
Car Type Luxury, Rental Agency Budget
Car Type Hatchback, Rental Agency Avis


In [None]:
# Example: Run recommendations for User_ID = 5
recommend_cars(user_id=1327, model=model, user_encoder=user_encoder, car_encoder=car_encoder, df=df_carrental)

Top 5 recommended cars for User 1327:
Car Type Luxury, Rental Agency Avis
Car Type Luxury, Rental Agency Hertz
Car Type SUV, Rental Agency Hertz
Car Type SUV, Rental Agency Enterprise
Car Type Hatchback, Rental Agency Budget


Below are the code to get the unique user and there mapping.

In [None]:
#check if user_id exists:
print(f"User IDs in the interaction matrix: {interaction_matrix.index.min()} to {interaction_matrix.index.max()}")

User IDs in the interaction matrix: 0 to 1339


this is the mapping in for the unique user because the user_id with value two is considered as first and so on..

In [None]:
# Create a mapping of encoded user_id to original User_ID
user_id_mapping = dict(zip(df["user_id"], df["User_ID"]))

# Display unique mappings
print("User ID Mapping (Encoded → Original):")
for enc_id, orig_id in user_id_mapping.items():
    print(f"Encoded ID: {enc_id} → Original User_ID: {orig_id}")

User ID Mapping (Encoded → Original):
Encoded ID: 0 → Original User_ID: 2
Encoded ID: 1 → Original User_ID: 3
Encoded ID: 2 → Original User_ID: 4
Encoded ID: 3 → Original User_ID: 6
Encoded ID: 4 → Original User_ID: 8
Encoded ID: 5 → Original User_ID: 9
Encoded ID: 6 → Original User_ID: 10
Encoded ID: 7 → Original User_ID: 11
Encoded ID: 8 → Original User_ID: 12
Encoded ID: 9 → Original User_ID: 13
Encoded ID: 10 → Original User_ID: 15
Encoded ID: 11 → Original User_ID: 16
Encoded ID: 12 → Original User_ID: 19
Encoded ID: 13 → Original User_ID: 21
Encoded ID: 14 → Original User_ID: 22
Encoded ID: 15 → Original User_ID: 25
Encoded ID: 16 → Original User_ID: 27
Encoded ID: 17 → Original User_ID: 32
Encoded ID: 18 → Original User_ID: 34
Encoded ID: 19 → Original User_ID: 36
Encoded ID: 20 → Original User_ID: 39
Encoded ID: 21 → Original User_ID: 40
Encoded ID: 22 → Original User_ID: 41
Encoded ID: 23 → Original User_ID: 44
Encoded ID: 24 → Original User_ID: 47
Encoded ID: 25 → Original Us

In [None]:
# Get unique User_IDs that exist in the dataset
unique_users = df["User_ID"].unique()
print("Available User_IDs for recommendations:", unique_users)


Available User_IDs for recommendations: [   2    3    4    6    8    9   10   11   12   13   15   16   19   21
   22   25   27   32   34   36   39   40   41   44   47   49   50   52
   55   57   58   60   61   64   65   66   67   68   69   73   75   76
   79   80   82   83   90   93  106  107  108  109  111  112  113  115
  116  117  118  119  120  125  126  129  131  133  134  135  137  141
  145  146  148  149  150  152  153  155  158  160  161  162  165  166
  168  169  174  175  176  177  178  179  180  182  186  187  188  193
  194  195  199  201  202  204  208  209  210  212  216  218  219  220
  221  222  223  228  229  232  233  234  236  237  239  240  241  242
  243  244  245  247  248  249  250  251  254  259  260  263  266  267
  268  271  272  273  274  276  277  279  280  281  284  285  288  290
  292  295  296  299  300  301  302  304  307  308  312  314  317  318
  319  321  326  328  329  330  331  337  342  343  344  348  349  352
  357  358  359  360  362  363  366  

In [None]:
# Count how many times each User_ID appears in the dataset (i.e., number of car bookings)
user_car_booking_counts = df.groupby("User_ID")["travelCode"].count().reset_index()

# Rename the column for clarity
user_car_booking_counts.columns = ["User_ID", "Number_of_Car_Bookings"]

# Display the result
print(user_car_booking_counts)

     User_ID  Number_of_Car_Bookings
0          2                       2
1          3                       3
2          4                       2
3          6                       1
4          8                       1
..       ...                     ...
718     1324                       2
719     1325                       2
720     1327                       2
721     1330                       3
722     1335                       2

[723 rows x 2 columns]


# !!!!! Not to run !!!!

In [None]:
#ensuring correct indexing is done:
if user_id in interaction_matrix.index:
    user_index = interaction_matrix.index.get_loc(user_id)  # Get row index
    print(f"User {user_id} is at index {user_index} in the interaction matrix.")
else:
    print(f"User {user_id} is not found in the interaction matrix!")


User 2 is at index 1 in the interaction matrix.


In [None]:
user_id=2
if user_id in interaction_matrix.index:
    user_index = interaction_matrix.index.get_loc(user_id)  # Convert User_ID to matrix index
    
    recommended_car_indices, scores = model.recommend(
        user_index,  # Pass the matrix index instead of User_ID
        interaction_sparse,
        N=3  # Number of recommendations
    )

    # Map encoded carType indices back to original names
    car_type_mapping = dict(enumerate(df_carrental["carType"].astype("category").cat.categories))
    recommended_cars = [car_type_mapping[idx] for idx in recommended_car_indices]

    # Display recommendations
    for i, (car, score) in enumerate(zip(recommended_cars, scores)):
        print(f"Recommendation {i+1}: {car} (Score: {score:.2f})")

else:
    print(f"User {user_id} not found in interaction matrix!")


ValueError: user_items must contain 1 row for every user in userids

In [None]:
import pymysql
import os
# Database connection parameters (update these with your actual credentials)
host = os.environ.get("DB_HOST")  # Use os.environ instead of os.getenv
user = os.environ.get("DB_USER")
password = os.environ.get("DB_PASSWORD")
database = "makemytrip"

# Establish the database connection
conn = pymysql.connect(host=host, user=user, password=password, database=database)

# SQL query to retrieve car rental data
query = "SELECT * FROM carrental;"

# Load the data into a Pandas DataFrame
df_car_rental = pd.read_sql(query, conn)

# Close the connection
conn.close()

# Display the first few rows
df_car_rental.head()


  df_car_rental = pd.read_sql(query, conn)


Unnamed: 0,travelCode,User_ID,Check_in,pickupLocation,dropoffLocation,carType,rentalAgency,rentalDuration,Car_total_distance,fuelPolicy,Car_bookingStatus,total_rent_price
0,22,0,2020-02-27,Johntown,Port Brian,Sedan,Sixt,3,285,Prepaid,Pending,3410.0
1,45,0,2020-08-06,Owensland,Ruizfort,Sedan,Enterprise,3,412,Full-to-Full,Pending,4644.0
2,117,2,2020-02-27,Edwardview,Katiefort,Sedan,Hertz,1,433,Partial,Confirmed,4740.0
3,119,2,2020-03-12,Moranborough,Lake Stephanie,Hatchback,Budget,3,421,Full-to-Full,Confirmed,3744.0
4,153,2,2020-11-05,Port Kathrynstad,East Ronnieberg,Sedan,Enterprise,5,100,Prepaid,Cancelled,1694.0


In [None]:
# Check for missing values
missing_values = df_car_rental.isnull().sum()

# Check for duplicates
duplicate_rows = df_car_rental.duplicated().sum()

# Display results
print("Missing Values:\n", missing_values)
print("\nDuplicate Rows:", duplicate_rows)

Missing Values:
 travelCode            0
User_ID               0
Check_in              0
pickupLocation        0
dropoffLocation       0
carType               0
rentalAgency          0
rentalDuration        0
Car_total_distance    0
fuelPolicy            0
Car_bookingStatus     0
total_rent_price      0
dtype: int64

Duplicate Rows: 0


In [None]:
# Check if 'user_id' column exists in the dataset
if 'User_ID' in df_car_rental.columns:
    print("✅ 'user_id' is available in the dataset!")
else:
    print("❌ 'user_id' is missing. We need user booking history for personalized recommendations.")


✅ 'user_id' is available in the dataset!


In [None]:
# Standardize column names
df_car_rental.rename(columns={'User_ID': 'user_id'}, inplace=True)

# Check unique users and their booking frequency
user_booking_counts = df_car_rental['user_id'].value_counts()

# Display the number of unique users and top 5 users with most bookings
print(f"Total Unique Users: {df_car_rental['user_id'].nunique()}")
print("\nTop 5 Users by Booking Frequency:\n", user_booking_counts.head())

# Display the first few rows of the dataset
df_car_rental[['user_id', 'carType', 'rentalAgency']].head()


Total Unique Users: 1099

Top 5 Users by Booking Frequency:
 user_id
775     13
936     13
381     12
251     12
1155    11
Name: count, dtype: int64


Unnamed: 0,user_id,carType,rentalAgency
0,0,Sedan,Sixt
1,0,Sedan,Enterprise
2,2,Sedan,Hertz
3,2,Hatchback,Budget
4,2,Sedan,Enterprise


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode user_id, car_type, and rental_agency
user_encoder = LabelEncoder()
car_encoder = LabelEncoder()
agency_encoder = LabelEncoder()

df_car_rental['user_id'] = user_encoder.fit_transform(df_car_rental['user_id'])
df_car_rental['carType'] = car_encoder.fit_transform(df_car_rental['carType'])
df_car_rental['rentalAgency'] = agency_encoder.fit_transform(df_car_rental['rentalAgency'])

# Display first few rows after encoding
df_car_rental.head()


Unnamed: 0,travelCode,user_id,Check_in,pickupLocation,dropoffLocation,carType,rentalAgency,rentalDuration,Car_total_distance,fuelPolicy,Car_bookingStatus,total_rent_price
0,22,0,2020-02-27,Johntown,Port Brian,3,4,3,285,Prepaid,Pending,3410.0
1,45,0,2020-08-06,Owensland,Ruizfort,3,2,3,412,Full-to-Full,Pending,4644.0
2,117,1,2020-02-27,Edwardview,Katiefort,3,3,1,433,Partial,Confirmed,4740.0
3,119,1,2020-03-12,Moranborough,Lake Stephanie,0,1,3,421,Full-to-Full,Confirmed,3744.0
4,153,1,2020-11-05,Port Kathrynstad,East Ronnieberg,3,2,5,100,Prepaid,Cancelled,1694.0


In [None]:
# Retrieve mappings
user_mapping = dict(zip(user_encoder.classes_, user_encoder.transform(user_encoder.classes_)))
car_mapping = dict(zip(car_encoder.classes_, car_encoder.transform(car_encoder.classes_)))
agency_mapping = dict(zip(agency_encoder.classes_, agency_encoder.transform(agency_encoder.classes_)))

# Display mappings
print("User ID Mapping:\n", user_mapping)
print("\nCar Type Mapping:\n", car_mapping)
print("\nRental Agency Mapping:\n", agency_mapping)


User ID Mapping:
 {np.int64(0): np.int64(0), np.int64(2): np.int64(1), np.int64(3): np.int64(2), np.int64(4): np.int64(3), np.int64(6): np.int64(4), np.int64(7): np.int64(5), np.int64(8): np.int64(6), np.int64(9): np.int64(7), np.int64(10): np.int64(8), np.int64(11): np.int64(9), np.int64(12): np.int64(10), np.int64(13): np.int64(11), np.int64(14): np.int64(12), np.int64(15): np.int64(13), np.int64(16): np.int64(14), np.int64(17): np.int64(15), np.int64(19): np.int64(16), np.int64(20): np.int64(17), np.int64(21): np.int64(18), np.int64(22): np.int64(19), np.int64(23): np.int64(20), np.int64(24): np.int64(21), np.int64(25): np.int64(22), np.int64(27): np.int64(23), np.int64(28): np.int64(24), np.int64(29): np.int64(25), np.int64(30): np.int64(26), np.int64(32): np.int64(27), np.int64(34): np.int64(28), np.int64(36): np.int64(29), np.int64(38): np.int64(30), np.int64(39): np.int64(31), np.int64(40): np.int64(32), np.int64(41): np.int64(33), np.int64(43): np.int64(34), np.int64(44): np.in

In [None]:
import pandas as pd
from scipy.sparse import csr_matrix

# Create an interaction matrix (users × car types)
interaction_matrix = df_car_rental.pivot_table(index='user_id', columns='carType', aggfunc='size', fill_value=0)

# Convert to sparse matrix for efficient computation
interaction_sparse = csr_matrix(interaction_matrix)

# Display matrix shape
print(f"Interaction Matrix Shape: {interaction_matrix.shape}")
interaction_matrix.head()


Interaction Matrix Shape: (1099, 4)


carType,0,1,2,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,0,2
1,1,0,0,2
2,2,0,3,1
3,1,4,0,2
4,1,0,0,0


In [None]:
import implicit
print("Implicit library installed successfully!")

Implicit library installed successfully!


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares

print("All libraries loaded successfully!")


All libraries loaded successfully!


In [None]:
# Load your dataset
df = pd.read_excel(r"D:\Make_my_trip\FinalDataset\CarFINALdataset.xlsx")  # Replace with your actual dataset


In [None]:
df.head()

Unnamed: 0,User_ID,travelCode,Check-in,pickupLocation,dropoffLocation,carType,rentalAgency,rentalDuration,Car_total_distance,fuelPolicy,Car_bookingStatus,total_rent_price
0,0,22,02/27/2020,Johntown,Port Brian,Sedan,Sixt,3,285,Prepaid,Pending,3410
1,0,45,08/06/2020,Owensland,Ruizfort,Sedan,Enterprise,3,412,Full-to-Full,Pending,4644
2,2,117,02/27/2020,Edwardview,Katiefort,Sedan,Hertz,1,433,Partial,Confirmed,4740
3,2,119,03/12/2020,Moranborough,Lake Stephanie,Hatchback,Budget,3,421,Full-to-Full,Confirmed,3744
4,2,153,11/05/2020,Port Kathrynstad,East Ronnieberg,Sedan,Enterprise,5,100,Prepaid,Cancelled,1694


In [None]:
df.columns

Index(['User_ID', 'travelCode', 'Check-in', 'pickupLocation',
       'dropoffLocation', 'carType', 'rentalAgency', 'rentalDuration',
       'Car_total_distance', 'fuelPolicy', 'Car_bookingStatus',
       'total_rent_price'],
      dtype='object')

In [None]:
from scipy import sparse

# Aggregate the total number of bookings per (User_ID, carType)
df["num_bookings"] = 1  # Count each booking as 1
user_car_agg = df.groupby(["User_ID", "carType"]).agg({"num_bookings": "sum"}).reset_index()

# Create a pivot table (User-CarType matrix)
user_car_matrix = user_car_agg.pivot(index="User_ID", columns="carType", values="num_bookings").fillna(0)

# Convert to a sparse matrix
sparse_user_car = sparse.csr_matrix(user_car_matrix.values)

print("User-Car Matrix Shape:", sparse_user_car.shape)


User-Car Matrix Shape: (1099, 4)


In [None]:

from implicit.evaluation import train_test_split

# Split data into training (80%) and testing (20%) sets while keeping memory low
train_matrix, test_matrix = train_test_split(sparse_user_car, train_percentage=0.8)

# Print the shapes to confirm
print("Train Matrix Shape:", train_matrix.shape)
print("Test Matrix Shape:", test_matrix.shape)


Train Matrix Shape: (1099, 4)
Test Matrix Shape: (1099, 4)


In [None]:
import implicit

# Convert the train matrix to the correct format for implicit ALS (item-user matrix)
train_csr = train_matrix.T  # Transpose to item-user format

# Initialize the ALS model with memory-efficient parameters
als_model = implicit.als.AlternatingLeastSquares(
    factors=50,   # Number of latent factors (adjust based on memory)
    iterations=15,  # Number of training iterations
    regularization=0.1,  # Prevents overfitting
    use_gpu=False  # Set to False to avoid GPU memory issues
)

# Train the model
als_model.fit(train_csr)

print("ALS model training completed successfully!")


  check_blas_config()
100%|██████████| 15/15 [00:00<00:00, 274.92it/s]

ALS model training completed successfully!





In [None]:
user_sample = 0  # Change this to any valid user index

try:
    recommended = als_model.recommend(user_sample, train_matrix[user_sample], N=4)
    print(f"Recommendations for User {user_sample}: {recommended}")
except Exception as e:
    print(f"Error for User {user_sample}: {e}")


Recommendations for User 0: (array([244, 827, 314, 106], dtype=int32), array([1.0001731, 1.0000683, 1.0000671, 1.0000056], dtype=float32))


In [None]:
nonzero_test_users = [user for user in range(test_matrix.shape[0]) if len(test_matrix[user].indices) > 0]
print(f"Users with test interactions: {len(nonzero_test_users)} / {test_matrix.shape[0]}")


Users with test interactions: 367 / 1099


In [None]:
skipped_users = 0
valid_users = 0
correct_predictions = 0
total_predictions = 0

for user in range(test_matrix.shape[0]):
    recommended_items, _ = als_model.recommend(user, train_matrix, N=4)
    recommended_items = set(recommended_items)  # Convert to set for fast lookup

    actual_items = set(test_matrix[user].indices)  # Get actual items from test data

    if not actual_items:
        skipped_users += 1
        continue  # Skip users with no interactions in test set

    valid_users += 1
    correct_predictions += len(recommended_items & actual_items)
    total_predictions += len(recommended_items)

precision_at_4 = correct_predictions / total_predictions if total_predictions > 0 else 0
print(f"✅ Precision@4: {precision_at_4:.4f}")
print(f"🔹 Skipped Users: {skipped_users}, Valid Users: {valid_users}")


ValueError: user_items must contain 1 row for every user in userids