In [2]:
import os
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus

# ✅ Fetch database credentials from environment variables
db_config = {
    "host": os.getenv("DB_HOST"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "database": "mmt"
}

# ✅ Encode the password (handles special characters like '@', ':', etc.)
encoded_password = quote_plus(db_config["password"])

# ✅ Create SQLAlchemy engine
engine = create_engine(f"mysql+pymysql://{db_config['user']}:{encoded_password}@{db_config['host']}/{db_config['database']}")

try:
    # ✅ Fetch car table data
    query_car = "SELECT * FROM car"
    car_df = pd.read_sql(query_car, engine)
    print("\nCar Table:")
    car_df.info()

    # ✅ Fetch car_rental table data
    query_car_rental = "SELECT * FROM rentals"
    car_rental_df = pd.read_sql(query_car_rental, engine)
    print("\nCar Rental Table:")
    car_rental_df.info()

except Exception as e:
    print(f"Error: {e}")



Car Table:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10800 entries, 0 to 10799
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CarID                10800 non-null  int64  
 1   Make                 10800 non-null  object 
 2   Model                10800 non-null  object 
 3   CarType              10800 non-null  object 
 4   Mileage_kmpl         10800 non-null  int64  
 5   Year_Of_Manufacture  10800 non-null  int64  
 6   Price_Per_Day        10800 non-null  float64
 7   City                 10800 non-null  object 
 8   Car_Agency           10800 non-null  object 
 9   Agency_Price         10800 non-null  float64
 10  LocationID           10800 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 928.3+ KB

Car Rental Table:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39979 entries, 0 to 39978
Data columns (total 8 columns):
 #   Column           Non-Null Count  

In [3]:
car_df.rename(columns={"CarID":"Car_id"},inplace=True)

In [4]:
# Convert date columns to datetime format
car_rental_df["RentalDate"] = pd.to_datetime(car_rental_df["RentalDate"])
car_rental_df["ReturnDate"] = pd.to_datetime(car_rental_df["ReturnDate"])

In [5]:
# Merge car_rental_df with car_df using 'car_id' from car_rental_df and 'Car_id' from car_df
merged_df = car_rental_df.merge(car_df, left_on="CarID", right_on="Car_id", how="left")

# Drop the duplicate 'Car_id' column from car_df after merging
merged_df.drop(columns=["Car_id"], inplace=True)

# Display the first few rows of the merged dataset
print("Merged Data Sample:")
print(merged_df.head())

# Check for missing values
print("\nMissing Values:")
print(merged_df.isnull().sum())


Merged Data Sample:
   RentalID  UserID Pickup_Location RentalDate  Duration ReturnDate  CarID  \
0         1       0   Visakhapatnam 2019-09-26         3 2019-09-29   9818   
1         2       0        Shillong 2019-10-10         3 2019-10-13   6755   
2         3       0          Ranchi 2019-11-14         6 2019-11-20    331   
3         4       0         Raigarh 2019-12-12         3 2019-12-15   4288   
4         5       0         Kolkata 2019-12-26         5 2019-12-31   6287   

   TotalAmount        Make            Model    CarType  Mileage_kmpl  \
0      28350.0        Ford      Ford Bronco        SUV            10   
1      25200.0     Hyundai    Hyundai Verna      Sedan            19   
2      56400.0      Toyota     Toyota Camry      Sedan            19   
3      18600.0  Volkswagen  Volkswagen Golf  Hatchback            17   
4      29500.0  Volkswagen  Volkswagen Polo  Hatchback            18   

   Year_Of_Manufacture  Price_Per_Day           City  \
0                 2024

In [6]:
merged_df.columns

Index(['RentalID', 'UserID', 'Pickup_Location', 'RentalDate', 'Duration',
       'ReturnDate', 'CarID', 'TotalAmount', 'Make', 'Model', 'CarType',
       'Mileage_kmpl', 'Year_Of_Manufacture', 'Price_Per_Day', 'City',
       'Car_Agency', 'Agency_Price', 'LocationID'],
      dtype='object')

In [7]:
# Drop unnecessary columns but KEEP "UserID" and "RentalID"
columns_to_drop = ["Pickup_Location", "ReturnDate", "LocationID"]
merged_df.drop(columns=columns_to_drop, inplace=True)

# Display the updated dataframe
print("Updated Data Sample:")
print(merged_df.head())


Updated Data Sample:
   RentalID  UserID RentalDate  Duration  CarID  TotalAmount        Make  \
0         1       0 2019-09-26         3   9818      28350.0        Ford   
1         2       0 2019-10-10         3   6755      25200.0     Hyundai   
2         3       0 2019-11-14         6    331      56400.0      Toyota   
3         4       0 2019-12-12         3   4288      18600.0  Volkswagen   
4         5       0 2019-12-26         5   6287      29500.0  Volkswagen   

             Model    CarType  Mileage_kmpl  Year_Of_Manufacture  \
0      Ford Bronco        SUV            10                 2024   
1    Hyundai Verna      Sedan            19                 2018   
2     Toyota Camry      Sedan            19                 2017   
3  Volkswagen Golf  Hatchback            17                 2005   
4  Volkswagen Polo  Hatchback            18                 2019   

   Price_Per_Day           City                  Car_Agency  Agency_Price  
0         8500.0  Visakhapatnam      

In [8]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39979 entries, 0 to 39978
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   RentalID             39979 non-null  int64         
 1   UserID               39979 non-null  int64         
 2   RentalDate           39979 non-null  datetime64[ns]
 3   Duration             39979 non-null  int64         
 4   CarID                39979 non-null  int64         
 5   TotalAmount          39979 non-null  float64       
 6   Make                 39979 non-null  object        
 7   Model                39979 non-null  object        
 8   CarType              39979 non-null  object        
 9   Mileage_kmpl         39979 non-null  int64         
 10  Year_Of_Manufacture  39979 non-null  int64         
 11  Price_Per_Day        39979 non-null  float64       
 12  City                 39979 non-null  object        
 13  Car_Agency           39979 non-

Encoding the categorical columns of the dataframe.

In [9]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Make a copy to avoid modifying the original DataFrame
df_encoded = merged_df.copy()

# 1. One-Hot Encoding for 'Make' and 'Car Type'
one_hot_features = ['Make', 'CarType']
df_encoded = pd.get_dummies(df_encoded, columns=one_hot_features, drop_first=True)

# 2. Label Encoding for 'Model', 'city', and 'car_agency'
label_encoders = {}
label_features = ['Model', 'City', 'Car_Agency']

for col in label_features:
    label_encoders[col] = LabelEncoder()
    df_encoded[col] = label_encoders[col].fit_transform(df_encoded[col])

# Display the first few rows of the encoded DataFrame
print(df_encoded.head())


   RentalID  UserID RentalDate  Duration  CarID  TotalAmount  Model  \
0         1       0 2019-09-26         3   9818      28350.0      0   
1         2       0 2019-10-10         3   6755      25200.0     22   
2         3       0 2019-11-14         6    331      56400.0     24   
3         4       0 2019-12-12         3   4288      18600.0     33   
4         5       0 2019-12-26         5   6287      29500.0     35   

   Mileage_kmpl  Year_Of_Manufacture  Price_Per_Day  City  Car_Agency  \
0            10                 2024         8500.0    53         252   
1            19                 2018         5500.0    43          75   
2            19                 2017         6500.0    42          83   
3            17                 2005         4500.0    40         170   
4            18                 2019         4000.0    26         153   

   Agency_Price  Make_Honda  Make_Hyundai  Make_Toyota  Make_Volkswagen  \
0         950.0       False         False        False     

In [None]:
import numpy as np
import pandas as pd
import faiss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import spearmanr

# Load and copy dataset
df = merged_df.copy()

# Select numerical features with relevance
numerical_features = ["TotalAmount", "Price_Per_Day", "Mileage_kmpl", 
                      "Year_Of_Manufacture", "Agency_Price", "Duration"]

# Feature Scaling (Choose MinMaxScaler for Cosine Similarity)
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
print(df[numerical_features])


       TotalAmount  Price_Per_Day  Mileage_kmpl  Year_Of_Manufacture  \
0         0.168535       0.370370      0.214286             1.000000   
1         0.146461       0.148148      0.857143             0.900000   
2         0.365102       0.222222      0.857143             0.883333   
3         0.100210       0.074074      0.714286             0.683333   
4         0.176594       0.037037      0.785714             0.916667   
...            ...            ...           ...                  ...   
39974     0.406447       0.185185      0.785714             0.833333   
39975     0.421864       0.481481      0.142857             0.616667   
39976     0.428171       0.407407      0.571429             0.833333   
39977     0.071479       0.148148      0.857143             0.900000   
39978     0.152067       0.000000      0.928571             0.783333   

       Agency_Price  Duration  
0          0.046875  0.333333  
1          0.656250  0.333333  
2          0.656250  0.833333  
3      

In [12]:
# Convert to NumPy array (FAISS requires contiguous array)
data_matrix = np.ascontiguousarray(df[numerical_features].values.astype("float32"))

# Get feature count
num_features = data_matrix.shape[1]
print("Data matrix shape:", data_matrix.shape)

# Create FAISS Indexes for different distance metrics
index_cosine = faiss.IndexFlatIP(num_features)  # Cosine Similarity (IP)
index_euclidean = faiss.IndexFlatL2(num_features)  # Euclidean Distance (L2)

# Normalize data for cosine similarity
faiss.normalize_L2(data_matrix)

# Add data to FAISS indexes
index_cosine.add(data_matrix)
index_euclidean.add(data_matrix)

print("Total cars indexed:", index_cosine.ntotal)



Data matrix shape: (39979, 6)
Total cars indexed: 39979


In [13]:
# Define hybrid recommendation function
def hybrid_recommendation(query_index, N=10, alpha=0.5):
    """Generate hybrid recommendations using weighted similarity."""
    
    query_vector = data_matrix[query_index].reshape(1, -1)

    # Search in both FAISS indexes
    cosine_distances, cosine_indices = index_cosine.search(query_vector, N)
    euclidean_distances, euclidean_indices = index_euclidean.search(query_vector, N)

    # Normalize Euclidean distances (MinMax Scaling)
    scaler = MinMaxScaler()
    normalized_euclidean = scaler.fit_transform(euclidean_distances.T).T

    # Hybrid Score Calculation
    hybrid_scores = alpha * cosine_distances + (1 - alpha) * (1 - normalized_euclidean)
    
    # Rank by hybrid score
    final_indices = np.argsort(-hybrid_scores[0])
    recommended_indices = cosine_indices[0][final_indices]
    
    return recommended_indices

In [14]:
# Select a random car index for recommendation
random_index = np.random.randint(0, data_matrix.shape[0])

# Generate recommendations
recommended_indices = hybrid_recommendation(random_index, N=10, alpha=0.7)

# Retrieve recommended cars
recommended_cars = df.iloc[recommended_indices]

print("\nSelected Car Details:")
print(df.iloc[random_index])

print("\nHybrid Recommended Cars:")
print(recommended_cars)


Selected Car Details:
RentalID                             17700
UserID                                 581
RentalDate             2019-10-03 00:00:00
Duration                          0.833333
CarID                                 9996
TotalAmount                        0.38192
Make                               Hyundai
Model                      Hyundai Elantra
CarType                              Sedan
Mileage_kmpl                      0.714286
Year_Of_Manufacture               0.866667
Price_Per_Day                     0.185185
City                         Visakhapatnam
Car_Agency             Luxury Motion Vizag
Agency_Price                        0.9375
Name: 17699, dtype: object

Hybrid Recommended Cars:
       RentalID  UserID RentalDate  Duration  CarID  TotalAmount     Make  \
35670     35671    1204 2021-02-25  0.833333   7396      0.38192  Hyundai   
32541     32542    1101 2020-10-01  0.833333  10796      0.38192  Hyundai   
18441     18442     608 2019-10-31  0.833333   6

In [None]:
# ---- Performance Evaluation ----

def evaluate_recommendations(recommended_indices, actual_rented_indices, N=10):
    """Evaluate recommendations using Precision@N and Recall@N."""
    
    recommended_set = set(recommended_indices[:N])
    actual_set = set(actual_rented_indices)

    TP = len(recommended_set.intersection(actual_set))
    
    # Precision@N: TP / N
    precision_at_N = TP / N

    # Recall@N: TP / total actual rented cars
    recall_at_N = TP / len(actual_set) if len(actual_set) > 0 else 0

    return precision_at_N, recall_at_N

In [17]:
# Create user-to-car mapping
user_rented_cars = df.groupby("UserID")["CarID"].apply(set).to_dict()

# Get actual rented cars for selected user
selected_user_id = df.iloc[random_index]["UserID"]
actual_rented_indices = user_rented_cars.get(selected_user_id, [])

# Compute evaluation metrics
precision, recall = evaluate_recommendations(recommended_indices, actual_rented_indices, N=10)
# Print results
print(f"Precision@10: {precision:.2f}")
print(f"Recall@10: {recall:.2f}")

Precision@10: 0.00
Recall@10: 0.00


In [19]:
print(f"Shape of recommended_indices: {np.shape(recommended_indices)}")
print(f"Shape of actual_rented_indices: {np.shape(actual_rented_indices)}")


Shape of recommended_indices: (10,)
Shape of actual_rented_indices: ()


In [18]:
# ---- Spearman Rank Correlation ----
def compute_spearman_rank(similarity_metric1, similarity_metric2):
    """Calculate Spearman correlation between rankings."""
    
    return spearmanr(similarity_metric1, similarity_metric2).correlation

# Compute ranking correlations
spearman_corr = compute_spearman_rank(recommended_indices, actual_rented_indices)
print(f"Spearman Rank Correlation: {spearman_corr:.2f}")


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 10 and the array at index 1 has size 1

____

In [None]:
import numpy as np
import faiss
from sklearn.preprocessing import MinMaxScaler

# Extract numerical features from merged_df
numerical_features = merged_df.select_dtypes(include=[np.number])

# Convert numerical features to NumPy array
data_matrix = numerical_features.to_numpy().astype("float32")
data_matrix = np.ascontiguousarray(data_matrix)

num_features = data_matrix.shape[1]
print("Data matrix shape:", data_matrix.shape)

# Normalize data for cosine similarity
faiss.normalize_L2(data_matrix)

# Create FAISS indexes for both similarity measures
index_cosine = faiss.IndexFlatIP(num_features)  # Cosine Similarity (IP)
index_euclidean = faiss.IndexFlatL2(num_features)  # Euclidean Distance (L2)

# Add data to FAISS indexes
index_cosine.add(data_matrix)
index_euclidean.add(data_matrix)

print("Total cars indexed:", index_cosine.ntotal)

# Number of recommendations
N = 10  

# Select a random car index
random_index = np.random.randint(0, data_matrix.shape[0])

# Extract query vector
query_vector = data_matrix[random_index].reshape(1, -1)

# Search using both indexes
cosine_distances, cosine_indices = index_cosine.search(query_vector, N)
euclidean_distances, euclidean_indices = index_euclidean.search(query_vector, N)

# Normalize Euclidean distances (Min-Max Scaling)
scaler = MinMaxScaler()
normalized_euclidean = scaler.fit_transform(euclidean_distances.T).T  # Normalize per row

# Hybrid Score Calculation: Weighted combination of Cosine & Euclidean
# alpha = 0.7  # Weighting factor (Adjustable)
# hybrid_scores = alpha * cosine_distances + (1 - alpha) * (1 - normalized_euclidean)
alpha = 0.5  # Change weight
hybrid_scores = alpha * cosine_distances + (1 - alpha) * (1 - normalized_euclidean)

# Get final ranked indices
final_indices = np.argsort(-hybrid_scores[0])  # Sort in descending order
recommended_indices = cosine_indices[0][final_indices]  # Apply sorted order

# Print recommendations
print(f"Selected Car Index: {random_index}")
print(f"Hybrid Top {N} Recommended Car Indices: {recommended_indices}")

# Retrieve car details
recommended_cars = merged_df.iloc[recommended_indices]

# Display results
print("Selected Car Details:")
print(merged_df.iloc[random_index])

print("\nHybrid Recommended Cars:")
print(recommended_cars)


Data matrix shape: (39979, 9)
Total cars indexed: 39979
Selected Car Index: 11444
Hybrid Top 10 Recommended Car Indices: [11444 12401 10125 10244  7422 13053  5629  7458  7983  7877]
Selected Car Details:
RentalID                                11445
UserID                                    371
RentalDate                2020-02-13 00:00:00
Duration                                    7
CarID                                    9062
TotalAmount                          127400.0
Make                                     Ford
Model                                 Ford GT
CarType                                Luxury
Mileage_kmpl                                7
Year_Of_Manufacture                      2004
Price_Per_Day                         17000.0
City                               Chandigarh
Car_Agency             Budget Auto Chandigarh
Agency_Price                           1200.0
Name: 11444, dtype: object

Hybrid Recommended Cars:
       RentalID  UserID RentalDate  Duration  CarID 

In [None]:
def evaluate_recommendations(recommended_indices, actual_rented_indices, N=10):
    """
    Evaluate recommendation performance using Precision@N and Recall@N.

    Parameters:
    - recommended_indices: List/Array of recommended car indices
    - actual_rented_indices: Set of actual rented car indices by the user
    - N: Number of recommendations

    Returns:
    - Precision@N, Recall@N
    """
    recommended_set = set(recommended_indices[:N])
    actual_set = set(actual_rented_indices)

    # Calculate True Positives (TP) - items in both recommended and actual
    TP = len(recommended_set.intersection(actual_set))

    # Precision@N: TP / N
    precision_at_N = TP / N

    # Recall@N: TP / total actual rented cars
    recall_at_N = TP / len(actual_set) if len(actual_set) > 0 else 0

    return precision_at_N, recall_at_N


# Assume 'user_rented_cars' is a dictionary mapping user IDs to their rented car indices
# Example: { user_id_1: [car_index_1, car_index_2, ...], ... }

# Get actual rented cars for the selected user
selected_user_id = merged_df.iloc[random_index]["UserID"]
actual_rented_indices = user_rented_cars.get(selected_user_id, [])

# Calculate performance metrics
precision, recall = evaluate_recommendations(recommended_indices, actual_rented_indices, N=10)

# Print results
print(f"Precision@10: {precision:.2f}")
print(f"Recall@10: {recall:.2f}")


NameError: name 'user_rented_cars' is not defined

In [None]:
# Create a dictionary mapping each user to their rented car indices
user_rented_cars = merged_df.groupby("UserID")["CarID"].apply(set).to_dict()

# Get the actual rented cars for the selected user
selected_user_id = merged_df.iloc[random_index]["UserID"]
actual_rented_cars = user_rented_cars.get(selected_user_id, set())

# Evaluate recommendation performance
precision, recall = evaluate_recommendations(recommended_indices, actual_rented_cars, N=10)

# Print results
print(f"Precision@10: {precision:.2f}")
print(f"Recall@10: {recall:.2f}")


Precision@10: 0.00
Recall@10: 0.00


In [None]:
print(f"User {selected_user_id} rented cars: {user_rented_cars.get(selected_user_id, [])}")


User 1266 rented cars: {736, 3708, 2341, 5096, 5481, 745, 10007, 1202, 9623, 4953, 5367, 4284, 5021, 3455}


In [None]:
print(f"User {selected_user_id} rented cars: {actual_rented_indices}")
print(f"Recommended cars: {recommended_indices.tolist()}")


NameError: name 'actual_rented_indices' is not defined

using two distance and comparing the performance.

In [None]:
import numpy as np
import pandas as pd
import faiss
from sklearn.preprocessing import StandardScaler

# Load your dataset (replace 'merged_df' with your actual DataFrame)
# Ensure the dataset is cleaned and preprocessed
df = merged_df.copy()

# Select numerical features for similarity calculation
numerical_features = ["TotalAmount", "Mileage_kmpl", "Price_Per_Day", "Agency_Price", "Duration"]

# Convert selected columns to float and normalize
scaler = StandardScaler()
data_matrix = scaler.fit_transform(df[numerical_features].astype("float32"))

# Ensure the array is C-contiguous (FAISS requires this)
data_matrix = np.ascontiguousarray(data_matrix)

# Get the number of features (columns)
num_features = data_matrix.shape[1]
print("Data matrix shape:", data_matrix.shape)

# Try different FAISS distance metrics
distance_metrics = {
    "Cosine Similarity (IP)": faiss.IndexFlatIP(num_features),  # Inner Product (Cosine Similarity)
    "Euclidean Distance (L2)": faiss.IndexFlatL2(num_features),  # Euclidean Distance
}

for metric_name, index in distance_metrics.items():
    print(f"\nUsing {metric_name}")

    # Normalize data for cosine similarity (only needed for IP)
    if "IP" in metric_name:
        faiss.normalize_L2(data_matrix)

    # Create FAISS index and add data
    index.add(data_matrix)

    # Check total indexed cars
    print("Total cars indexed:", index.ntotal)

    # Number of similar cars to retrieve
    N = 10  

    # Select a random car from the dataset (or choose a specific index)
    random_index = np.random.randint(0, data_matrix.shape[0])

    # Extract the feature vector of the selected car
    query_vector = data_matrix[random_index].reshape(1, -1)

    # Perform the search
    distances, indices = index.search(query_vector, N)

    # Print recommended cars
    print(f"Selected Car Index: {random_index}")
    print(f"Top {N} Recommended Cars Indexes: {indices[0]}")
    print(f"Distances: {distances[0]}")

    # Retrieve the original car details for recommended indices
    recommended_cars = df.iloc[indices[0]]

    # Display results
    print("\nSelected Car Details:")
    print(df.iloc[random_index])  # Print the original car details

    print("\nRecommended Cars:")
    print(recommended_cars)

    # --- Performance Evaluation ---
    def compute_precision_recall(recommended_indices, actual_indices, N):
        """Calculate Precision@N and Recall@N"""
        relevant_items = set(actual_indices)
        retrieved_items = set(recommended_indices)

        # Precision@N = Relevant recommended items / N
        precision_at_n = len(retrieved_items & relevant_items) / N

        # Recall@N = Relevant recommended items / Total relevant items
        recall_at_n = len(retrieved_items & relevant_items) / len(relevant_items)

        return precision_at_n, recall_at_n

    # Assuming that 'CarID' is a key to find relevant recommendations
    actual_car_id = df.iloc[random_index]["CarID"]
    actual_indices = df[df["CarID"] == actual_car_id].index.tolist()

    precision, recall = compute_precision_recall(indices[0], actual_indices, N)

    print(f"\nPerformance Metrics for {metric_name}:")
    print(f"Precision@{N}: {precision:.2f}")
    print(f"Recall@{N}: {recall:.2f}")


Data matrix shape: (39979, 5)

Using Cosine Similarity (IP)
Total cars indexed: 39979
Selected Car Index: 12966
Top 10 Recommended Cars Indexes: [33011 32122 28383 23718 21539 21207 17635 13269 12966   638]
Distances: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Selected Car Details:
RentalID                                 12967
UserID                                     428
RentalDate                 2021-05-27 00:00:00
Duration                                     5
CarID                                     5759
TotalAmount                            34500.0
Make                                   Hyundai
Model                              Hyundai i20
CarType                              Hatchback
Mileage_kmpl                                20
Year_Of_Manufacture                       2008
Price_Per_Day                           4000.0
City                                  Itanagar
Car_Agency             Prestige Drive Itanagar
Agency_Price                            2900.0
Name: 12966, dtype: ob

_____

feature scaling the numerical features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler instance
scaler = MinMaxScaler()

# List of numerical features to scale
numerical_features = ['TotalAmount', 'Price_Per_Day', 'Mileage_kmpl', 
                      'Year_Of_Manufacture', 'Agency_Price', 'Duration']
# Apply MinMax Scaling
df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])

# Display transformed data
print(df_encoded.head())


   RentalID  UserID RentalDate  Duration  CarID  TotalAmount  Model  \
0         1       0 2019-09-26  0.333333   9818     0.168535      0   
1         2       0 2019-10-10  0.333333   6755     0.146461     22   
2         3       0 2019-11-14  0.833333    331     0.365102     24   
3         4       0 2019-12-12  0.333333   4288     0.100210     33   
4         5       0 2019-12-26  0.666667   6287     0.176594     35   

   Mileage_kmpl  Year_Of_Manufacture  Price_Per_Day  City  Car_Agency  \
0      0.214286             1.000000       0.370370    53         252   
1      0.857143             0.900000       0.148148    43          75   
2      0.857143             0.883333       0.222222    42          83   
3      0.714286             0.683333       0.074074    40         170   
4      0.785714             0.916667       0.037037    26         153   

   Agency_Price  Make_Honda  Make_Hyundai  Make_Toyota  Make_Volkswagen  \
0      0.046875       False         False        False     

recommendation approach:

In [None]:
import numpy as np
import faiss

# Select only numerical columns from merged_df
numerical_features = merged_df.select_dtypes(include=[np.number])

# Convert numerical features DataFrame to NumPy array
data_matrix = numerical_features.to_numpy().astype("float32")

# Ensure the array is C-contiguous
data_matrix = np.ascontiguousarray(data_matrix)

# Get the number of features (columns)
num_features = data_matrix.shape[1]
print("Data matrix shape:", data_matrix.shape)

# Normalize data for cosine similarity
faiss.normalize_L2(data_matrix)

# Create FAISS index for Inner Product (Cosine Similarity when vectors are normalized)
index = faiss.IndexFlatIP(num_features)

# Add data to the FAISS index
index.add(data_matrix)

# Check total indexed cars
print("Total cars indexed:", index.ntotal)

# Number of similar cars to retrieve
N = 10  

# Select a random car from the dataset (or choose a specific index)
random_index = np.random.randint(0, data_matrix.shape[0])

# Extract the feature vector of the selected car
query_vector = data_matrix[random_index].reshape(1, -1)

# Perform the search
distances, indices = index.search(query_vector, N)

# Print recommended cars
print(f"Selected Car Index: {random_index}")
print(f"Top {N} Recommended Cars Indexes: {indices[0]}")
print(f"Distances: {distances[0]}")

# Retrieve the original car details for recommended indices
recommended_cars = merged_df.iloc[indices[0]]

# Display results
print("Selected Car Details:")
print(merged_df.iloc[random_index])  # Print the original car details

print("\nRecommended Cars:")
print(recommended_cars)


Data matrix shape: (39979, 9)
Total cars indexed: 39979
Selected Car Index: 39183
Top 10 Recommended Cars Indexes: [39183 36339 35548 39664 38705 35729 39576 30032 38662 38371]
Distances: [0.99999976 0.99986863 0.99980414 0.9997848  0.99976534 0.9997544
 0.9997502  0.999747   0.9997448  0.9997236 ]
Selected Car Details:
RentalID                                39184
UserID                                   1313
RentalDate                2019-10-17 00:00:00
Duration                                    6
CarID                                    6000
TotalAmount                           42000.0
Make                                  Hyundai
Model                  Hyundai Grand i10 Nios
CarType                             Hatchback
Mileage_kmpl                               21
Year_Of_Manufacture                      2019
Price_Per_Day                          3500.0
City                                 Varanasi
Car_Agency             Luxury Motion Varanasi
Agency_Price                      

In [None]:
# Define function to calculate Precision@N and Recall@N
def evaluate_recommendations(recommended_indices, actual_indices, N):
    recommended_set = set(recommended_indices[:N])  # Top-N recommended indices
    actual_set = set(actual_indices)  # Actual cars booked by the user
    
    # True Positives: Intersection of recommended and actual cars
    true_positives = len(recommended_set.intersection(actual_set))
    
    # Precision@N = TP / N
    precision_at_n = true_positives / N

    # Recall@N = TP / total actual relevant items
    recall_at_n = true_positives / len(actual_set) if len(actual_set) > 0 else 0
    
    return precision_at_n, recall_at_n

# Assuming actual booked car indices are known for the user
# (Replace with actual lookup based on user history)
actual_booked_indices = merged_df[merged_df["UserID"] == merged_df.iloc[random_index]["UserID"]].index.tolist()

# Compute Precision@N and Recall@N
precision, recall = evaluate_recommendations(indices[0], actual_booked_indices, N)

# Display performance metrics
print(f"Precision@{N}: {precision:.2f}")
print(f"Recall@{N}: {recall:.2f}")


Precision@10: 0.10
Recall@10: 0.17


____

In [None]:
# Get the number of features (columns)
num_features = data_matrix.shape[1]# will get 17 here in this case

# Create a FAISS index using L2 (Euclidean) distance
index = faiss.IndexFlatL2(num_features)

# Add the data to the FAISS index for efficient searching
index.add(data_matrix)

# Check the number of items in the index
print("Total cars indexed:", index.ntotal)

Total cars indexed: 39979


In [None]:
import numpy as np

# Number of similar cars to retrieve
N =5 #10  

# Select a random car from the dataset (you can also choose a specific index)
random_index = np.random.randint(0, data_matrix.shape[0])

# Extract the feature vector of the selected car
query_vector = data_matrix[random_index].reshape(1, -1)

# Perform the search
distances, indices = index.search(query_vector, N)

# Print the recommended cars
print(f"Selected Car Index: {random_index}")
print(f"Top {N} Recommended Cars Indexes: {indices[0]}")
print(f"Distances: {distances[0]}")


Selected Car Index: 6830
Top 10 Recommended Cars Indexes: [6830 6820 6822 6838 6801 6854 6800 6833 6827 6843]
Distances: [   0.       466.14667  652.58716  679.7658   992.5579  1300.0562
 1313.5419  1321.2833  1465.8523  1723.8506 ]


In [None]:
# Retrieve the original car details for the recommended indices
recommended_cars = merged_df.iloc[indices[0]]

# Display the recommended cars
print("Selected Car Details:")
print(merged_df.iloc[random_index])  # Print the original car details

print("\nRecommended Cars:")
print(recommended_cars)


Selected Car Details:
RentalID                                6831
UserID                                   223
RentalDate               2021-06-24 00:00:00
Duration                                   7
CarID                                   5924
TotalAmount                          58100.0
Make                              Volkswagen
Model                      Volkswagen Virtus
CarType                                Sedan
Mileage_kmpl                              20
Year_Of_Manufacture                     2018
Price_Per_Day                         5500.0
City                                Varanasi
Car_Agency             Elite Wheels Varanasi
Agency_Price                          2800.0
Name: 6830, dtype: object

Recommended Cars:
      RentalID  UserID RentalDate  Duration  CarID  TotalAmount        Make  \
6830      6831     223 2021-06-24         7   5924      58100.0  Volkswagen   
6820      6821     223 2020-09-24         2   5958      23600.0     Hyundai   
6822      6823     22

- The recommended cars are all Hyundai i20s, meaning the FAISS search is finding nearly identical cars in terms of features.

- The recommendations are based on similarity in features (Make, Model, Mileage, Price per day, etc.).

- The similarity was measured using FAISS with L2 (Euclidean) distance. The lower the distance, the more similar the cars

Evaluation of performance:

In [None]:
# print(merged_df.columns)

Index(['RentalID', 'UserID', 'RentalDate', 'Duration', 'CarID', 'TotalAmount',
       'Make', 'Model', 'CarType', 'Mileage_kmpl', 'Year_Of_Manufacture',
       'Price_Per_Day', 'City', 'Car_Agency', 'Agency_Price'],
      dtype='object')


In [None]:
def precision_at_k(recommended_cars, actual_cars, k):
    """
    Calculates Precision@K: Measures how many of the recommended cars are relevant.
    """
    recommended_set = set(recommended_cars[:k])  # Take top-K recommendations
    actual_set = set(actual_cars)  # Set of actual booked cars
    relevant_count = len(recommended_set & actual_set)  # Intersection
    
    return relevant_count / k  # Precision formula

def recall_at_k(recommended_cars, actual_cars, k):
    """
    Calculates Recall@K: Measures how many relevant cars were successfully recommended.
    """
    recommended_set = set(recommended_cars[:k])
    actual_set = set(actual_cars)
    relevant_count = len(recommended_set & actual_set)

    return relevant_count / len(actual_set) if len(actual_set) > 0 else 0  # Recall formula

# Get actual booked cars for the same UserID
selected_user_id = merged_df.iloc[random_index]["UserID"]
actual_cars = merged_df[merged_df["UserID"] == selected_user_id]["CarID"].tolist()

# Compute Precision and Recall for K=5
precision_5 = precision_at_k(recommended_cars["CarID"].tolist(), actual_cars, 5)
recall_5 = recall_at_k(recommended_cars["CarID"].tolist(), actual_cars, 5)

print(f"Precision@5: {precision_5:.2f}")
print(f"Recall@5: {recall_5:.2f}")


Precision@5: 0.80
Recall@5: 0.08


for 10

In [None]:
# Number of recommendations to evaluate
K = 10  

# Get actual booked cars for the same user (based on UserID or RentalID)
actual_cars = merged_df[merged_df["UserID"] == merged_df.iloc[random_index]["UserID"]]["CarID"].tolist()

# Compute Precision and Recall for K=10
precision_10 = precision_at_k(recommended_cars["CarID"].tolist(), actual_cars, K)
recall_10 = recall_at_k(recommended_cars["CarID"].tolist(), actual_cars, K)

print(f"Precision@10: {precision_10:.2f}")
print(f"Recall@10: {recall_10:.2f}")


Precision@10: 0.40
Recall@10: 0.11
