In [14]:
#datacollection and cleaning
import requests
from bs4 import BeautifulSoup as Soup
import pandas as pd
import json
import os
from datetime import datetime, timedelta
locations = ["Delhi", "Mumbai", "Kolkata", "Chennai"]
# Function to scrape data from Booking.com
def scrape_bookingdotcom(destination, checkin_date, checkout_date):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
    }
    req = requests.get(
        f"https://www.booking.com/searchresults.en-gb.html?ss={destination}&checkin={checkin_date}&checkout={checkout_date}&offset==0",
        headers=headers).text
    soup = Soup(req, 'html.parser')
    ap = soup.find("ol", {"class": "a8b500abde"}).text

    df = pd.DataFrame(columns=["price", "location", "distance", "amenities", "ratings", "type"])
    for pages in range(0, int(ap[len(ap) - 1])):
        req = requests.get(
            f"https://www.booking.com/searchresults.en-gb.html?ss={destination}&checkin={checkin_date}&checkout={checkout_date}&offset=={pages * 25}",
            headers=headers).text
        soup = Soup(req, 'html.parser')
        apts = soup.find_all("div", {"class": "d20f4628d0"})
        rows = []

        for a in range(0, len(apts)):
            obj = {}

            try:
                obj["price"] = apts[a].find("span", {"class": "fcab3ed991 fbd1d3018c e729ed5ab6"}).text
            except:
                obj["price"] = None

            try:
                obj["distance"] = apts[a].find("span", {"class": "cb5ebe3ffb"}).text
            except:
                obj["distance"] = None

            try:
                ap1 = apts[a].find('a', href=True)
                link = ap1['href']
                req1 = requests.get(link, headers=headers).text
                soup2 = Soup(req1, 'html.parser')
                obj["amenities"] = soup2.find("div", {"class": "e5e0727360"}).text
            except:
                obj["amenities"] = None

            try:
                obj["ratings"] = apts[a].find("div", {"class": "b5cd09854e d10a6220b4"}).text
            except:
                obj["ratings"] = None

            try:
                obj["type"] = apts[a].find("span", {"class": "df597226dd"}).text
            except:
                obj["type"] = None

            try:
                obj["location"] = apts[a].find("span", {"class": "f4bd0794db b4273d69aa"}).text
            except:
                obj["location"] = None

            rows.append(obj)

        df = pd.concat([df, pd.DataFrame(rows)])

    # Data cleaning
    df["price"] = df["price"].str.replace(r"₹", "")
    df["price"] = df["price"].str.replace(r" ", "")
    df["price"] = df["price"].str.replace(r",", "")
    df["price"] = df["price"].str.strip()
    df['price'] = pd.to_numeric(df['price'])
    df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')
    df['ratings'] = df['ratings'].fillna(df['ratings'].mean())

    return df

# Load scraped locations from JSON file
scraped_locations_file = "scraped_locations.json"  # JSON file to store scraped locations
try:
    with open(scraped_locations_file, "r") as file:
        scraped_locations = set(json.load(file))
except FileNotFoundError:
    scraped_locations = set()

# Take user input for the location, check-in date, and check-out date
user_location = input("Enter a location: ").strip().capitalize()
current_date = datetime.now().strftime("%Y-%m-%d")
checkin_date = current_date
checkout_date = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")

# Check if location has already been scraped
if user_location.lower() in map(str.lower, scraped_locations):
    print(f"Skipping {user_location}. Already scraped.")
else:
    # Scrape data for the location
    df = scrape_bookingdotcom(user_location, checkin_date, checkout_date)

    # Update the set of scraped locations
    scraped_locations.add(user_location)

    # Save the data to a CSV file with current date in the filename
    csv_filename = f"{user_location}_{current_date}.csv"
    df.to_csv(csv_filename, index=False)

    print(f"Scraped and saved data for {user_location}.")
for location in locations:
    if location.lower() not in map(str.lower, scraped_locations):
        # Scrape data for the location
        df = scrape_bookingdotcom(location, checkin_date, checkout_date)

        # Update the set of scraped locations
        scraped_locations.add(location)

        # Save the data to a CSV file with current date in the filename
        csv_filename = f"{location}_{current_date}.csv"
        df.to_csv(csv_filename, index=False)

        print(f"Scraped and saved data for {location}.")

# Save the updated set of scraped locations to the JSON file
with open(scraped_locations_file, "w") as file:
    json.dump(list(scraped_locations), file)

# Combine all CSV files into a single dataframe
combined_df = pd.DataFrame()
for location in list(scraped_locations):
    csv_filename = f"{location}_{current_date}.csv"
    if os.path.isfile(csv_filename):
        df = pd.read_csv(csv_filename)
        combined_df = pd.concat([combined_df, df])

# Save the combined dataframe to a CSV file with current date in the filename
final_csv_filename = f"combined_{current_date}.csv"
combined_df.to_csv(final_csv_filename, index=False)

print("Scraping completed.")


Skipping Kottayam. Already scraped.
Scraping completed.


In [15]:
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import re
import joblib
import nltk
from nltk.corpus import stopwords

# Download the necessary NLTK modules
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Read the data into a Pandas DataFrame
df = pd.read_csv(final_csv_filename)

# Fill in any missing values in the amenities column with an empty string
df['amenities'].fillna('', inplace=True)

# Define a custom tokenizer that removes adverbs, adjectives, and verbs


def custom_tokenizer(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)  # Perform POS tagging
    filtered_tokens = [token for token, pos in pos_tags if pos not in [
        'RB', 'JJ', 'VB']]  # Filter out adverbs, adjectives, and verbs
    return filtered_tokens


# Create a TfidfVectorizer object and fit it to the amenities column
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, ngram_range=(
    1, 2), stop_words=set(stopwords.words('english')))
#amenities_tfidf = vectorizer.fit_transform(df['amenities'])

# Convert the TfidfVectorizer output to a DataFrame
amenities_df = pd.DataFrame()  
#amenities_tfidf.toarray(), columns=vectorizer.get_feature_names())

# Create a LabelEncoder object and fit it to the type and location columns
label_encoder = LabelEncoder()
df['type'] = label_encoder.fit_transform(df['type'].str.lower())
df['location'] = label_encoder.fit_transform(df['location'].str.lower())

# Save the LabelEncoder objects to disk
joblib.dump(label_encoder, 'label_encoder.joblib')

# Drop the amenities and distance columns from the DataFrame
df_processed = df.drop(columns=['amenities', 'distance'])

# Rename the columns in the DataFrame
df_processed.columns = [re.sub('[^A-Za-z]+', '', col)
                        for col in df_processed.columns]

# Write the processed DataFrame to a CSV file
df_processed.to_csv('preprocessed.csv', index=False)

# Print the processed DataFrame
print(df_processed)


     price  location   ratings  type
0     1512        10  5.200000    15
1     4500        10  7.900000    54
2     2700        10  6.400000     7
3     2650        10  8.400000    44
4     2765        10  7.800000     8
..     ...       ...       ...   ...
494   3150         9  8.500000     7
495   1606         9  5.500000     7
496   1532         9  5.000000    44
497   3040        15  7.834783    44
498   3325         9  7.500000    50

[499 rows x 4 columns]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jaida\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaida\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
#PCA
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
df = pd.read_csv('preprocessed.csv')
df=df.drop('ratings',axis=1)
X = df.drop('price', axis=1)  # Adjust the column name for your target variable
y = df['price']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)  # Set the number of components you want to retain
X_pca = pca.fit_transform(X_scaled)

# Create a new DataFrame with the reduced dimensions
df_pca = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])

# Concatenate the reduced dimensions with the target variable
df_pca_target = pd.concat([df_pca, y], axis=1)

# Save the PCA-reduced dataset to a new CSV file
df_pca_target.to_csv('pca_reduced.csv', index=False)


In [17]:
#model training
from pycaret.regression import *
import pandas as pd
data = pd.read_csv('pca_reduced.csv')

regression_setup = setup(
    data,
    target='price',
    normalize=True,
    train_size=0.8,
    session_id=123,
    log_experiment=True,
    experiment_name='your_experiment_name'
)

best_models = compare_models()

print(best_models)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,price
2,Target type,Regression
3,Original data shape,"(499, 3)"
4,Transformed data shape,"(499, 3)"
5,Transformed train set shape,"(399, 3)"
6,Transformed test set shape,"(100, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,208.7641,647137.1975,758.1749,0.9359,0.1709,0.0573,0.06
xgboost,Extreme Gradient Boosting,216.1014,783961.5051,788.8588,0.9253,0.1802,0.0569,0.136
rf,Random Forest Regressor,431.5277,991186.8812,925.8144,0.8993,0.2148,0.122,0.064
dt,Decision Tree Regressor,276.0589,1287128.7043,959.5767,0.8624,0.2074,0.0769,0.024
gbr,Gradient Boosting Regressor,904.1564,1728644.0587,1295.9219,0.8226,0.3272,0.2775,0.05
lightgbm,Light Gradient Boosting Machine,1038.7595,2163910.8359,1465.2258,0.7767,0.3601,0.317,0.026
knn,K Neighbors Regressor,1112.056,3097784.9471,1742.571,0.6825,0.3561,0.27,0.021
ada,AdaBoost Regressor,2148.635,5743909.3677,2391.6538,0.4,0.624,0.7572,0.025
br,Bayesian Ridge,2492.0137,9126338.1632,3008.4373,0.0733,0.6716,0.7747,0.018
ridge,Ridge Regression,2490.0811,9125944.434,3008.5821,0.0726,0.6703,0.7712,0.018


ExtraTreesRegressor(n_jobs=-1, random_state=123)


In [18]:
#user prediction
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import load_model, predict_model

# Load the label encoders
location_encoder = LabelEncoder()
room_type_encoder = LabelEncoder()
location_encoder = joblib.load('location_encoder.joblib')
room_type_encoder = joblib.load('type_encoder.joblib')

# User input
user_location = input("Enter the location: ").strip().lower()
user_room_type = input("Enter the room type: ").strip().lower()
user_amenities = input("Enter the amenities: ").strip()

# Encode user inputs
encoded_location = location_encoder.transform([user_location])[0]
encoded_room_type = room_type_encoder.transform([user_room_type])[0]


# Preprocess amenities text
def custom_tokenizer(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)  # Perform POS tagging
    filtered_tokens = [token for token, pos in pos_tags if pos not in ['RB', 'JJ', 'VB']]  # Filter out adverbs, adjectives, and verbs
    return filtered_tokens

# Apply the same preprocessing to user amenities
user_amenities_tfidf = vectorizer.transform([user_amenities])
user_amenities_df = pd.DataFrame(user_amenities_tfidf.toarray(), columns=vectorizer.get_feature_names())

# Create a dataframe with the user inputs
user_input_df = pd.DataFrame({
    'location': [encoded_location],
    'type': [encoded_room_type]
})
user_processed = user_input_df.copy()
user_processed.columns = [re.sub('[^A-Za-z]+', '', col) for col in user_processed.columns]
user_processed = pd.concat([user_processed, amenities_df.iloc[:1]], axis=1)
#user_processed.drop(columns=['amenities', 'distance'], axis=1, inplace=True)
user_scaled = scaler.transform(user_processed)
user_pca = pca.transform(user_scaled)
user_pca_df = pd.DataFrame(data=user_pca, columns=['PC1', 'PC2'])
predictions = predict_model(best_models, data=user_pca_df)
predicted_price = predictions['prediction_label'].iloc[0]
print(f"Predicted price for the given input: {predicted_price}")



FileNotFoundError: [Errno 2] No such file or directory: 'location_encoder.joblib'

In [None]:
combined_df

NameError: name 'combined_df' is not defined