In [1]:
import os
import glob

import pickle
import requests

import re
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import folium
from IPython.display import Image
from geopy.geocoders import Nominatim

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Specify the folder path where the files are located
folder_path = '/Users/mani/Desktop/cxapp/Space-Optimization/Data'

# Create an empty list to store file names
file_names = []

# Use a loop to iterate over all files in the folder
for file_name in glob.glob(os.path.join(folder_path, '*')):
    # Append the full file path to the list
    file_names.append(folder_path + '/' + os.path.basename(file_name))

# Return the list of file names
file_names

['/Users/mani/Desktop/cxapp/Space-Optimization/Data/Building.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/Floor.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/Room.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/Entitlement Group.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/RoomBookingLog.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/Neighborhood.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/Rule.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/Desk_Booking_Log.csv',
 '/Users/mani/Desktop/cxapp/Space-Optimization/Data/Desk.csv']

In [4]:
# Read the data into seperate pandas dataframes
Building_df = pd.read_csv(file_names[0])
Floor_df = pd.read_csv(file_names[1])
Room_df = pd.read_csv(file_names[2])
Desk_df = pd.read_csv(file_names[8])

Rule_df = pd.read_csv(file_names[6])
Neighborhood_df = pd.read_csv(file_names[5])
EntitlementGroup_df = pd.read_csv(file_names[3])

RoomBookingLog_df = pd.read_csv(file_names[4])
DeskBookingLog_df = pd.read_csv(file_names[7])

In [5]:
# Create a copy of the RoomBookingLog_df DataFrame and assign it to 'data'
data = RoomBookingLog_df.copy()

# Drop unnecessary columns from 'data'
data.drop(columns=['MEETING_NAME', 'BUILDING_NAME', 'ACCEPTED_COUNT', 'INVITED_COUNT', 'LAST_MODIFIED', 'TYPE', 'FLOOR_NAME', 'DATE_CREATED', 'ROOM_NAME'], inplace=True)

# data = data.loc[data['IS_CANCELLED'] == False]

# Convert 'START_TM' and 'END_TM' columns to datetime format
data['START_TM'] = pd.to_datetime(data['START_TM'])
data['END_TM'] = pd.to_datetime(data['END_TM'])


In [6]:
# Define a function to calculate duration
def calculate_duration(row):
    # If END_TM is on the next day, set it to midnight
    if row['END_TM'].date() > row['START_TM'].date():
        row['END_TM'] = row['END_TM'].replace(hour=23, minute=59, second=59)
    # Calculate duration in 30-minute intervals
    duration_seconds = (row['END_TM'] - row['START_TM']).total_seconds()
    return int(np.ceil(duration_seconds / (30 * 60)))

# Apply the function to calculate duration
data['Duration'] = data.apply(calculate_duration, axis=1)
data.drop(columns=['IS_CANCELLED', 'END_TM'], inplace=True)

data.head()

Unnamed: 0,ID,BUILDING_ID,ROOM_ID,FLOOR_ID,START_TM,MEETING_ID,Duration
0,RB958,B033,R1869,F108,2023-08-21 01:30:00+00:00,M019,1
1,RB847,B060,R835,F087,2023-08-21 02:01:00+00:00,M044,2
2,RB848,B033,R1869,F108,2023-08-21 02:30:00+00:00,M019,3
3,RB689,B042,R1215,F210,2023-08-15 18:00:00+00:00,M032,1
4,RB39717,B029,R1985,F183,2023-09-12 20:30:00+00:00,M016,1


In [7]:
# Filter the 'data' DataFrame to include rows with 'Duration' less than or equal to 12 hours
# data_cleaned = data.loc[data['Duration'] <= 24]

# Filter the 'data_cleaned' DataFrame to include rows with 'START_TM' years in [2021, 2022, 2023]
data_cleaned = data[data['START_TM'].dt.year.isin([2021, 2022, 2023])]


In [8]:
# handling outliers
# keeing the data with duration less than 24 hours (1 day i.e, 48 30-minute intervals)
data_cleaned = data_cleaned.loc[(data_cleaned['Duration']<=48) & (data_cleaned['Duration']>0)]

- Feature Engineering

In [9]:
data_cleaned['time_of_day'] = data_cleaned['START_TM'].dt.hour
data_cleaned['day_of_week'] = data_cleaned['START_TM'].dt.dayofweek
data_cleaned['month_of_year'] = data_cleaned['START_TM'].dt.month

In [10]:
data_cleaned.drop(columns=['START_TM', 'BUILDING_ID', 'FLOOR_ID', 'MEETING_ID'], inplace=True)

data = pd.merge(data_cleaned, Room_df, left_on='ROOM_ID', right_on='ID', how='left')
data.drop(columns=['ID_y', 'DATECREATED', 'MEETINGID', 'NAME', 'FLOORID', 'POINTS', 'BOOKABLE'], inplace=True)
data.rename(columns={'ID_x': 'ID'}, inplace=True)

data.columns

Index(['ID', 'ROOM_ID', 'Duration', 'time_of_day', 'day_of_week',
       'month_of_year', 'BUILDINGID', 'CAPACITY', 'LASTMODIFIED', 'AMENITIES'],
      dtype='object')

In [11]:
print(data.shape)
data = data[(data['Duration'] >= data['Duration'].quantile(.05)) & (data['Duration'] <= data['Duration'].quantile(.95))]
data = data[data['CAPACITY'] <= data['CAPACITY'].quantile(.95)]
data.dropna(inplace=True)
data.shape

(72992, 10)


(66203, 10)

- More Feature Engineering

In [12]:
# Encode categorical variables
le = LabelEncoder()

data['AMENITIES'] = le.fit_transform(data['AMENITIES'])
data['BUILDINGID'] = le.fit_transform(data['BUILDINGID'])
data['Total_Building_Meetings'] = data.groupby('BUILDINGID')['BUILDINGID'].transform('count')

- Feature Selection

In [13]:
# Split the data into training and testing sets
X = data[['AMENITIES', 'time_of_day', 'day_of_week','month_of_year', 'CAPACITY', 'BUILDINGID', 'Total_Building_Meetings']]
y = data['Duration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((52962, 7), (13241, 7), (52962,), (13241,))

In [14]:
# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

In [15]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.50
              precision    recall  f1-score   support

           1       0.55      0.61      0.58      5373
           2       0.50      0.55      0.53      5345
           3       0.25      0.14      0.18       976
           4       0.32      0.19      0.24       975
           5       0.22      0.12      0.15       153
           6       0.16      0.08      0.11       178
           7       0.00      0.00      0.00        62
           8       0.20      0.11      0.14        75
           9       0.08      0.04      0.05        26
          10       0.17      0.12      0.14        50
          11       0.46      0.21      0.29        28

    accuracy                           0.50     13241
   macro avg       0.27      0.20      0.22     13241
weighted avg       0.48      0.50      0.48     13241



In [16]:
#Saving the model
rf_model = model

# Pickle the model to a file
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [17]:
# Test the model by making a prediction for a sample input

# Choose a random index from X_test
random_index = random.randint(0, len(X_test) - 1)

# Get the data for the chosen index and convert it to a dictionary
sample_input = dict(X_test.iloc[random_index])

# Make a POST request to the /predict endpoint with the sample input
response = requests.post('http://localhost:8000/predict', json=sample_input)

# Print the response
print(response.json())

{'prediction': 1}
