In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# Read the three raw CSV exports from the Kaggle input directory.
detailed_property = pd.read_csv(
    "/kaggle/input/tensorlabs-2025-internships/Detailed_Property.csv"
)
property_reviews = pd.read_csv(
    "/kaggle/input/tensorlabs-2025-internships/Property_Reviews.csv"
)
property_byPlace = pd.read_csv(
    "/kaggle/input/tensorlabs-2025-internships/property_by_place.csv"
)


# ****Load the data from files and convert them into dataframes****

In [None]:
# Wrap each loaded table in its own DataFrame object; the cleaning cells
# below work on these three names.
detailedProperty_df = pd.DataFrame(data=detailed_property)
propertyReviews_df = pd.DataFrame(data=property_reviews)
propertyByPlace_df = pd.DataFrame(data=property_byPlace)

# Show (rows, columns) of every table, one per line.
print(
    detailedProperty_df.shape,
    propertyReviews_df.shape,
    propertyByPlace_df.shape,
    sep="\n",
)

# ****Check whether there are null values in the dataframe****

In [None]:
# Number of missing cells per column, largest first, for each table in turn.
print(
    detailedProperty_df.isna().sum().sort_values(ascending=False),
    propertyReviews_df.isna().sum().sort_values(ascending=False),
    propertyByPlace_df.isna().sum().sort_values(ascending=False),
    sep="\n",
)


# ****Express the missing values of each column as a percentage so that columns can be removed on the basis of a threshold****

In [None]:
# Share of missing cells per column, expressed as a percentage (0-100):
#   missing_prctg_dr  -> detailed property dataframe
#   missing_prctg_pr  -> property reviews dataframe
#   missing_prctg_pbp -> property by place dataframe
missing_prctg_dr = detailedProperty_df.isna().mean().mul(100)
missing_prctg_pr = propertyReviews_df.isna().mean().mul(100)
missing_prctg_pbp = propertyByPlace_df.isna().mean().mul(100)

print("Missing Percentage for the detailed price dataframe", missing_prctg_dr)
print("Missing Percentage for the property reviews dataframe", missing_prctg_pr)
print("Missing Percentage for the property by place dataframe", missing_prctg_pbp)

# ****Columns with more than 60% missing values are removed, because they cannot contribute to model development and predictions****

In [None]:
# Labels of the columns whose missing percentage is strictly above 60,
# computed separately for each dataframe.
cols_to_drop_dp = missing_prctg_dr.loc[missing_prctg_dr.gt(60)].index    # detailed property
cols_to_drop_pr = missing_prctg_pr.loc[missing_prctg_pr.gt(60)].index    # property reviews
cols_to_drop_pbp = missing_prctg_pbp.loc[missing_prctg_pbp.gt(60)].index  # property by place

print(cols_to_drop_dp, cols_to_drop_pr, cols_to_drop_pbp, sep="\n")

# **The cell below removes every column whose share of missing values exceeds the threshold (this includes completely empty columns)**

In [None]:
# Remove the sparse columns selected above, then report the new shapes.
detailedProperty_df = detailedProperty_df.drop(cols_to_drop_dp, axis="columns")
propertyReviews_df = propertyReviews_df.drop(cols_to_drop_pr, axis="columns")
propertyByPlace_df = propertyByPlace_df.drop(cols_to_drop_pbp, axis="columns")

print(
    detailedProperty_df.shape,
    propertyReviews_df.shape,
    propertyByPlace_df.shape,
    sep="\n",
)


 # **Fill any remaining missing values with forward fill, which propagates the last valid observation forward until the next valid one**

In [None]:
# Fill missing values by forward fill: every missing cell takes the last valid
# value that appears above it in the same column.
# `fillna(method="ffill")` is deprecated in recent pandas releases, so the
# dedicated `ffill()` method is used; the result is re-assigned instead of
# mutating the frames in place.
# NOTE: cells that come before the first valid value of a column have nothing
# to copy from and therefore stay NaN.
detailedProperty_df = detailedProperty_df.ffill()
propertyReviews_df = propertyReviews_df.ffill()
propertyByPlace_df = propertyByPlace_df.ffill()


In [None]:
# (rows, columns) of the detailed property dataframe at this point of the notebook.
print(detailedProperty_df.shape)

In [None]:
# List the column labels still present in the two smaller dataframes; they are
# the candidates for the manual drop lists defined in the next cell.
print(propertyReviews_df.columns)
print(propertyByPlace_df.columns)

# **We are also dropping those columns [features]  which are not suitable or valid for the model to train during the training process**

In [None]:
import pandas as pd

# Manually curated labels to remove from the detailed property dataframe.
# The list is passed to `DataFrame.drop(..., errors='ignore')` further down in
# this cell, so labels that are no longer present (for example because the
# missing-value filter already removed them) are skipped silently.
columns_to_drop_detailedProperty = [
    "property_id", "bookingData.hostId", "bookingData.hostName",
    "sections.hero.previewImageLoggingEventData.loggingId",
    "sections.hero.previewImageLoggingEventData.component",
    "sections.hero.previewImageLoggingEventData.section",
    "title", "sections.description.title", "sections.description.items",
    "sections.descriptionDefault.descriptionSummary.htmlText",
    "sections.descriptionDefault.descriptionSummary.minimumNumberOfLinesForTruncation",
    "sections.descriptionDefault.descriptionSummary.recommendedNumberOfLines",
    "sections.descriptionDefault.hasExtraDescriptionDetails",
    "sections.hero.previewImages", "sections.photoTour.mediaItems",
    "sections.petCounts.mediaItem.baseUrl",
    "sections.bookIt.reviewItem.accessibilityLabel", "sections.bookIt.reviewItem.title",
    "sections.availabilityCalendarDefault.title", "sections.availabilityCalendarDefault.subtitle",
    "sections.availabilityCalendarDefault.priceDisclaimer",
    "sections.amenities.seeAllAmenitiesButton.title",
    "sections.reviewsDefault.heading.accessibilityLabel", "sections.reviewsDefault.heading.icon",
    "sections.reviewsDefault.heading.title", "sections.reviewsDefault.heading.subtitle",
    "sections.location.seeAllDetailsButton.title",
    "sections.policies.seeAllHouseRulesButton.accessibilityLabel",
    "sections.policies.seeAllHouseRulesButton.title",
    "sections.policies.seeCancellationPolicyButton.accessibilityLabel",
    "sections.policies.seeCancellationPolicyButton.title",
    "sections.policies.seeAllSafetyAndPropertyButton.accessibilityLabel",
    "sections.policies.seeAllSafetyAndPropertyButton.title",
    "sections.petCounts.html.htmlText",
    "sections.policies.cancellationPolicies", "sections.policies.houseRules",
    "sections.policies.houseRulesTitle", "bookingData.hostProfilePhotoUrl",
    "sections.petCounts.subtitle", "sections.description.logoData",
    "sections.description.mediaItems", "sections.hero.mediaItems",
    "sections.hero.seePhotosButton.accessibilityLabel",
    "sections.highlights.highlightsA",
    "sections.sleepingArrangement.title",
    "sections.sleepingArrangement.arrangementDetails",
    "sections.title.actionableIcon.accessibilityLabel",
    "sections.title.actionableIcon.icon", "sections.title.actionableIcon.title",
    "sections.title.actionableIcon.subtitle", "sections.policies.previewSafetyAndProperties",
    "sections.policies.seeAllSafetyAndPropertyButton.icon",
    "sections.reviewsDefault.ratings", "sections.photoTour.title",
    "sections.location.hostGuidebookButton.icon", "sections.location.hostGuidebookButton.title",
    "sections.location.hostGuidebookButton.subtitle",
    "sections.location.seeAllDetailsButton.accessibilityLabel",
    "sections.location.seeAllDetailsButton.subtitle",
    "sections.location.seeAllLocationDetails", "sections.location.previewLocationDetails",
    "sections.amenities.seeAllAmenitiesGroups", "sections.amenities.title",
    "sections.amenities.seeAllAmenitiesButton.accessibilityLabel",
    "sections.amenities.seeAllAmenitiesButton.icon",
    "sections.amenities.seeAllAmenitiesButton.subtitle",
    "sections.amenities.previewAmenitiesGroups",
    "sections.location.homeIcon", "bookingData.cancellationPolicies",
    "sections.availabilityCalendarDefault.reviewAccessibilityLabel",
    "sections.availabilityCalendarDefault.descriptionItems",
    "sections.petCounts.title", "sections.hero.seePhotosButton.title",
    "sections.title.title", "sections.location.subtitle",
    "sections.location.title", "sections.location.mapMarkerRadiusInMeters",
    "sections.policies.title", "sections.policies.safetyAndPropertyTitle",
    "sections.highlights.highlights", "sections.policies.cancellationPolicyTitle"
]
# Labels to remove from the property reviews dataframe.
columns_to_drop_propertyReviews = [
    'property_id',
    'review_id',
    'reviewer_name',
    'reviewer_picture_url',
    'review_date',
    'disclaimer',
    'localized_date',
]

# Labels to remove from the property by place dataframe.
columns_to_drop_propertyByPlace = [
    'Unnamed: 0.1',
    'Unnamed: 0',
    'id',
    'listingName',
    'localizedCityName',
    'listingObjType',
    'previewTags',
    'publicAddress',
    'qualifier',
    'roomType',
    'roomTypeCategory',
    'title',
    'avgRatingLocalized',
]

# Build the cleaned frames. `errors='ignore'` skips labels that are not
# present instead of raising.
detailedProperty_cleaned = detailedProperty_df.drop(
    columns_to_drop_detailedProperty, axis=1, errors='ignore'
)
propertyReviews_cleaned = propertyReviews_df.drop(
    columns_to_drop_propertyReviews, axis=1, errors='ignore'
)
propertyByPlace_cleaned = propertyByPlace_df.drop(
    columns_to_drop_propertyByPlace, axis=1, errors='ignore'
)




In [None]:
# Shapes of the cleaned frames, plus the surviving column labels of the two
# smaller ones.
print(detailedProperty_cleaned.shape)
print(propertyReviews_cleaned.shape, propertyReviews_cleaned.columns, sep="\n")
print(propertyByPlace_cleaned.shape, propertyByPlace_cleaned.columns, sep="\n")

# **Convert the strings of the rating column into dummy rating numbers from 1 to 5**

In [None]:
def convert_to_dummy_rating(timestamp):
    """Map an arbitrary value to a reproducible dummy rating between 1 and 5.

    The built-in ``hash()`` is salted per interpreter process for ``str``
    inputs, so hashing the raw cell value produced different ratings after
    every kernel restart. A CRC-32 checksum of the value's text form is used
    instead, which is stable across runs and machines.

    Args:
        timestamp: The raw cell value (any object; it is converted with ``str``).

    Returns:
        int: A value in ``{1, 2, 3, 4, 5}``.
    """
    import zlib  # local import so the cell does not depend on another cell

    checksum = zlib.crc32(str(timestamp).encode("utf-8"))
    return (checksum % 5) + 1  # Ensure values between 1 and 5


# Apply the transformation: replace every rating cell by its dummy 1-5 value.
propertyReviews_cleaned['rating'] = propertyReviews_cleaned['rating'].apply(convert_to_dummy_rating)
# `head` must be called; printing the bare attribute shows the bound method
# (with the whole frame repr) instead of the first rows.
print(propertyReviews_cleaned.head())

# **Add dummy comments to make the data more suitable for the model** 

In [None]:
def generate_dummy_comment(rating):
    """Return the canned review sentence that belongs to a 1-5 rating.

    Any value that is not one of the five known ratings yields "No comment.".
    """
    sentences = (
        "Terrible experience. Would not recommend.",
        "Not great. Needs improvement.",
        "It was okay. Nothing special.",
        "Good experience. Enjoyed my stay.",
        "Amazing! Highly recommended.",
    )
    # Ratings 1..5 map onto the sentences in order.
    by_rating = dict(zip(range(1, 6), sentences))
    try:
        return by_rating[rating]
    except KeyError:
        return "No comment."

# Apply function to create comments: one canned sentence per dummy rating.
propertyReviews_cleaned["comments"] = propertyReviews_cleaned["rating"].apply(generate_dummy_comment)
# Call `head()`; the bare attribute would print the bound method, not a preview.
print(propertyReviews_cleaned.head())

# **Preprocessing for the price and Accessibility Label of Property by Place dataframe and convert them into floating points from strings**

In [None]:
# Preview the last rows. `tail` has to be called, otherwise the bound method
# object is printed instead of the rows.
print(propertyByPlace_cleaned.tail())

In [None]:
def extract_price(text):
    """Parse the leading amount of a price-like string into a float.

    Only the first space-separated token is inspected (e.g. ``"$1,234"`` in
    ``"$1,234 per night"``). Thousands separators (``,``) are removed and a
    decimal part is kept, so ``"$12.50"`` becomes ``12.5`` rather than
    ``1250.0``.

    Args:
        text: A cell value. Missing values, strings and already-numeric values
            are all accepted, which keeps the calling cell safe to re-run.

    Returns:
        float | None: The parsed amount, or ``None`` when the value is missing
        or its first token contains no number.
    """
    import re  # local import so the function is self-contained

    if pd.isna(text):  # Check if value is NaN / None
        return None
    if not isinstance(text, str):
        # Already converted (or numeric from the start): pass the value through.
        try:
            return float(text)
        except (TypeError, ValueError):
            return None
    first_token = text.split(' ')[0]
    found = re.search(r"\d[\d,]*(?:\.\d+)?", first_token)
    if found is None:
        # No digits at all; the previous float('') call raised ValueError here.
        return None
    return float(found.group(0).replace(',', ''))
# Overwrite both price-like text columns with their parsed float values.
for column in ['accessibilityLabel', 'price']:
    propertyByPlace_cleaned[column] = propertyByPlace_cleaned[column].apply(extract_price)
# Call `head()`; the bare attribute would print the bound method, not a preview.
print(propertyByPlace_cleaned.head())




# **Now we are extracting categorical columns to convert them into numbers by using one hot encoding**

In [None]:
# Collect the labels of all object-dtype columns per dataframe; these are the
# columns that get one-hot encoded in the next cell.
categorical_cols_dp = detailedProperty_cleaned.select_dtypes(include="object").columns
categorical_cols_pr = propertyReviews_cleaned.select_dtypes(include="object").columns
categorical_cols_pbp = propertyByPlace_cleaned.select_dtypes(include="object").columns

print("Categorical Data of the Detailed Property Dataframe", categorical_cols_dp)
print("Categorical Data of the Property Reviews Dataframe", categorical_cols_pr)
print("Categorical Data of the Property By Place Dataframe", categorical_cols_pbp)


# **One hot encoding of the dataframes**

In [None]:
# One-hot encode every object-dtype column. `drop_first=True` omits the first
# category of each column so the dummies are not perfectly collinear.
dp_encoded = pd.get_dummies(detailedProperty_cleaned, columns=categorical_cols_dp, drop_first=True)
pr_encoded = pd.get_dummies(propertyReviews_cleaned, columns=categorical_cols_pr, drop_first=True)
pbp_encoded = pd.get_dummies(propertyByPlace_cleaned, columns=categorical_cols_pbp, drop_first=True)

# `tail` must be called to get the last rows; the bare attribute only prints
# the bound method object.
print("One hot encoding for the detailed property", dp_encoded.tail())
print("One hot encoding for the property reviews", pr_encoded.tail())
print("One hot encoding for the property by place", pbp_encoded.tail())

# **Use correlation to remove redundant features**

In [None]:
# Display the first rows (the bare `head` attribute only shows the bound method).
pr_encoded.head()

In [None]:
import numpy as np
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Compute correlation matrix
corr_matrix_dp = dp_encoded.corr()

# Replace NaN values with 0 (to avoid plotting errors)
corr_matrix_dp = corr_matrix_dp.fillna(0)

# Select the upper triangle (strictly above the diagonal) so that every pair of
# columns is inspected exactly once. The absolute value is used because a
# strong negative correlation makes a feature just as redundant as a strong
# positive one.
upper_dp = corr_matrix_dp.abs().where(
    np.triu(np.ones(corr_matrix_dp.shape), k=1).astype(bool)
)

# Find features with |correlation| > 0.85. The regression target used by the
# training cells is never dropped, otherwise the later
# `dp_encoded['guestSatisfactionOverall']` lookup would fail.
to_drop_dp = [
    column
    for column in upper_dp.columns
    if column != 'guestSatisfactionOverall' and (upper_dp[column] > 0.85).any()
]

# Drop correlated features (re-assign instead of mutating in place)
dp_encoded = dp_encoded.drop(columns=to_drop_dp)

print(f"Removed {len(to_drop_dp)} highly correlated features.")

# Plot heatmap of the correlation matrix computed before the drop. Per-cell
# annotations are only drawn for small matrices; on a wide one-hot encoded
# frame they are unreadable and very slow to render.
show_cell_values = corr_matrix_dp.shape[0] <= 30
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix_dp, annot=show_cell_values, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of dp_encoded Features")
plt.show()


In [None]:
# Display the first rows (the bare `head` attribute only shows the bound method).
dp_encoded.head()

In [None]:
import numpy as np

# Pairwise correlation between all columns of the encoded frame.
corr_matrix_pbp = pbp_encoded.corr()

# Keep only the entries strictly above the main diagonal; everything else
# becomes NaN, so each pair of columns is looked at once.
upper_pbp = corr_matrix_pbp.where(
    np.triu(np.ones(corr_matrix_pbp.shape, dtype=bool), k=1)
)

# A column is marked for removal when it correlates above 0.85 with any column
# that precedes it. Only positive correlations are considered here.
to_drop_pbp = [
    column
    for column in upper_pbp.columns
    if (upper_pbp[column] > 0.85).any()
]

# Remove the marked columns from the frame itself.
pbp_encoded.drop(columns=to_drop_pbp, inplace=True)

print(f"Removed {len(to_drop_pbp)} highly correlated features.")


In [None]:
# (rows, columns) of the two encoded frames after the correlation-based pruning.
print(dp_encoded.shape)
print(pbp_encoded.shape)

# **Normalization / Scaling of the features** 

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns only. Previously the whole frame, including
# the regression target, was standardised, so the MSE reported later was in
# standard-deviation units rather than in units of `guestSatisfactionOverall`.
target_col_dp = 'guestSatisfactionOverall'
feature_cols_dp = [col for col in dp_encoded.columns if col != target_col_dp]

scaler = StandardScaler()
scaled_features_dp = scaler.fit_transform(dp_encoded[feature_cols_dp])

# Rebuild the frame: scaled features first, untouched target as last column.
# NOTE(review): the scaler is still fitted on all rows before the train/test
# split; fitting it on the training rows only would avoid leaking test
# statistics into training.
dp_encoded = pd.concat(
    [
        pd.DataFrame(scaled_features_dp, columns=feature_cols_dp, index=dp_encoded.index),
        dp_encoded[target_col_dp],
    ],
    axis=1,
)


# **Training Linear Regression with customer satisfaction as the target variable, to show investors which type of property is best to invest in**

In [None]:
from sklearn.model_selection import train_test_split

# Target: overall guest satisfaction; predictors: every other column.
y = dp_encoded['guestSatisfactionOverall']
X = dp_encoded.drop('guestSatisfactionOverall', axis=1)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training rows, then predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out predictions: mean squared error and coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("MSE:", mse)
print("R² Score:", r2)


# **Training Linear Regression model for the Property reviews dataframe**

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns only. Previously `rating` (the regression
# target) was standardised together with the features, so the MSE reported
# later was in standard-deviation units rather than in rating points.
target_col_pr = 'rating'
feature_cols_pr = [col for col in pr_encoded.columns if col != target_col_pr]

scaler_pr = StandardScaler()
scaled_features_pr = scaler_pr.fit_transform(pr_encoded[feature_cols_pr])

# Rebuild the frame: scaled features first, untouched target as last column.
# NOTE(review): the scaler is still fitted on all rows before the train/test
# split; fitting it on the training rows only would avoid leaking test
# statistics into training.
pr_encoded = pd.concat(
    [
        pd.DataFrame(scaled_features_pr, columns=feature_cols_pr, index=pr_encoded.index),
        pr_encoded[target_col_pr],
    ],
    axis=1,
)

In [None]:
from sklearn.model_selection import train_test_split

# The `comments` column is generated from `rating` in an earlier cell (one
# fixed sentence per rating value), so its one-hot columns encode the target
# exactly. Keeping them in X is target leakage and makes the scores below
# meaningless, so they are excluded together with the target itself.
leaky_cols_pr = [col for col in pr_encoded.columns if str(col).startswith('comments_')]

X = pr_encoded.drop(columns=['rating'] + leaky_cols_pr)
y = pr_encoded['rating']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LinearRegression

# Train a fresh linear regression on the review features and predict the test rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Compare predictions with the held-out ratings.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("MSE:", mse)
print("R² Score:", r2)


# **Model Training for the Property By Place dataframe based on the accessibilityLabel using RandomForest Classifier which predicts whether the investment is useful or not based on the features like accessibility Label etc**

In [None]:
# Define a basic rule for the investment decision:
# invest (1) when the parsed `accessibilityLabel` amount is at or below its
# median AND `starRating` is at or above its mean; otherwise do not invest (0).
# `accessibilityLabel` was converted to a float by the price-extraction cell,
# which is why it serves as the price here.
price_threshold = pbp_encoded['accessibilityLabel'].median()
rating_threshold = pbp_encoded['starRating'].mean()

# Vectorised comparison instead of a row-wise `apply`: same labels, no Python
# call per row. Comparisons against NaN are False, so rows with a missing
# value get 0, exactly as before.
is_affordable = pbp_encoded['accessibilityLabel'] <= price_threshold
is_well_rated = pbp_encoded['starRating'] >= rating_threshold
pbp_encoded['investment_decision'] = (is_affordable & is_well_rated).astype(int)

# Check the distribution of the new target
print(pbp_encoded['investment_decision'].value_counts())


In [None]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Input is the encoded, correlation-pruned frame `pbp_encoded` together with
# the `investment_decision` column created in the previous cell.

# Step 1: Define Features and Target
features = [
    'avgRating', 'bathrooms', 'bedrooms', 'beds', 'isAutoTranslated',
    'isNewListing', 'isSuperhost', 'reviewsCount', 'starRating',
    'accessibilityLabel', 'adults', 'children', 'infants', 'pets',
    'listingBathroomLabel_1 bath', 'listingBathroomLabel_1 private bath',
    'listingBathroomLabel_1 shared bath', 'listingBathroomLabel_1.5 baths',
    'listingBathroomLabel_1.5 shared baths', 'listingBathroomLabel_2 baths',
    'listingBathroomLabel_2 shared baths', 'listingBathroomLabel_2.5 baths',
    'listingBathroomLabel_3 baths', 'listingBathroomLabel_3.5 baths',
    'listingBedLabel_10 beds', 'listingBedLabel_2 beds', 'listingBedLabel_3 beds',
    'listingBedLabel_4 beds', 'listingBedLabel_5 beds', 'listingBedLabel_6 beds',
    'city_Chestermere', 'listingGuestLabel_10 guests', 'listingGuestLabel_12 guests',
    'listingGuestLabel_2 guests', 'listingGuestLabel_3 guests', 'listingGuestLabel_4 guests',
    'listingGuestLabel_5 guests', 'listingGuestLabel_6 guests', 'listingGuestLabel_7 guests',
    'listingGuestLabel_8 guests', 'spaceType_Bungalow', 'spaceType_Condo', 'spaceType_Cottage',
    'spaceType_Guest suite', 'spaceType_Guesthouse', 'spaceType_Home', 'spaceType_Hotel room',
    'spaceType_Loft', 'spaceType_Place to stay', 'spaceType_Private room', 'spaceType_Shared room'
]

# Target column (binary classification: 1 for invest, 0 for not invest)
target = 'investment_decision'

# NOTE(review): the target is computed from `accessibilityLabel` and
# `starRating`, and both are also in `features`, so the classifier can simply
# re-learn that rule. Treat the accuracy below as a sanity check, not as
# evidence of predictive power.

# Step 2: Drop rows with missing values. `.copy()` gives an independent frame,
# so nothing below writes into a slice of the original.
pbp_encoded = pbp_encoded.dropna(subset=features + [target]).copy()

# Step 3: Split before scaling so that no statistic of the test rows is used
# during training.
X = pbp_encoded[features]
y = pbp_encoded[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Fit the scaler on the training rows only and apply the same
# transformation to the test rows. Wrapping the arrays keeps the column labels.
# `pbp_encoded` itself is left unscaled.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features, index=X_test.index)

# Step 5: Train the model and predict the held-out rows
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Step 6: Evaluate the Model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 7: Feature Importance
feature_importance = pd.Series(rf_model.feature_importances_, index=features).sort_values(ascending=False)
print("Feature Importance:\n", feature_importance)
