
# **Imports**



In [4]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from datetime import datetime
from sklearn.metrics import accuracy_score, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import silhouette_score
from geopy.distance import geodesic

# **Linear Regression function**

In [5]:
class LinearRegression1:
    """Ordinary least-squares linear regression solved in closed form.

    After ``fit``, ``coefficients`` holds the intercept followed by one
    weight per feature column.
    """

    def __init__(self):
        # [intercept, w_1, ..., w_p]; stays None until fit() has been called.
        self.coefficients = None

    @staticmethod
    def _design_matrix(X):
        """Return X as a 2-D float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # A flat vector is treated as a single feature column.
            X = X.reshape(-1, 1)
        return np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)

    def fit(self, X, y):
        """Estimate the coefficients from features ``X`` and targets ``y``.

        :param X: array-like, shape (n_samples, n_features) or (n_samples,)
        :param y: array-like, shape (n_samples,)
        :return: None
        """
        design = self._design_matrix(X)
        # pinv(A) equals pinv(A.T @ A) @ A.T, but applying the pseudoinverse to
        # the design matrix directly avoids squaring its condition number.
        self.coefficients = np.linalg.pinv(design).dot(np.asarray(y, dtype=float))

    def predict(self, X):
        """Return the fitted linear prediction for each row of ``X``.

        :raises RuntimeError: if called before ``fit``.
        """
        if self.coefficients is None:
            raise RuntimeError("LinearRegression1.predict() called before fit().")
        return self._design_matrix(X).dot(self.coefficients)

# **Logistic Regression function**

In [6]:
class LogisticRegression1:
    '''
    Binary logistic regression trained with batch gradient descent.

    Inputs are converted to float arrays, so DataFrames and matrices that mix
    float and boolean (one-hot) columns are accepted.
    '''
    def __init__(self, learning_rate=0.01, n_iterations=3000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        # Both are set by fit(); None means the model is not trained yet.
        self.weights, self.bias = None, None

    @staticmethod
    def _sigmoid(x):
        '''
        Private method, maps the output of the line equation to (0, 1).

        :param x: float or array, prediction made by the line equation
        :return: float or array of the same shape
        '''
        # Clipping keeps np.exp in range: exp(500) is finite in float64 and the
        # sigmoid is already 0/1 to machine precision beyond +-500.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def fit(self, X, y):
        '''
        Used to calculate the coefficients of the logistic regression model.

        :param X: array-like, features, shape (n_samples, n_features)
        :param y: array-like, true 0/1 labels, shape (n_samples,)
        :return: None
        '''
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples = X.shape[0]

        # 1. Initialize coefficients
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        # 2. Perform gradient descent on the mean log-loss
        for _ in range(self.n_iterations):
            error = self._sigmoid(X.dot(self.weights) + self.bias) - y

            # Gradients with respect to the weights and the bias
            partial_w = X.T.dot(error) / n_samples
            partial_b = error.sum() / n_samples

            self.weights -= self.learning_rate * partial_w
            self.bias -= self.learning_rate * partial_b

    def predict_proba(self, X):
        '''
        Calculates the probability of the positive class for every row.

        :param X: array-like, features
        :return: array, prediction probabilities
        '''
        X = np.asarray(X, dtype=float)
        return self._sigmoid(X.dot(self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
        '''
        Makes hard 0/1 predictions by thresholding the predicted probabilities.

        :param X: array-like, features
        :param threshold: float, classification threshold (strictly greater -> 1)
        :return: list of int, predictions
        '''
        probabilities = self.predict_proba(X)
        return [1 if p > threshold else 0 for p in probabilities]



# **PCA function**

In [7]:
class PCA1:
    """Principal component analysis via eigen-decomposition of the covariance matrix."""

    def __init__(self, n_components):
        self.n_components = n_components
        # Rows are the top principal axes; set by fit().
        self.components = None
        # Per-feature mean of the data seen by fit(); subtracted in transform().
        self.mean = None

    def fit(self, X):
        """Learn the feature means and the top ``n_components`` principal axes of ``X``.

        :param X: array-like, shape (n_samples, n_features)
        :return: None
        """
        X = np.asarray(X, dtype=float)

        # Compute the mean of the data (np.cov centers internally for the fit)
        self.mean = np.mean(X, axis=0)

        # Compute the covariance matrix; atleast_2d covers the single-feature case
        cov_matrix = np.atleast_2d(np.cov(X, rowvar=0, bias=1))

        # The covariance matrix is symmetric, so eigh applies and always returns
        # real eigenvalues/eigenvectors (eig may return complex values).
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort the eigenvectors by descending eigenvalue
        idx = np.argsort(eigenvalues)[::-1]
        eigenvectors = eigenvectors[:, idx]

        # Select the top n_components eigenvectors (one per row)
        self.components = eigenvectors.T[:self.n_components]

    def transform(self, X):
        """Project ``X`` onto the fitted axes after subtracting the fitted mean."""
        centered = np.asarray(X, dtype=float) - self.mean
        return np.dot(centered, self.components.T)

    def fit_transform(self, X):
        """Fit on ``X`` and return its projection."""
        self.fit(X)
        return self.transform(X)


# **Feature engineering**



> Attempting to identify and select specific amenities that might be indicative of whether a property is expensive

In [8]:
def check_amenities(train, threshold=0.2):
    """Flag listings that offer at least one amenity correlated with 'expensive'.

    Each 'amenities' string is split into amenity tokens, the tokens are one-hot
    encoded, and every amenity whose correlation with the 'expensive' column is
    above ``threshold`` is selected. A 0/1 column 'Selected_Feature_Names' is
    added to ``train`` (the frame is modified in place and also returned).

    :param train: DataFrame with string column 'amenities' and numeric column 'expensive'
    :param threshold: minimum correlation for an amenity to be selected
    :return: (train, selected_feature_names), names ordered by descending correlation
    """
    # Strip the surrounding '["' ... '"]' when present
    cleaned_strings = [
        raw[2:-2] if raw.startswith('["') and raw.endswith('"]') else raw
        for raw in train['amenities']
    ]

    # Split every cleaned string into amenity tokens
    words_lists = [
        [word.strip('" ,') for word in cleaned.split(',')]
        for cleaned in cleaned_strings
    ]

    # One-hot encode from the *tokens* so the dummy column names are exactly the
    # strings matched below. Encoding the raw string instead would leave
    # brackets on the first/last amenity (e.g. '[Wifi', 'Kitchen]').
    separator = '\x1f'  # ASCII unit separator: not expected inside amenity text
    amenities_df = pd.Series(
        [separator.join(words) for words in words_lists]
    ).str.get_dummies(sep=separator)

    # Correlate positionally with the target (the dummies carry a fresh RangeIndex)
    target = pd.Series(train['expensive'].to_numpy())
    sorted_correlations = amenities_df.corrwith(target).sort_values(ascending=False)

    # Keep amenities whose correlation exceeds the threshold (NaN never passes)
    selected_feature_names = sorted_correlations[sorted_correlations > threshold].index.tolist()

    # 1 when the listing has at least one selected amenity, else 0
    selected = set(selected_feature_names)
    train['Selected_Feature_Names'] = [
        int(any(word in selected for word in words)) for words in words_lists
    ]

    return train, selected_feature_names




--- Same function for test data


In [9]:
def check_amenities_test(test, train):
    """Add the 'Selected_Feature_Names' flag to ``test`` using amenities selected on ``train``.

    The amenity names come from ``check_amenities(train)``; a test row gets 1
    when any of its amenity tokens is among those names, otherwise 0.
    ``test`` is modified in place and returned.
    """
    def _tokens(raw):
        # Drop the surrounding '["' ... '"]' when present, then split on commas.
        inner = raw[2:-2] if raw.startswith('["') and raw.endswith('"]') else raw
        return [piece.strip('" ,') for piece in inner.split(',')]

    token_lists = [_tokens(raw) for raw in test['amenities']]

    # Amenity names selected on the training set
    _, selected_feature_names = check_amenities(train)

    test['Selected_Feature_Names'] = [
        1 if any(token in selected_feature_names for token in tokens) else 0
        for tokens in token_lists
    ]

    return test




> Data preprocessing - cleaning and transforming various columns in the dataset to make them suitable for analysis or modeling. The preprocessing includes handling percentages, mapping binary categories, converting date-related columns, normalizing availability values, aggregating review scores, and calculating the days since the last review.



In [10]:
def data_preprocessing(train, reference_date=None):
    """Clean and derive numeric features from the raw listing columns.

    The frame is modified in place and returned.

    :param train: DataFrame with the raw host, availability, review-score and date columns
    :param reference_date: date used as "now" for 'host_since' and
        'days_since_last_review'; defaults to today's date. Pass a fixed value
        to get reproducible features.
    :return: the same DataFrame with converted and derived columns
    """
    # Mean Gregorian month length in days (365.2425 / 12). Dividing a timedelta
    # by np.timedelta64(1, 'M') is rejected by pandas 2.x, so the month length
    # is spelled out explicitly.
    days_per_month = 30.436875

    if reference_date is None:
        current_date = pd.Timestamp.now().normalize()
    else:
        current_date = pd.to_datetime(reference_date)

    # '95%' -> 95 for the host rate columns
    for rate_col in ('host_response_rate', 'host_acceptance_rate'):
        train[rate_col] = pd.to_numeric(train[rate_col].str.replace('%', '', regex=False))

    # Map binary categorical columns 'f' to 0 and 't' to 1
    for flag_col in ('instant_bookable', 'host_is_superhost'):
        train[flag_col] = train[flag_col].map({'f': 0, 't': 1})

    # Months since host registration
    host_since = pd.to_datetime(train['host_since'])
    train['host_since'] = (current_date - host_since).dt.days / days_per_month

    # Bring every availability window to a 30-day scale, then combine them
    train["availability_60"] = train["availability_60"] / 2
    train["availability_90"] = train["availability_90"] / 3
    train["availability_365"] = train["availability_365"] / 12
    train["availability"] = (train["availability_365"] + train["availability_90"] + train["availability_60"] + train["availability_30"]) / 30

    # Overall review score: mean of the seven individual scores
    train["review_scores"] = (train["review_scores_rating"] + train["review_scores_accuracy"] + train["review_scores_cleanliness"] +
                              train["review_scores_checkin"] + train["review_scores_communication"] + train["review_scores_location"] +
                              train["review_scores_value"]) / 7

    # Days since the last review
    train['last_review'] = pd.to_datetime(train['last_review'])
    train['days_since_last_review'] = (current_date - train['last_review']).dt.days

    return train




> After checking each column's correlation with the target variable, I drop the columns that are not needed for the analysis or modeling phase.



In [11]:
def drop_columns(train):
    """Return a copy of ``train`` without the identifier, raw and already-aggregated columns.

    Raises KeyError (from ``DataFrame.drop``) if any listed column is missing.
    """
    identifiers = ['id', 'host_id', 'host_listings_count', 'host_has_profile_pic',
                   'host_identity_verified', 'host_verifications', 'license']
    location_and_text = ['latitude', 'longitude', 'amenities']
    availability = ['has_availability', 'availability_30', 'availability_60',
                    'availability_90', 'availability_365']
    reviews = ['number_of_reviews', 'first_review', 'last_review',
               'review_scores_rating', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin',
               'review_scores_communication', 'review_scores_location',
               'review_scores_value']
    night_limits = ['minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
                    'maximum_minimum_nights', 'minimum_maximum_nights',
                    'maximum_maximum_nights', 'minimum_nights_avg_ntm',
                    'maximum_nights_avg_ntm']
    host_counts = ['calculated_host_listings_count_shared_rooms',
                   'calculated_host_listings_count_private_rooms']

    unwanted = (identifiers + location_and_text + availability
                + reviews + night_limits + host_counts)
    return train.drop(columns=unwanted)




Prepare the data for machine learning models by converting categorical variables into a format that can be used in numerical calculations by One-hot encoding.


In [12]:
def encoding(train):
    """One-hot encode every object-dtype column of ``train``.

    The first level of each categorical column is dropped (``drop_first=True``).
    The result holds the non-object columns followed by the dummy columns.
    """
    object_cols = train.select_dtypes(include='object').columns
    dummies = pd.get_dummies(train[object_cols], drop_first=True)
    remainder = train.drop(columns=object_cols)
    return pd.concat([remainder, dummies], axis=1)



> Normalize the scale of numeric features.



In [13]:
def normalization(train):
  """Min-max scale every numeric column of ``train`` in place and return the frame."""
  numeric_cols = train.select_dtypes(include='number').columns
  # The scaler is fitted on the frame it transforms.
  train[numeric_cols] = MinMaxScaler().fit_transform(train[numeric_cols])
  return train




> Standardize the scale of numeric features.



In [14]:
def standardization(train):
  """Standardize every numeric column except 'expensive', in place, and return the frame."""
  # Index.difference leaves the target out (and is a no-op when it is absent).
  feature_cols = train.select_dtypes(include='number').columns.difference(['expensive'])
  # The scaler is fitted on the frame it transforms.
  train[feature_cols] = StandardScaler().fit_transform(train[feature_cols])
  return train




--- Standardize the scale of numeric features for the test (without target column).



In [15]:
def standardization_test(train):
  """Standardize every numeric column of the given frame in place and return it.

  Unlike ``standardization`` no column is excluded, so the frame is expected
  to carry no target column.
  """
  numeric_cols = train.select_dtypes(include='number').columns
  # The scaler is fitted on the frame it transforms.
  train[numeric_cols] = StandardScaler().fit_transform(train[numeric_cols])
  return train



> Handle missing values: numeric columns are imputed with a multivariate `IterativeImputer` (a KNN imputer is kept as a commented-out alternative). The categorical columns contain no NaN values.




In [16]:
def fill_NA(train):
  """Impute missing values in the numeric columns with ``IterativeImputer``.

  The frame is modified in place and returned. Non-numeric columns are left
  untouched.
  """
  numerical_cols = train.select_dtypes(include='number').columns

  # Alternative that was tried: KNNImputer(n_neighbors=8, weights="uniform")

  # Multivariate imputation with a fixed seed for reproducibility
  imputer_numeric = IterativeImputer(random_state=0)

  # Assign the imputed array directly so values are written back by position.
  # Wrapping it in a new DataFrame first would give it a fresh 0..n-1 index and
  # the assignment would align on labels, producing NaN for any row whose
  # index label is not in that range.
  train[numerical_cols] = imputer_numeric.fit_transform(train[numerical_cols])
  return train





> balances the dataset using the Synthetic Minority Over-sampling Technique (SMOTE) - SMOTE is being applied to balance the distribution of the 'expensive' class.



In [17]:
def balance_SMOTE(train):
    """Oversample ``train`` with SMOTE on the 'expensive' column.

    Returns a new DataFrame holding the resampled feature columns followed by
    the resampled 'expensive' column.
    """
    target = train['expensive']
    features = train.drop('expensive', axis=1)

    # SMOTE with its default settings
    features_res, target_res = SMOTE().fit_resample(features, target)

    features_frame = pd.DataFrame(features_res, columns=features.columns)
    target_frame = pd.DataFrame(target_res, columns=['expensive'])
    return pd.concat([features_frame, target_frame], axis=1)




> This function target-encodes `bathrooms_text`: each category is replaced by the share of 'expensive' listings observed for that category (new column `percentage_bathrooms_text`). The original `bathrooms_text` column and the `property_type` column are then dropped; note that `property_type` is only dropped, not encoded.



In [68]:
def change_categorial_cloumns_train(train):
    """Replace 'bathrooms_text' by the share of expensive listings in each category.

    Adds 'percentage_bathrooms_text' (sum of 'expensive' divided by the row
    count of the category) and drops 'bathrooms_text' and 'property_type'.
    Returns a new DataFrame.
    """
    by_bathrooms = train.groupby("bathrooms_text")

    # Expensive listings per category divided by all listings of that category
    expensive_ratio = by_bathrooms["expensive"].sum() / by_bathrooms.size()
    ratio_column = expensive_ratio.rename('percentage_bathrooms_text')

    # Left join on the category so every original row is kept
    enriched = pd.merge(train, ratio_column, left_on='bathrooms_text', right_index=True, how='left')

    return enriched.drop(['bathrooms_text', 'property_type'], axis=1)



--- Same function for test data.



In [69]:
def change_categorial_columns_test(test, train):
    """Target-encode 'bathrooms_text' in ``test`` with ratios learned from ``train``.

    For every 'bathrooms_text' category in ``train`` the share of expensive
    listings is computed and joined onto ``test`` as
    'percentage_bathrooms_text'. Test rows without a ratio (category not seen
    in ``train``, or missing) get the mean of the per-category ratios.
    'bathrooms_text' and 'property_type' are dropped. ``train`` is not modified.

    :return: a new DataFrame derived from ``test``
    """
    # Share of expensive listings per category, from the training data only
    by_bathrooms = train.groupby("bathrooms_text")["expensive"]
    expensive_ratio = (by_bathrooms.sum() / by_bathrooms.size()).rename('percentage_bathrooms_text')

    # Left join so every test row is kept
    test = pd.merge(test, expensive_ratio, left_on='bathrooms_text', right_index=True, how='left')

    # Fall back to the mean per-category ratio. The column is reassigned rather
    # than using a chained fillna(inplace=True), which does not update the
    # frame under pandas copy-on-write.
    test['percentage_bathrooms_text'] = test['percentage_bathrooms_text'].fillna(expensive_ratio.mean())

    return test.drop(['bathrooms_text', 'property_type'], axis=1)




> calculate the centroid coordinates using the KMeans clustering algorithm for both expensive and non-expensive properties. Determine K=4 by elbow method and silhouette score methods.



In [20]:
def calculate_centroids(train, n_clusters=4, random_state=42):
    """Cluster listing coordinates separately for expensive and non-expensive rows.

    :param train: DataFrame with 'latitude', 'longitude' and 'expensive' columns
    :param n_clusters: number of KMeans clusters fitted per class
    :param random_state: seed passed to KMeans
    :return: (centroids of rows with expensive == 1, centroids of rows with
        expensive == 0); each is the KMeans ``cluster_centers_`` array with
        columns in (latitude, longitude) order
    """
    def _centroids_for(label):
        # KMeans on the (latitude, longitude) pairs of a single class
        coordinates = train[train['expensive'] == label][['latitude', 'longitude']]
        return KMeans(n_clusters=n_clusters, random_state=random_state).fit(coordinates).cluster_centers_

    centroid_coordinates = _centroids_for(1)
    centroid_coordinates_0 = _centroids_for(0)

    return centroid_coordinates, centroid_coordinates_0



> Calculates the minimum geodesic distance from each property to the centroids of expensive properties.




In [21]:
def new_centers_train(train,centroid_coordinates):
  """Add 'min_distance_to_top': geodesic km from each row to its nearest given centroid.

  ``train`` is modified in place and returned.
  """
  def _nearest_km(row):
      here = (row['latitude'], row['longitude'])
      # Smallest geodesic distance (km) over all centroids
      return min(geodesic(here, centre).km for centre in centroid_coordinates)

  train['min_distance_to_top'] = train.apply(_nearest_km, axis=1)
  return train



---Same function for the test data.



In [22]:
def new_centers_test(test,centroid_coordinates):
  """Add 'min_distance_to_top': geodesic km from each row to its nearest given centroid.

  ``test`` is modified in place and returned.
  """
  def _nearest_km(row):
      here = (row['latitude'], row['longitude'])
      # Smallest geodesic distance (km) over all centroids
      return min(geodesic(here, centre).km for centre in centroid_coordinates)

  test['min_distance_to_top'] = test.apply(_nearest_km, axis=1)
  return test



> Calculates the minimum geodesic distances from each property to both expensive and non-expensive centroids and assigns a binary value (1 or 0) based on which set of centroids is closer.



In [23]:
def new_centers_train_binary(train,centroid_coordinates,centroid_coordinates_0):
    """Add 'min_distance_to_top_binary' to ``train`` (in place) and return it.

    The value is 1 when the nearest centroid in ``centroid_coordinates`` is at
    most as far (geodesic km) as the nearest one in ``centroid_coordinates_0``,
    otherwise 0.
    """
    def _nearest_km(point, centres):
        return min(geodesic(point, centre).km for centre in centres)

    def _closer_to_expensive(row):
        point = (row['latitude'], row['longitude'])
        to_expensive = _nearest_km(point, centroid_coordinates)
        to_non_expensive = _nearest_km(point, centroid_coordinates_0)
        # Ties count as "closer to expensive"
        return 1 if to_expensive <= to_non_expensive else 0

    train['min_distance_to_top_binary'] = train.apply(_closer_to_expensive, axis=1)

    return train



---Same function for the test data.



In [24]:
def new_centers_test_binary(test,centroid_coordinates,centroid_coordinates_0):
    """Add 'min_distance_to_top_binary' to ``test`` (in place) and return it.

    The value is 1 when the nearest centroid in ``centroid_coordinates`` is at
    most as far (geodesic km) as the nearest one in ``centroid_coordinates_0``,
    otherwise 0.
    """
    def _nearest_km(point, centres):
        return min(geodesic(point, centre).km for centre in centres)

    def _closer_to_expensive(row):
        point = (row['latitude'], row['longitude'])
        to_expensive = _nearest_km(point, centroid_coordinates)
        to_non_expensive = _nearest_km(point, centroid_coordinates_0)
        # Ties count as "closer to expensive"
        return 1 if to_expensive <= to_non_expensive else 0

    test['min_distance_to_top_binary'] = test.apply(_closer_to_expensive, axis=1)

    return test


# **Data Recipe**

In [60]:
def data_recipe(train):
  """Run the full training-set feature pipeline and return the model-ready frame.

  Order: centroid distance features -> bathrooms target encoding -> amenity
  flag -> column clean-up -> drop unused columns -> imputation -> one-hot
  encoding -> SMOTE balancing -> standardization.
  """
  expensive_centres, other_centres = calculate_centroids(train)

  frame = new_centers_train(train, expensive_centres)
  frame = new_centers_train_binary(frame, expensive_centres, other_centres)
  frame = change_categorial_cloumns_train(frame)
  frame = check_amenities(frame)[0]  # the selected amenity names are not needed here

  # Remaining single-argument steps, applied in order.
  # (Min-max `normalization` is an unused alternative to `standardization`.)
  for step in (data_preprocessing, drop_columns, fill_NA, encoding, balance_SMOTE, standardization):
      frame = step(frame)
  return frame


In [59]:
def data_recipe_test(test,train):
  """Run the test-set feature pipeline, taking learned quantities from ``train``.

  Centroids, bathroom ratios and selected amenities are computed from
  ``train``; the remaining steps operate on ``test`` alone. No SMOTE is applied.
  """
  expensive_centres, other_centres = calculate_centroids(train)

  frame = new_centers_test(test, expensive_centres)
  frame = new_centers_test_binary(frame, expensive_centres, other_centres)
  frame = change_categorial_columns_test(frame, train)
  frame = check_amenities_test(frame, train)

  # Remaining single-argument steps, applied in order.
  # (Min-max `normalization` is an unused alternative to `standardization`.)
  for step in (data_preprocessing, drop_columns, fill_NA, encoding, standardization):
      frame = step(frame)
  return frame

# **Train model**

In [27]:
def train_model(train):
  """Cross-validate ``LogisticRegression1`` on the engineered training data.

  Runs ``data_recipe``, standardizes the features, evaluates ROC AUC with a
  shuffled 10-fold split, prints the mean score and returns it.
  """
  # Feature engineering, then split off the target
  features = data_recipe(train)
  target = features['expensive']
  del features['expensive']

  # Standardize the features
  X_scaled = StandardScaler().fit_transform(features)

  model = LogisticRegression1()
  folds = KFold(n_splits=10, shuffle=True, random_state=2023)

  fold_scores = []
  for fit_idx, eval_idx in folds.split(X_scaled):
      # fit() re-initializes the weights, so one model object serves all folds
      model.fit(X_scaled[fit_idx], target.iloc[fit_idx])
      fold_probabilities = model.predict_proba(X_scaled[eval_idx])
      fold_scores.append(roc_auc_score(target.iloc[eval_idx], fold_probabilities))

  mean_roc_auc = np.mean(fold_scores)

  print(f'Mean ROC AUC: {mean_roc_auc}')

  return mean_roc_auc


In [28]:
def train_model_pca(train, n_components=13):
    """Cross-validate ``LogisticRegression1`` on PCA-reduced engineered features.

    Runs ``data_recipe``, standardizes the features, projects them with
    ``PCA1``, evaluates ROC AUC with a shuffled 10-fold split, prints the mean
    score and returns it.

    :param train: raw training DataFrame
    :param n_components: number of principal components kept
    :return: mean ROC AUC over the folds
    """
    # Feature engineering
    X_train = data_recipe(train)

    # Take the target from the engineered frame: data_recipe resamples the rows
    # (SMOTE), so the raw frame's target no longer lines up with the features.
    y_train = X_train['expensive']
    del X_train['expensive']

    # Standardize the features
    scaler = StandardScaler()
    X_train_standardized = scaler.fit_transform(X_train)

    # Apply PCA using fit() followed by transform(), the two methods PCA1 defines
    pca = PCA1(n_components=n_components)
    pca.fit(X_train_standardized)
    X_train_pca = pca.transform(X_train_standardized)

    # Define logistic regression model
    model = LogisticRegression1()

    # Create k-fold cross-validation iterator
    kf = KFold(n_splits=10, shuffle=True, random_state=2023)

    # ROC AUC of every fold
    roc_auc_scores = []

    for train_index, test_index in kf.split(X_train_pca):
        X_train_fold, X_test_fold = X_train_pca[train_index], X_train_pca[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # Fit on the training fold, score the held-out fold
        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict_proba(X_test_fold)
        roc_auc_scores.append(roc_auc_score(y_test_fold, y_pred_fold))

    # Calculate the mean score across all folds
    mean_roc_auc = np.mean(roc_auc_scores)

    print(f'Mean ROC AUC with PCA: {mean_roc_auc}')

    return mean_roc_auc

# **Test model**

In [29]:
def test_model(train,test,test_res, plot=False):
  """Fit ``LogisticRegression1`` on the training data and score it on the test data.

  :param train: raw training DataFrame (with 'expensive')
  :param test: raw test DataFrame (without the target)
  :param test_res: DataFrame whose 'expensive' column holds the test labels,
      in the same row order as ``test``
  :param plot: when True, also draw the ROC curve
  :return: (roc_auc, y_pred) where y_pred are the predicted probabilities
  """
  X_train = data_recipe(train)
  test = data_recipe_test(test,train)
  model = LogisticRegression1()

  # Separate the target from the engineered training frame
  y_train = X_train['expensive']
  del X_train['expensive']

  # Train and test are one-hot encoded separately, so their dummy columns can
  # differ in both set and order. Align the test frame to the training columns;
  # a dummy column absent from the test set is filled with 0.
  # NOTE(review): encoding() uses drop_first=True per frame, so the dropped
  # baseline category may still differ between train and test.
  test = test.reindex(columns=X_train.columns, fill_value=0)

  # Plain float matrices: boolean dummy columns would otherwise yield an
  # object-dtype array.
  X_train_values = X_train.to_numpy(dtype=float)
  X_test_values = test.to_numpy(dtype=float)

  model.fit(X_train_values, y_train)

  # Predicted probabilities on the test set
  y_pred = model.predict_proba(X_test_values)

  # Evaluate the model
  roc_auc = roc_auc_score(test_res['expensive'], y_pred)

  print("ROC AUC Score:", roc_auc)

  # Plot the Receiver Operating Characteristic (ROC) curve and calculate the Area Under the Curve (AUC).
  def plot_roc_auc(y_true, y_scores, title='ROC Curve'):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()

  if plot:
    plot_roc_auc(test_res['expensive'], y_pred, title='ROC Curve')

  return roc_auc, y_pred

In [30]:
def test_model_pca(train,test,test_res, n_components=13, plot=False):
  """Fit ``LogisticRegression1`` on PCA-reduced training data and score the test data.

  :param train: raw training DataFrame (with 'expensive')
  :param test: raw test DataFrame (without the target)
  :param test_res: DataFrame whose 'expensive' column holds the test labels,
      in the same row order as ``test``
  :param n_components: number of principal components kept
  :param plot: when True, also draw the ROC curve
  :return: (roc_auc, y_pred) where y_pred are the predicted probabilities
  """
  X_train = data_recipe(train)
  X_test = data_recipe_test(test, train)

  # Separate the target variable
  y_train = X_train['expensive']
  del X_train['expensive']

  # Train and test are one-hot encoded separately, so their dummy columns can
  # differ in both set and order. Align the test frame to the training columns;
  # a dummy column absent from the test set is filled with 0.
  # NOTE(review): encoding() uses drop_first=True per frame, so the dropped
  # baseline category may still differ between train and test.
  X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

  # Plain float matrices: boolean dummy columns would otherwise yield an
  # object-dtype array.
  X_train_values = X_train.to_numpy(dtype=float)
  X_test_values = X_test.to_numpy(dtype=float)

  # Fit PCA on the training data only, then project both sets with it
  pca = PCA1(n_components=n_components)
  pca.fit(X_train_values)
  X_train_pca = pca.transform(X_train_values)
  X_test_pca = pca.transform(X_test_values)

  # Fit the model on the PCA-transformed training data
  model = LogisticRegression1()
  model.fit(X_train_pca, y_train)

  # Predicted probabilities on the PCA-transformed test set
  y_pred = model.predict_proba(X_test_pca)

  # Evaluate the model using ROC AUC score
  roc_auc = roc_auc_score(test_res['expensive'], y_pred)

  print("ROC AUC Score:", roc_auc)

  # Plot the Receiver Operating Characteristic (ROC) curve and calculate the Area Under the Curve (AUC).
  def plot_roc_auc(y_true, y_scores, title='ROC Curve'):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()

  if plot:
    plot_roc_auc(test_res['expensive'], y_pred, title='ROC Curve')

  return roc_auc, y_pred

# ***Testing***

In [None]:
# Seed NumPy's global random generator before running the pipeline.
np.random.seed(2023)

# Load the training data, the test features and the test labels.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_res = pd.read_csv('test_lab.csv')

# Alternative entry points (cross-validation on the training set), left disabled:
# train_model(train)
# train_model_pca(train)

# Fit on train, score on test; the cell displays the returned (roc_auc, y_pred).
test_model(train,test,test_res)
# test_model_pca(train,test,test_res)

# **Code for analysis**



> Generates a count plot using Seaborn to visualize the distribution of properties based on any column and the 'expensive' label in a dataset.



In [None]:
train = pd.read_csv('train.csv')
# Count plot with seaborn based on property_type and expensive
plt.figure(figsize=(12, 8))
ax = sns.countplot(x='property_type', hue='expensive', data=train, palette='viridis')
plt.title('Distribution of Properties Based on Property Type and Expensive Label')
plt.xlabel('Property Type')
plt.ylabel('Count')
plt.legend(title='Expensive', bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate the existing x tick labels in place. Re-setting them with
# set_xticklabels() without fixing the tick positions first makes matplotlib
# emit a warning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.show()



> Find the optimal number of clusters (K) using the elbow method.



In [None]:
train = pd.read_csv('train.csv')
df = train.loc[train['expensive'] == 0]
Coordinate = df[['latitude', 'longitude']]

# Standardize the coordinates before clustering
scaler = StandardScaler()
Coordinate_standardized = scaler.fit_transform(Coordinate)

# Elbow method: record the KMeans inertia for K = 1..10
k_range = range(1, 11)
inertia = []
for i in k_range:
    kmeans = KMeans(n_clusters=i, random_state=42).fit(Coordinate_standardized)
    inertia.append(kmeans.inertia_)

# Plot inertia against K
fig, ax = plt.subplots()
ax.plot(k_range, inertia, marker='o')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
plt.show()



> Find the optimal number of clusters (K) using the Silhouette Method.



In [None]:
train = pd.read_csv('train.csv')
df = train.loc[train['expensive'] == 0]
Coordinate = df[['latitude', 'longitude']]

# Candidate cluster counts
k_values = range(2, 11)

# Mean silhouette score of the KMeans labelling for every K
silhouette_scores = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(Coordinate)
    silhouette_avg = silhouette_score(Coordinate, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot the silhouette score against K
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_values, silhouette_scores, marker='o')
ax.set_title('Silhouette Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Silhouette Score')
plt.show()



> Create a scatter plot to visualize the distribution of expensive properties based on latitude and longitude, along with the centroids of the clusters obtained using KMeans clustering.



In [None]:
train = pd.read_csv('train.csv')
df = train[train['expensive'] == 1]

# Fit KMeans with 4 clusters for expensive properties
kmeans_expensive = KMeans(n_clusters=4, random_state=21).fit(df[['latitude', 'longitude']])
centroid_coordinates_expensive = kmeans_expensive.cluster_centers_

# Plot existing points: longitude on the x axis, latitude on the y axis
plt.figure(figsize=(12, 8))
ax = sns.scatterplot(x='longitude', y='latitude', hue='expensive', data=df, palette='viridis')
plt.title('Distribution of Expensive Properties Based on Latitude and Longitude')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Plot centroids for expensive properties (cluster centers are in latitude, longitude order)
for i, (centroid_latitude, centroid_longitude) in enumerate(centroid_coordinates_expensive):
    plt.scatter(centroid_longitude, centroid_latitude, color='red', marker='X', s=100, label=f'Expensive Centroid {i+1}')

# Build the legend once, after the centroids exist, so that its title and
# placement are not discarded by a later bare legend() call.
plt.legend(title='Expensive', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()




> Create a scatter plot to visualize the distribution of non-expensive properties based on latitude and longitude, along with the centroids of the clusters obtained using KMeans clustering.



In [None]:
train = pd.read_csv('train.csv')
df = train[train['expensive'] == 0]

# Fit KMeans with 4 clusters for non-expensive properties.
# (The variable names are kept from the expensive-properties cell; here they
# hold the non-expensive model and centroids.)
kmeans_expensive = KMeans(n_clusters=4, random_state=21).fit(df[['latitude', 'longitude']])
centroid_coordinates_expensive = kmeans_expensive.cluster_centers_

# Plot existing points: longitude on the x axis, latitude on the y axis
plt.figure(figsize=(12, 8))
ax = sns.scatterplot(x='longitude', y='latitude', hue='expensive', data=df, palette='viridis')
plt.title('Distribution of Non-Expensive Properties Based on Latitude and Longitude')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Plot centroids for non-expensive properties (cluster centers are in latitude, longitude order)
for i, (centroid_latitude, centroid_longitude) in enumerate(centroid_coordinates_expensive):
    plt.scatter(centroid_longitude, centroid_latitude, color='red', marker='X', s=100, label=f'Non-Expensive Centroid {i+1}')

# Build the legend once, after the centroids exist, so that its title and
# placement are not discarded by a later bare legend() call.
plt.legend(title='Expensive', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()




> This cell counts and plots the expensive and non-expensive properties; as the counts show, the classes are not balanced.



In [None]:
expensive_counts = train['expensive'].value_counts()
print(f'Number of Expensive Properties (labeled as 1): {expensive_counts[1]}')
print(f'Number of Not Expensive Properties (labeled as 0): {expensive_counts[0]}')

# Bar plot. value_counts() orders by frequency, not by label, so pick each
# count explicitly to keep every bar under its own label.
bar_heights = [expensive_counts[1], expensive_counts[0]]
plt.bar(['Expensive (1)', 'Not Expensive (0)'], bar_heights, color=['#7EB3D1', '#FFD08A'])
plt.title('Number of Expensive and Not Expensive Properties')
plt.xlabel('Property Type')
plt.ylabel('Count')
plt.show()



> The variable 'correlations' will hold the correlation coefficients between each feature and the target variable, providing insights into the linear relationships between features and the target.



In [None]:
train = pd.read_csv('train.csv')
features = train.drop(columns=['expensive'])
target = train['expensive']

# Calculate correlations on the numeric feature columns only: correlating
# string columns is undefined and raises in recent pandas versions.
correlations = features.select_dtypes(include='number').corrwith(target)
correlations