In [559]:
import pandas as pd
import statistics

# Load the listings, give two columns clearer names, and drop rows that are
# entirely empty. Built as one chain so re-running the cell always starts
# from the file on disk.
df = (
    pd.read_csv('Data/AB_NYC_2019.csv')
    .rename(columns={'name': 'description', 'id': 'property_id'})
    .dropna(how='all')
)

# Listings that have never been reviewed get an explicit 0 reviews per month
# and the marker 'Never' as their last review date.
never_reviewed = df['number_of_reviews'] == 0
df.loc[never_reviewed, 'reviews_per_month'] = 0
df.loc[never_reviewed, 'last_review'] = 'Never'

# 'last_review' is read as text, so a string placeholder is safe for any gap
# that is left. 'reviews_per_month' is deliberately NOT filled with a string:
# doing so would turn the numeric column into a mixed object column and break
# arithmetic and aggregation on it. Any remaining gaps there stay NaN.
df = df.fillna({'last_review': 'Unknown'})

def _to_slug(names):
    """Return a text Series with apostrophes and periods removed, spaces
    replaced by hyphens, and everything lower-cased."""
    without_punctuation = (
        names.str.replace("'", "", regex=False)
             .str.replace(".", "", regex=False)
    )
    return without_punctuation.str.replace(' ', '-', regex=False).str.lower()


# Bring both location columns ('neighbourhood_group' and 'neighbourhood')
# into the same lower-case, hyphen-separated form.
for location_column in ('neighbourhood_group', 'neighbourhood'):
    df[location_column] = _to_slug(df[location_column])

# Renames applied to the already-normalised neighbourhood names: old -> new
neighbourhood_mapping = {
    'bay-terrace,-staten-island': 'bay-terrace',
    'east-morrisania': 'morrisania',
    'fordham': 'fordham-manor',
    'sea-gate': 'seagate',
    'stuyvesant-town': 'stuyvesant-town-cooper-village',
    'concourse-village': 'concourse',
    'gramercy': 'gramercy-park',
    'rockaway-beach': 'rockaway-park',
    'flatbush': 'flatbush-ditmas-park',
}

# Swap every neighbourhood that appears as a key in the mapping for its new name
df['neighbourhood'] = df['neighbourhood'].replace(neighbourhood_mapping)

In [561]:
# Property-related columns. `.copy()` makes this an independent frame, so
# later edits to df_property neither touch `df` nor trigger pandas'
# SettingWithCopyWarning.
df_property = df[['property_id', 'description', 'neighbourhood',
                  'room_type', 'price', 'minimum_nights',
                  'reviews_per_month', 'availability_365', 'host_id']].copy()

# Host-related columns, likewise copied so the frame is independent of `df`.
# NOTE(review): this keeps one row per row of `df`, so a host with several
# listings appears several times - apply drop_duplicates('host_id') here if a
# one-row-per-host table is what is wanted.
df_host = df[['host_id', 'host_name', 'calculated_host_listings_count']].copy()

In [563]:
# Unique (neighbourhood_group, neighbourhood) pairs found in the listings,
# re-indexed from 0 so the index is a continuous range.
location_columns = ['neighbourhood_group', 'neighbourhood']
df_neighbourhoods = (
    df[location_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [567]:
import requests
from bs4 import BeautifulSoup

# Scraped rental prices per neighbourhood: {neighbourhood: [price text, ...]}.
# NOTE(review): the key is the neighbourhood name alone, so if two
# neighbourhood groups share a neighbourhood name the later one overwrites the
# earlier one - confirm that no such pair exists in df_neighbourhoods.
average_prices = {}

# Upper bound (seconds) on each request so one stalled connection cannot
# hang the whole cell.
REQUEST_TIMEOUT_SECONDS = 10

# A single Session reuses the underlying connection across all requests.
with requests.Session() as session:
    for row in df_neighbourhoods.itertuples(index=False):
        neighbourhood_group = row.neighbourhood_group
        neighbourhood = row.neighbourhood

        # The site addresses pages as /for-rent/<group>/<neighbourhood>/
        url = f"https://www.propertynest.com/for-rent/{neighbourhood_group}/{neighbourhood}/"

        # A network failure for one neighbourhood must not abort the whole scrape.
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            print(f"Request failed for {neighbourhood} in group {neighbourhood_group} ({exc}), moving on.")
            continue

        # Report unsuccessful responses instead of skipping them silently.
        if response.status_code != 200:
            print(f"HTTP {response.status_code} for {neighbourhood} in group {neighbourhood_group}, moving on.")
            continue

        # Several elements may carry a price, so collect all of them.
        soup = BeautifulSoup(response.content, 'html.parser')
        price_elements = soup.find_all('span', class_='avg-rental-list__result')
        if not price_elements:
            print(f"Price not found for {neighbourhood} in group {neighbourhood_group}, moving on.")
            continue

        # Keep the stripped text of every price element for this neighbourhood.
        average_prices[neighbourhood] = [
            price_element.get_text(strip=True) for price_element in price_elements
        ]

# Print the dictionary containing neighbourhoods and their scraped prices
print(average_prices)

Price not found for columbia-st in group brooklyn, moving on.
Price not found for civic-center in group manhattan, moving on.
Price not found for navy-yard in group brooklyn, moving on.
Price not found for howland-hook in group staten-island, moving on.
Price not found for douglaston in group queens, moving on.
Price not found for edenwald in group bronx, moving on.
Price not found for fort-wadsworth in group staten-island, moving on.
{'kensington': ['$2,100', '$2,100', '$2,500', '$3,300'], 'midtown': ['$3,200', '$4,800', '$8,250', '$18,000'], 'harlem': ['$1,998', '$3,027', '$3,500', '$2,850'], 'clinton-hill': ['$2,100', '$2,850', '$4,035', '$4,500'], 'east-harlem': ['$2,575', '$2,600', '$3,400', '$3,600'], 'murray-hill': ['$2,950', '$3,600', '$5,500', '$5,500'], 'bedford-stuyvesant': ['$2,000', '$2,700', '$2,800', '$3,399'], 'hells-kitchen': ['$2,500', '$4,200', '$4,350', '$6,500'], 'upper-west-side': ['$2,490', '$3,495', '$4,950', '$7,350'], 'chinatown': ['$1,900', '$2,500', '$3,200'

In [568]:
def _to_amount(raw_price):
    """Turn price text such as '$2,100' into a float; return None when the
    text is not numeric (for example 'N/A')."""
    try:
        return float(raw_price.replace('$', '').replace(',', ''))
    except ValueError:
        return None


# One record per scraped neighbourhood: its name and the median of its valid
# prices, or None when none of its prices could be parsed.
neighbourhood_data = []
for area, raw_prices in average_prices.items():
    amounts = [amount for amount in map(_to_amount, raw_prices) if amount is not None]
    neighbourhood_data.append({
        'neighbourhood': area,
        'median_price': statistics.median(amounts) if amounts else None,
    })

# Build the DataFrame in one go from the collected records
df_median_prices = pd.DataFrame(neighbourhood_data)

In [569]:
# Drop, in place, every row that has a missing value in any column
df_median_prices.dropna(axis=0, how='any', inplace=True)

In [570]:
# Display the neighbourhood / median_price table as the cell output
df_median_prices

Unnamed: 0,neighbourhood,median_price
0,kensington,2300.0
1,midtown,6525.0
2,harlem,2938.5
3,clinton-hill,3442.5
4,east-harlem,3000.0
...,...,...
200,westchester-square,1750.0
201,little-neck,2300.0
203,unionport,1699.5
204,mill-basin,1859.5


In [571]:
# Display the property-related columns selected from df as the cell output
df_property

Unnamed: 0,property_id,description,neighbourhood,room_type,price,minimum_nights,reviews_per_month,availability_365,host_id
0,2539,Clean & quiet apt home by the park,kensington,Private room,149,1,0.21,365,2787
1,2595,Skylit Midtown Castle,midtown,Entire home/apt,225,1,0.38,355,2845
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,harlem,Private room,150,3,0,365,4632
3,3831,Cozy Entire Floor of Brownstone,clinton-hill,Entire home/apt,89,1,4.64,194,4869
4,5022,Entire Apt: Spacious Studio/Loft by central park,east-harlem,Entire home/apt,80,10,0.1,0,7192
...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,bedford-stuyvesant,Private room,70,2,0,9,8232441
48891,36485057,Affordable room in Bushwick/East Williamsburg,bushwick,Private room,40,4,0,36,6570630
48892,36485431,Sunny Studio at Historical Neighborhood,harlem,Entire home/apt,115,10,0,27,23492952
48893,36485609,43rd St. Time Square-cozy single bed,hells-kitchen,Shared room,55,1,0,2,30985759


In [572]:
# Display the host-related columns selected from df as the cell output
df_host

Unnamed: 0,host_id,host_name,calculated_host_listings_count
0,2787,John,6
1,2845,Jennifer,2
2,4632,Elisabeth,1
3,4869,LisaRoxanne,1
4,7192,Laura,1
...,...,...,...
48890,8232441,Sabrina,2
48891,6570630,Marisol,2
48892,23492952,Ilgar & Aysel,1
48893,30985759,Taz,6
