In [606]:
import pandas as pd
import statistics

# Load the raw Airbnb listings, clean them, and keep the columns used by the
# rest of the notebook. After this cell `df` is the full cleaned frame and
# `df_cleaned` is the column subset that is later merged with the price table.


def _normalise_place_names(names: pd.Series) -> pd.Series:
    """Return place names as lower-case, hyphen-separated slugs.

    Apostrophes and full stops are removed, spaces are replaced by hyphens
    and the result is lower-cased (e.g. "Hell's Kitchen" -> "hells-kitchen").
    All replacements are literal (regex=False).
    """
    return (
        names.str.replace("'", "", regex=False)
        .str.replace(".", "", regex=False)
        .str.replace(" ", "-", regex=False)
        .str.lower()
    )


# Load the dataset, rename two columns for clarity and drop rows in which
# every value is missing.
df = (
    pd.read_csv('Data/AB_NYC_2019.csv')
    .rename(columns={'name': 'description', 'id': 'property_id'})
    .dropna(how='all')
)

# Listings that have never been reviewed.
never_reviewed = df['number_of_reviews'] == 0

# A listing with no reviews has a review rate of 0. The column is
# deliberately NOT filled with a text placeholder such as 'Unknown': mixing
# strings into it would turn it into an object column and break numeric work
# on it. Any other gaps stay as NaN.
df.loc[never_reviewed, 'reviews_per_month'] = 0

# 'last_review' gets a text placeholder: 'Never' when the listing has no
# reviews, 'Unknown' when the date is missing for any other reason.
df['last_review'] = df['last_review'].fillna('Unknown')
df.loc[never_reviewed, 'last_review'] = 'Never'

# Normalise both the 'neighbourhood_group' and 'neighbourhood' columns the
# same way.
df['neighbourhood_group'] = _normalise_place_names(df['neighbourhood_group'])
df['neighbourhood'] = _normalise_place_names(df['neighbourhood'])

# Mapping from (already normalised) neighbourhood names in the listings to
# the names they should be replaced with.
neighbourhood_mapping = {
    'bay-terrace,-staten-island': 'bay-terrace',
    'east-morrisania': 'morrisania',
    'fordham': 'fordham-manor',
    'sea-gate': 'seagate',
    'stuyvesant-town': 'stuyvesant-town-cooper-village',
    'concourse-village': 'concourse',
    'gramercy': 'gramercy-park',
    'rockaway-beach': 'rockaway-park',
    'flatbush': 'flatbush-ditmas-park',
}

# Replace whole values found in the mapping; all other names are unchanged.
df['neighbourhood'] = df['neighbourhood'].replace(neighbourhood_mapping)

# Column subset used for the merge with the neighbourhood price table.
df_cleaned = df[['property_id', 'description', 'neighbourhood', 'room_type', 'price', 'minimum_nights', 'number_of_reviews',
                 'reviews_per_month', 'availability_365', 'host_id', 'host_name', 'calculated_host_listings_count']]

In [561]:
# Split the listings into two narrower frames; 'host_id' appears in both.

# Property-related columns.
df_property = df.loc[:, [
    'property_id',
    'description',
    'neighbourhood',
    'room_type',
    'price',
    'minimum_nights',
    'reviews_per_month',
    'availability_365',
    'host_id',
]]

# Host-related columns. Rows are not de-duplicated here.
df_host = df.loc[:, ['host_id', 'host_name', 'calculated_host_listings_count']]

In [563]:
# Build the table of distinct (neighbourhood_group, neighbourhood) pairs:
# take the two columns, drop repeated pairs, then renumber the rows from 0.
df_neighbourhoods = (
    df[['neighbourhood_group', 'neighbourhood']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Code to transfer to another notebook

# Continue

In [699]:
# Load the neighbourhood price table. Later cells read its 'median_price'
# column and join it to the listings on 'neighbourhood'.
df_nh_prices = pd.read_csv(filepath_or_buffer='Data/neighbourhood_prices.csv')

In [701]:
# Discount 'median_price' by a constant annual inflation rate compounded
# over `years` years, then round to whole units.
# NOTE(review): years = 6 suggests the prices are 2025 figures deflated to
# 2019, and 2% is a flat assumed rate — confirm both against the data source.
inflation_rate = 0.02
years = 6

# arp = apartment renting price
df_nh_prices['arp_2019'] = (
    df_nh_prices['median_price'] / (1 + inflation_rate) ** years
).round()

In [703]:
# Remove the original price column in place now that 'arp_2019' replaces it.
# Re-running this cell on its own raises KeyError because the column is gone.
df_nh_prices.drop(columns=['median_price'], inplace=True)

In [705]:
# Display the price table to check it: 'arp_2019' is present and
# 'median_price' has been removed.
df_nh_prices

Unnamed: 0,neighbourhood,arp_2019
0,kensington,2042.0
1,midtown,5794.0
2,harlem,2609.0
3,clinton-hill,3057.0
4,east-harlem,2664.0
...,...,...
153,westchester-square,1554.0
154,little-neck,2042.0
155,unionport,1509.0
156,mill-basin,1651.0


In [707]:
# Attach the price-table columns to every listing by neighbourhood. The left
# join keeps all listings; a listing whose neighbourhood is missing from the
# price table gets NaN in the added columns.
df_merged = df_cleaned.merge(df_nh_prices, how='left', on='neighbourhood')

In [714]:
# Preview only the rows that have no missing value in any column.
# NOTE(review): the result is displayed but not assigned, so df_merged itself
# is unchanged and still contains the incomplete rows — confirm that is intended.
df_merged.dropna(axis='index', how='any')

Unnamed: 0,property_id,description,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365,host_id,host_name,calculated_host_listings_count,arp_2019
0,2539,Clean & quiet apt home by the park,kensington,Private room,149,1,9,0.21,365,2787,John,6,2042.0
1,2595,Skylit Midtown Castle,midtown,Entire home/apt,225,1,45,0.38,355,2845,Jennifer,2,5794.0
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,harlem,Private room,150,3,0,0,365,4632,Elisabeth,1,2609.0
3,3831,Cozy Entire Floor of Brownstone,clinton-hill,Entire home/apt,89,1,270,4.64,194,4869,LisaRoxanne,1,3057.0
4,5022,Entire Apt: Spacious Studio/Loft by central park,east-harlem,Entire home/apt,80,10,9,0.1,0,7192,Laura,1,2664.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,bedford-stuyvesant,Private room,70,2,0,0,9,8232441,Sabrina,2,2442.0
48891,36485057,Affordable room in Bushwick/East Williamsburg,bushwick,Private room,40,4,0,0,36,6570630,Marisol,2,2553.0
48892,36485431,Sunny Studio at Historical Neighborhood,harlem,Entire home/apt,115,10,0,0,27,23492952,Ilgar & Aysel,1,2609.0
48893,36485609,43rd St. Time Square-cozy single bed,hells-kitchen,Shared room,55,1,0,0,2,30985759,Taz,6,3796.0


In [716]:
# Write the merged listings to CSV without the DataFrame index column.
df_merged.to_csv(path_or_buf='AirBnB_NY_arp2019.csv', index=False)