In [1]:
import pandas as pd
import geopandas as gpd
import json

# Reading CSV file
def read_csv(file_path):
    try:
        df = pd.read_csv(file_path)
        print(f"Data from {file_path} loaded successfully!")
        return df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading Excel file
def read_excel(file_path):
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
        print(f"Data from {file_path} loaded successfully!")
        return df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading JSON file
def read_json(file_path):
    try:
        df = pd.read_json(file_path)
        print(f"Data from {file_path} loaded successfully!")
        return df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading GeoJSON file
def read_geojson(file_path):
    try:
        # First attempt: Try using geopandas
        try:
            gdf = gpd.read_file(file_path)
            print(f"Data from {file_path} loaded successfully using GeoPandas!")
            return gdf
        except ImportError:
            # If geopandas is not installed, fall back to manual JSON parsing
            with open(file_path, 'r', encoding='utf-8') as f:
                geojson_data = json.load(f)
            
            # Extract features and properties
            features = []
            for feature in geojson_data['features']:
                # Get properties
                properties = feature['properties']
                
                # Get geometry
                geometry = feature['geometry']
                
                # Combine properties and geometry into one dictionary
                feature_dict = {
                    **properties,
                    'geometry_type': geometry['type'],
                    'coordinates': str(geometry['coordinates'])  # Convert to string to avoid nested structure
                }
                features.append(feature_dict)
            
            # Convert to pandas DataFrame
            df = pd.DataFrame(features)
            print(f"Data from {file_path} loaded successfully using manual parsing!")
            return df
            
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading a dataset from any of these formats
def read_dataset(file_path):
    # Detect file extension to determine how to read it
    if file_path.endswith('.csv'):
        return read_csv(file_path)
    elif file_path.endswith('.xlsx'):
        return read_excel(file_path)
    elif file_path.endswith('.json'):
        return read_json(file_path)
    elif file_path.endswith('.geojson'):
        return read_geojson(file_path)
    else:
        print("Unsupported file type")
        return None

# Helper function to check if geopandas is installed
def is_geopandas_available():
    try:
        import geopandas
        return True
    except ImportError:
        return False

# Helper function to install geopandas if needed
def install_geopandas():
    try:
        import subprocess
        subprocess.check_call(["pip", "install", "geopandas"])
        print("GeoPandas installed successfully!")
        return True
    except Exception as e:
        print(f"Error installing GeoPandas: {e}")
        return False

In [2]:
# Example usage
merged_calendar = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv")
listings = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv")        
neighbourhoods = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.csv")   
neighbourhoods_geojson = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.geojson")     
reviews = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\reviews.csv")


# If you'd like to inspect the loaded data
if merged_calendar is not None:
    print(merged_calendar.head())
if listings is not None:
    print(listings.head())
if neighbourhoods is not None:
    print(neighbourhoods.head())
if neighbourhoods_geojson is not None:
    print(neighbourhoods_geojson.head())
if reviews is not None:
    print(reviews.head())

Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.geojson loaded successfully using GeoPandas!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\reviews.csv loaded successfully!
   listing_id        date  price
0        3109  2023-09-05  110.0
1        3109  2023-09-06  110.0
2        3109  2023-09-07  110.0
3        3109  2023-09-08  110.0
4        3109  2023-09-09  110.0
       id                          listing_url       scrape_id last_scraped  \
0    3109    https://www.airbnb.com/rooms/3109  20240906025355   2024-09-11   
1    5396

In [3]:
import pandas as pd
import json
import re
from collections import Counter

def read_dataset(file_path):
    # Your existing data loading function
    return pd.read_csv(file_path)

def get_top_amenities(listings_df, n=20):
    # Check if amenities column exists
    if 'amenities' not in listings_df.columns:
        print("No amenities column found in the DataFrame")
        print(f"Available columns: {', '.join(listings_df.columns)}")
        return []
    
    # Function to parse amenities string to list
    def parse_amenities(amenities_str):
        try:
            # Handle different formats of amenities strings
            if pd.isna(amenities_str) or amenities_str == '':
                return []
            
            # Clean the string if needed
            cleaned_str = amenities_str
            
            # If the string is already wrapped in quotes, remove them
            if cleaned_str.startswith('"') and cleaned_str.endswith('"'):
                cleaned_str = cleaned_str[1:-1]
            
            # Try to parse as JSON
            try:
                return json.loads(cleaned_str)
            except:
                # If direct parsing fails, try to fix the format
                cleaned_str = re.sub(r"'", '"', cleaned_str)
                return json.loads(cleaned_str)
        except Exception as e:
            print(f"Error parsing amenities: {e}")
            print(f"Problematic string: {amenities_str[:100]}...")
            return []
    
    # Apply the function to parse all amenities
    print("Parsing amenities...")
    amenities_lists = listings_df['amenities'].apply(parse_amenities)
    
    # Flatten the list of lists into a single list of all amenities
    all_amenities = []
    for amenities in amenities_lists:
        all_amenities.extend(amenities)
    
    # Count the frequency of each amenity
    amenity_counts = Counter(all_amenities)
    
    # Get the top N most common amenities
    top_amenities = amenity_counts.most_common(n)
    
    print(f"\nTop {n} most frequent amenities:")
    for i, (amenity, count) in enumerate(top_amenities, 1):
        print(f"{i}. {amenity}: {count} occurrences")
    
    return top_amenities

# Usage:
if __name__ == "__main__":
    # Load your dataset using your existing function
    listings = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv")
    
    # Get top 20 amenities
    top_amenities = get_top_amenities(listings)

Parsing amenities...

Top 20 most frequent amenities:
1. Kitchen: 89849 occurrences
2. Wifi: 86058 occurrences
3. Smoke alarm: 72153 occurrences
4. Essentials: 71263 occurrences
5. Hot water: 68079 occurrences
6. Hair dryer: 65660 occurrences
7. Washer: 62149 occurrences
8. Iron: 61074 occurrences
9. Dishes and silverware: 60320 occurrences
10. Bed linens: 59760 occurrences
11. Hangers: 59613 occurrences
12. Cooking basics: 59238 occurrences
13. Refrigerator: 56867 occurrences
14. Heating: 55835 occurrences
15. TV: 49652 occurrences
16. Microwave: 48051 occurrences
17. Shampoo: 46959 occurrences
18. Hot water kettle: 41882 occurrences
19. Dedicated workspace: 39715 occurrences
20. Cleaning products: 38458 occurrences


In [4]:
import pandas as pd
import json
import re
from collections import Counter

def read_dataset(file_path):
    # Your existing data loading function
    return pd.read_csv(file_path)

def add_top_amenities_as_columns(listings_df, n=20):
    # Check if amenities column exists
    if 'amenities' not in listings_df.columns:
        print("No amenities column found in the DataFrame")
        print(f"Available columns: {', '.join(listings_df.columns)}")
        return listings_df
    
    # Function to parse amenities string to list
    def parse_amenities(amenities_str):
        try:
            # Handle different formats of amenities strings
            if pd.isna(amenities_str) or amenities_str == '':
                return []
            
            # Clean the string if needed
            cleaned_str = amenities_str
            
            # If the string is already wrapped in quotes, remove them
            if cleaned_str.startswith('"') and cleaned_str.endswith('"'):
                cleaned_str = cleaned_str[1:-1]
            
            # Try to parse as JSON
            try:
                return json.loads(cleaned_str)
            except:
                # If direct parsing fails, try to fix the format
                cleaned_str = re.sub(r"'", '"', cleaned_str)
                return json.loads(cleaned_str)
        except Exception as e:
            print(f"Error parsing amenities: {e}")
            print(f"Problematic string: {amenities_str[:100]}...")
            return []
    
    # Apply the function to parse all amenities
    print("Parsing amenities...")
    listings_df['parsed_amenities'] = listings_df['amenities'].apply(parse_amenities)
    
    # Flatten the list of lists into a single list of all amenities
    all_amenities = []
    for amenities in listings_df['parsed_amenities']:
        all_amenities.extend(amenities)
    
    # Count the frequency of each amenity
    amenity_counts = Counter(all_amenities)
    
    # Get the top N most common amenities
    top_amenities = [amenity for amenity, count in amenity_counts.most_common(n)]
    
    print(f"\nAdding top {n} most frequent amenities as boolean columns:")
    for i, amenity in enumerate(top_amenities, 1):
        print(f"{i}. {amenity}")
        
        # Add a boolean column for each top amenity
        column_name = f"has_{amenity.lower().replace(' ', '_').replace('-', '_')}"
        listings_df[column_name] = listings_df['parsed_amenities'].apply(lambda x: 1 if amenity in x else 0)
    
    # Drop the temporary parsed_amenities column
    listings_df = listings_df.drop('parsed_amenities', axis=1)
    
    return listings_df

# Usage:
if __name__ == "__main__":
    # Load your dataset using your existing function
    listings = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv")
    
    # Add top 20 amenities as boolean columns
    enhanced_listings = add_top_amenities_as_columns(listings)
    
    # Save the enhanced dataset
    enhanced_listings.to_csv(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings_with_amenities.csv", index=False)
    
    print(f"\nDataset shape after adding amenity columns: {enhanced_listings.shape}")

Parsing amenities...

Adding top 20 most frequent amenities as boolean columns:
1. Kitchen
2. Wifi
3. Smoke alarm
4. Essentials
5. Hot water
6. Hair dryer
7. Washer
8. Iron
9. Dishes and silverware
10. Bed linens
11. Hangers
12. Cooking basics
13. Refrigerator
14. Heating
15. TV
16. Microwave
17. Shampoo
18. Hot water kettle
19. Dedicated workspace
20. Cleaning products

Dataset shape after adding amenity columns: (95461, 95)


In [5]:
df = enhanced_listings

In [6]:
df

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,has_hangers,has_cooking_basics,has_refrigerator,has_heating,has_tv,has_microwave,has_shampoo,has_hot_water_kettle,has_dedicated_workspace,has_cleaning_products
0,3109,https://www.airbnb.com/rooms/3109,20240906025355,2024-09-11,city scrape,zen and calm,Lovely Appartment with one bedroom with a Quee...,Good restaurants<br />very close the Montparna...,https://a0.muscache.com/pictures/miso/Hosting-...,3631,...,1,1,0,0,0,0,0,0,0,0
1,5396,https://www.airbnb.com/rooms/5396,20240906025355,2024-09-13,city scrape,Your perfect Paris studio on Île Saint-Louis,"NEW SOFA-BED SINCE JUNE 2023, Please disregard...","You are within walking distance to the Louvre,...",https://a0.muscache.com/pictures/52413/f9bf76f...,7903,...,1,1,1,1,0,0,1,1,1,1
2,7397,https://www.airbnb.com/rooms/7397,20240906025355,2024-09-06,city scrape,MARAIS - 2ROOMS APT - 2/4 PEOPLE,"VERY CONVENIENT, WITH THE BEST LOCATION !",,https://a0.muscache.com/pictures/67928287/330b...,2626,...,1,1,1,1,0,1,1,0,1,0
3,7964,https://www.airbnb.com/rooms/7964,20240906025355,2024-09-10,previous scrape,Sunny apartment with balcony,"We are renting our a spacious, sunny fully fur...",,https://a0.muscache.com/pictures/miso/Hosting-...,22155,...,1,1,1,1,0,1,0,1,1,1
4,241715,https://www.airbnb.com/rooms/241715,20240906025355,2024-09-11,city scrape,Big Cosy Appartement with 100 m2 Terrace in Paris,Come to stay in our unique Parisian flat to en...,"The 19th arrondissement of Paris, located in t...",https://a0.muscache.com/pictures/miso/Hosting-...,3342097,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95456,1238568635341928457,https://www.airbnb.com/rooms/1238568635341928457,20240906025355,2024-09-13,city scrape,"Spacieux studio 4P, porte st cloud",Forget your worries at this spacious (42m2) an...,,https://a0.muscache.com/pictures/hosting/Hosti...,539077658,...,1,1,1,1,1,1,1,1,0,1
95457,1238594326004996232,https://www.airbnb.com/rooms/1238594326004996232,20240906025355,2024-09-11,city scrape,Luxury 3BR Arc Triomphe Champs Élysée,Spacious 3 Bedroom Apartment - Close to Champs...,,https://a0.muscache.com/pictures/hosting/Hosti...,492686946,...,0,0,1,0,1,1,1,1,1,1
95458,1238755949783144791,https://www.airbnb.com/rooms/1238755949783144791,20240906025355,2024-09-07,city scrape,[F-16] 파리 5인여성 다인실방,Centrally located with the best accessibility.,,https://a0.muscache.com/pictures/miso/Hosting-...,264290202,...,0,0,0,0,0,0,0,0,1,0
95459,1239120896239244086,https://www.airbnb.com/rooms/1239120896239244086,20240906025355,2024-09-13,city scrape,Large and bright flat nearby Montmartre - Paris,MIDTERM STAY ONLY - Located close to the mythi...,"Located 10 min away on foot from Montmartre, y...",https://a0.muscache.com/pictures/prohost-api/H...,169497320,...,1,1,1,1,0,0,0,1,1,0


In [7]:
import pandas as pd

def clean_data(df):
    # Drop columns: 'listing_url', 'scrape_id' and 68 other columns
    df = df.drop(columns=['listing_url', 'price','scrape_id', 'last_scraped', 'source', 'name', 'neighborhood_overview', 'description', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_total_listings_count', 'host_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_group_cleansed', 'bathrooms', 'amenities', 'beds', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'license', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'reviews_per_month'])
    return df

cleaned_listings = clean_data(df.copy())
cleaned_listings.head()

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,has_kitchen,...,has_hangers,has_cooking_basics,has_refrigerator,has_heating,has_tv,has_microwave,has_shampoo,has_hot_water_kettle,has_dedicated_workspace,has_cleaning_products
0,3109,Observatoire,48.83191,2.3187,Entire rental unit,Entire home/apt,2,1 bath,1.0,1,...,1,1,0,0,0,0,0,0,0,0
1,5396,Hôtel-de-Ville,48.85247,2.35835,Entire rental unit,Entire home/apt,2,1 bath,0.0,1,...,1,1,1,1,0,0,1,1,1,1
2,7397,Hôtel-de-Ville,48.85909,2.35315,Entire rental unit,Entire home/apt,4,1 bath,2.0,1,...,1,1,1,1,0,1,1,0,1,0
3,7964,Opéra,48.87417,2.34245,Entire rental unit,Entire home/apt,3,1 bath,2.0,1,...,1,1,1,1,0,1,0,1,1,1
4,241715,Buttes-Chaumont,48.893464,2.378341,Entire rental unit,Entire home/apt,6,1 bath,3.0,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
def clean_data(cleaned_listings):
    # Rename column 'id' to 'listing_id'
    cleaned_listings = cleaned_listings.rename(columns={'id': 'listing_id'})
    return cleaned_listings

cleaned_listings_clean = clean_data(cleaned_listings.copy())
cleaned_listings_clean.head()

Unnamed: 0,listing_id,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,has_kitchen,...,has_hangers,has_cooking_basics,has_refrigerator,has_heating,has_tv,has_microwave,has_shampoo,has_hot_water_kettle,has_dedicated_workspace,has_cleaning_products
0,3109,Observatoire,48.83191,2.3187,Entire rental unit,Entire home/apt,2,1 bath,1.0,1,...,1,1,0,0,0,0,0,0,0,0
1,5396,Hôtel-de-Ville,48.85247,2.35835,Entire rental unit,Entire home/apt,2,1 bath,0.0,1,...,1,1,1,1,0,0,1,1,1,1
2,7397,Hôtel-de-Ville,48.85909,2.35315,Entire rental unit,Entire home/apt,4,1 bath,2.0,1,...,1,1,1,1,0,1,1,0,1,0
3,7964,Opéra,48.87417,2.34245,Entire rental unit,Entire home/apt,3,1 bath,2.0,1,...,1,1,1,1,0,1,0,1,1,1
4,241715,Buttes-Chaumont,48.893464,2.378341,Entire rental unit,Entire home/apt,6,1 bath,3.0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Further filter merged_df_clean to keep only "Entire rental unit" property types
cleaned_listings_clean = cleaned_listings_clean[cleaned_listings_clean['property_type'] == 'Entire rental unit']

# Check how many rows remain after filtering
print(f"Number of rows after filtering for 'Entire rental unit': {len(cleaned_listings_clean)}")

Number of rows after filtering for 'Entire rental unit': 80176


In [10]:
# Further filter merged_df_clean to keep only "Entire rental unit" property types
cleaned_listings_clean = cleaned_listings_clean[cleaned_listings_clean['room_type'] == 'Entire home/apt']

# Check how many rows remain after filtering
print(f"Number of rows after filtering for 'Entire rental unit': {len(cleaned_listings_clean)}")

Number of rows after filtering for 'Entire rental unit': 80176


In [11]:
import pandas as pd

def clean_cleaned_data(df):
    # Drop columns
    df = df.drop(columns=['room_type','property_type'])
    return df

cleaned_listings_clean = clean_cleaned_data(cleaned_listings_clean)
cleaned_listings_clean.head()

Unnamed: 0,listing_id,neighbourhood_cleansed,latitude,longitude,accommodates,bathrooms_text,bedrooms,has_kitchen,has_wifi,has_smoke_alarm,...,has_hangers,has_cooking_basics,has_refrigerator,has_heating,has_tv,has_microwave,has_shampoo,has_hot_water_kettle,has_dedicated_workspace,has_cleaning_products
0,3109,Observatoire,48.83191,2.3187,2,1 bath,1.0,1,1,1,...,1,1,0,0,0,0,0,0,0,0
1,5396,Hôtel-de-Ville,48.85247,2.35835,2,1 bath,0.0,1,1,1,...,1,1,1,1,0,0,1,1,1,1
2,7397,Hôtel-de-Ville,48.85909,2.35315,4,1 bath,2.0,1,1,1,...,1,1,1,1,0,1,1,0,1,0
3,7964,Opéra,48.87417,2.34245,3,1 bath,2.0,1,1,0,...,1,1,1,1,0,1,0,1,1,1
4,241715,Buttes-Chaumont,48.893464,2.378341,6,1 bath,3.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Extract only the numerical part from bathrooms_text
cleaned_listings_clean['bathrooms'] = cleaned_listings_clean['bathrooms_text'].str.extract(r'(^\d+\.?\d*)').astype(float)

# Check for special cases with "half" that might not be captured correctly
half_bath_mask = cleaned_listings_clean['bathrooms_text'].str.contains(r'\d+\s*and a half|\d+\s*\+\s*half', na=False)
if any(half_bath_mask):
    # Extract the whole number and add 0.5 for the half bath
    cleaned_listings_clean.loc[half_bath_mask, 'bathrooms'] = cleaned_listings_clean.loc[half_bath_mask, 'bathrooms_text'].str.extract(r'(\d+)').astype(float) + 0.5

# Check the result
print(cleaned_listings_clean[['bathrooms_text', 'bathrooms']].head(10))
print("\nUnique bathroom values:")
print(cleaned_listings_clean['bathrooms'].value_counts().sort_index())

  bathrooms_text  bathrooms
0         1 bath        1.0
1         1 bath        1.0
2         1 bath        1.0
3         1 bath        1.0
4         1 bath        1.0
5         1 bath        1.0
6         1 bath        1.0
7      1.5 baths        1.5
8         1 bath        1.0
9         1 bath        1.0

Unique bathroom values:
bathrooms
0.0      291
1.0    65507
1.5     5487
2.0     5852
2.5     1391
3.0      909
3.5      250
4.0      151
4.5       44
5.0       32
5.5       11
6.0        8
6.5        6
7.0        5
7.5        2
Name: count, dtype: int64


In [13]:
cleaned_listings_clean = cleaned_listings_clean.drop(columns=['bathrooms_text'])

In [14]:
cleaned_listings_clean

Unnamed: 0,listing_id,neighbourhood_cleansed,latitude,longitude,accommodates,bedrooms,has_kitchen,has_wifi,has_smoke_alarm,has_essentials,...,has_cooking_basics,has_refrigerator,has_heating,has_tv,has_microwave,has_shampoo,has_hot_water_kettle,has_dedicated_workspace,has_cleaning_products,bathrooms
0,3109,Observatoire,48.831910,2.318700,2,1.0,1,1,1,0,...,1,0,0,0,0,0,0,0,0,1.0
1,5396,Hôtel-de-Ville,48.852470,2.358350,2,0.0,1,1,1,1,...,1,1,1,0,0,1,1,1,1,1.0
2,7397,Hôtel-de-Ville,48.859090,2.353150,4,2.0,1,1,1,1,...,1,1,1,0,1,1,0,1,0,1.0
3,7964,Opéra,48.874170,2.342450,3,2.0,1,1,0,1,...,1,1,1,0,1,0,1,1,1,1.0
4,241715,Buttes-Chaumont,48.893464,2.378341,6,3.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95455,1238563676697016528,Entrepôt,48.874400,2.371717,4,1.0,1,1,0,0,...,1,1,1,1,0,1,1,1,0,1.0
95456,1238568635341928457,Passy,48.838180,2.257970,4,1.0,1,1,0,1,...,1,1,1,1,1,1,1,0,1,1.0
95457,1238594326004996232,Batignolles-Monceau,48.875698,2.289969,6,3.0,1,0,1,1,...,0,1,0,1,1,1,1,1,1,2.5
95459,1239120896239244086,Buttes-Montmartre,48.892113,2.360589,1,,1,1,1,0,...,1,1,1,0,0,0,1,1,0,1.0


In [15]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
 
# Encode categorical variables
le = LabelEncoder()
cleaned_listings_clean['neighbourhood_cleansed_encoded'] = le.fit_transform(cleaned_listings_clean['neighbourhood_cleansed'])

In [16]:
# Create a mapping dictionary between encoded values and original neighborhoods
neighborhood_mapping = dict(zip(le.transform(le.classes_), le.classes_))

# Display the mapping as a DataFrame for better readability
import pandas as pd
legend_df = pd.DataFrame({
    'Encoded Value': list(neighborhood_mapping.keys()),
    'Neighborhood': list(neighborhood_mapping.values())
}).sort_values('Encoded Value')

print("Neighborhood Encoding Legend:")
print(legend_df)

# Optionally, save this mapping to a CSV file for future reference
legend_df.to_csv('neighborhood_encoding_legend.csv', index=False)
print("\nLegend saved to 'neighborhood_encoding_legend.csv'")

Neighborhood Encoding Legend:
    Encoded Value         Neighborhood
0               0  Batignolles-Monceau
1               1               Bourse
2               2      Buttes-Chaumont
3               3    Buttes-Montmartre
4               4             Entrepôt
5               5             Gobelins
6               6       Hôtel-de-Ville
7               7               Louvre
8               8           Luxembourg
9               9         Ménilmontant
10             10         Observatoire
11             11                Opéra
12             12       Palais-Bourbon
13             13             Panthéon
14             14                Passy
15             15           Popincourt
16             16              Reuilly
17             17               Temple
18             18            Vaugirard
19             19               Élysée

Legend saved to 'neighborhood_encoding_legend.csv'


In [17]:
cleaned_listings_clean

Unnamed: 0,listing_id,neighbourhood_cleansed,latitude,longitude,accommodates,bedrooms,has_kitchen,has_wifi,has_smoke_alarm,has_essentials,...,has_refrigerator,has_heating,has_tv,has_microwave,has_shampoo,has_hot_water_kettle,has_dedicated_workspace,has_cleaning_products,bathrooms,neighbourhood_cleansed_encoded
0,3109,Observatoire,48.831910,2.318700,2,1.0,1,1,1,0,...,0,0,0,0,0,0,0,0,1.0,10
1,5396,Hôtel-de-Ville,48.852470,2.358350,2,0.0,1,1,1,1,...,1,1,0,0,1,1,1,1,1.0,6
2,7397,Hôtel-de-Ville,48.859090,2.353150,4,2.0,1,1,1,1,...,1,1,0,1,1,0,1,0,1.0,6
3,7964,Opéra,48.874170,2.342450,3,2.0,1,1,0,1,...,1,1,0,1,0,1,1,1,1.0,11
4,241715,Buttes-Chaumont,48.893464,2.378341,6,3.0,1,1,0,0,...,0,0,0,0,0,0,0,0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95455,1238563676697016528,Entrepôt,48.874400,2.371717,4,1.0,1,1,0,0,...,1,1,1,0,1,1,1,0,1.0,4
95456,1238568635341928457,Passy,48.838180,2.257970,4,1.0,1,1,0,1,...,1,1,1,1,1,1,0,1,1.0,14
95457,1238594326004996232,Batignolles-Monceau,48.875698,2.289969,6,3.0,1,0,1,1,...,1,0,1,1,1,1,1,1,2.5,0
95459,1239120896239244086,Buttes-Montmartre,48.892113,2.360589,1,,1,1,1,0,...,1,1,0,0,0,1,1,0,1.0,3


In [18]:
# Check how many missing values we have in each column
print(f"Missing values in bedrooms: {cleaned_listings_clean['bedrooms'].isnull().sum()}")
print(f"Missing values in bathrooms: {cleaned_listings_clean['bathrooms'].isnull().sum()}")

# Calculate the median for each column
bedrooms_median = cleaned_listings_clean['bedrooms'].median()
bathrooms_median = cleaned_listings_clean['bathrooms'].median()

print(f"\nMedian values:")
print(f"Bedrooms median: {bedrooms_median}")
print(f"Bathrooms median: {bathrooms_median}")

# Impute missing values with the median
cleaned_listings_clean['bedrooms'] = cleaned_listings_clean['bedrooms'].fillna(bedrooms_median)
cleaned_listings_clean['bathrooms'] = cleaned_listings_clean['bathrooms'].fillna(bathrooms_median)

# Verify no missing values remain
print(f"\nAfter imputation:")
print(f"Missing values in bedrooms: {cleaned_listings_clean['bedrooms'].isnull().sum()}")
print(f"Missing values in bathrooms: {cleaned_listings_clean['bathrooms'].isnull().sum()}")

Missing values in bedrooms: 4430
Missing values in bathrooms: 230

Median values:
Bedrooms median: 1.0
Bathrooms median: 1.0

After imputation:
Missing values in bedrooms: 0
Missing values in bathrooms: 0


In [19]:
cleaned_listings_clean

Unnamed: 0,listing_id,neighbourhood_cleansed,latitude,longitude,accommodates,bedrooms,has_kitchen,has_wifi,has_smoke_alarm,has_essentials,...,has_refrigerator,has_heating,has_tv,has_microwave,has_shampoo,has_hot_water_kettle,has_dedicated_workspace,has_cleaning_products,bathrooms,neighbourhood_cleansed_encoded
0,3109,Observatoire,48.831910,2.318700,2,1.0,1,1,1,0,...,0,0,0,0,0,0,0,0,1.0,10
1,5396,Hôtel-de-Ville,48.852470,2.358350,2,0.0,1,1,1,1,...,1,1,0,0,1,1,1,1,1.0,6
2,7397,Hôtel-de-Ville,48.859090,2.353150,4,2.0,1,1,1,1,...,1,1,0,1,1,0,1,0,1.0,6
3,7964,Opéra,48.874170,2.342450,3,2.0,1,1,0,1,...,1,1,0,1,0,1,1,1,1.0,11
4,241715,Buttes-Chaumont,48.893464,2.378341,6,3.0,1,1,0,0,...,0,0,0,0,0,0,0,0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95455,1238563676697016528,Entrepôt,48.874400,2.371717,4,1.0,1,1,0,0,...,1,1,1,0,1,1,1,0,1.0,4
95456,1238568635341928457,Passy,48.838180,2.257970,4,1.0,1,1,0,1,...,1,1,1,1,1,1,0,1,1.0,14
95457,1238594326004996232,Batignolles-Monceau,48.875698,2.289969,6,3.0,1,0,1,1,...,1,0,1,1,1,1,1,1,2.5,0
95459,1239120896239244086,Buttes-Montmartre,48.892113,2.360589,1,1.0,1,1,1,0,...,1,1,0,0,0,1,1,0,1.0,3


In [20]:
# Define the output file path 
output_file_path = "cleaned_listings_final.csv"

# Write the cleaned DataFrame to a CSV file
cleaned_listings_clean.to_csv(output_file_path, index=False)

print(f"Successfully saved cleaned data to '{output_file_path}'")
print(f"DataFrame shape: {cleaned_listings_clean.shape}")

Successfully saved cleaned data to 'cleaned_listings_final.csv'
DataFrame shape: (80176, 28)
