In [1]:
import pandas as pd
import geopandas as gpd
import json

# Reading CSV file
def read_csv(file_path):
    try:
        df = pd.read_csv(file_path)
        print(f"Data from {file_path} loaded successfully!")
        return df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading Excel file
def read_excel(file_path):
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
        print(f"Data from {file_path} loaded successfully!")
        return df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading JSON file
def read_json(file_path):
    try:
        df = pd.read_json(file_path)
        print(f"Data from {file_path} loaded successfully!")
        return df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading GeoJSON file
def read_geojson(file_path):
    try:
        # First attempt: Try using geopandas
        try:
            gdf = gpd.read_file(file_path)
            print(f"Data from {file_path} loaded successfully using GeoPandas!")
            return gdf
        except ImportError:
            # If geopandas is not installed, fall back to manual JSON parsing
            with open(file_path, 'r', encoding='utf-8') as f:
                geojson_data = json.load(f)
            
            # Extract features and properties
            features = []
            for feature in geojson_data['features']:
                # Get properties
                properties = feature['properties']
                
                # Get geometry
                geometry = feature['geometry']
                
                # Combine properties and geometry into one dictionary
                feature_dict = {
                    **properties,
                    'geometry_type': geometry['type'],
                    'coordinates': str(geometry['coordinates'])  # Convert to string to avoid nested structure
                }
                features.append(feature_dict)
            
            # Convert to pandas DataFrame
            df = pd.DataFrame(features)
            print(f"Data from {file_path} loaded successfully using manual parsing!")
            return df
            
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading a dataset from any of these formats
def read_dataset(file_path):
    # Detect file extension to determine how to read it
    if file_path.endswith('.csv'):
        return read_csv(file_path)
    elif file_path.endswith('.xlsx'):
        return read_excel(file_path)
    elif file_path.endswith('.json'):
        return read_json(file_path)
    elif file_path.endswith('.geojson'):
        return read_geojson(file_path)
    else:
        print("Unsupported file type")
        return None

# Helper function to check if geopandas is installed
def is_geopandas_available():
    try:
        import geopandas
        return True
    except ImportError:
        return False

# Helper function to install geopandas if needed
def install_geopandas():
    try:
        import subprocess
        subprocess.check_call(["pip", "install", "geopandas"])
        print("GeoPandas installed successfully!")
        return True
    except Exception as e:
        print(f"Error installing GeoPandas: {e}")
        return False

In [2]:
# Example usage
merged_calendar = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv")
listings = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv")        
neighbourhoods = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.csv")   
neighbourhoods_geojson = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.geojson")     
reviews = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\reviews.csv")


# If you'd like to inspect the loaded data
if merged_calendar is not None:
    print(merged_calendar.head())
if listings is not None:
    print(listings.head())
if neighbourhoods is not None:
    print(neighbourhoods.head())
if neighbourhoods_geojson is not None:
    print(neighbourhoods_geojson.head())
if reviews is not None:
    print(reviews.head())

Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.geojson loaded successfully using GeoPandas!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\reviews.csv loaded successfully!
   listing_id        date  price
0        3109  2023-09-05  110.0
1        3109  2023-09-06  110.0
2        3109  2023-09-07  110.0
3        3109  2023-09-08  110.0
4        3109  2023-09-09  110.0
       id                          listing_url       scrape_id last_scraped  \
0    3109    https://www.airbnb.com/rooms/3109  20240906025355   2024-09-11   
1    5396

In [4]:
import pandas as pd
import json
import re
from collections import Counter

# Load the dataset
def get_top_amenities(file_path, n=20):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Check if amenities column exists
    if 'amenities' not in df.columns:
        print(f"No amenities column found in {file_path}")
        print(f"Available columns: {', '.join(df.columns)}")
        return
    
    # Function to parse amenities string to list
    def parse_amenities(amenities_str):
        try:
            # Handle different formats of amenities strings
            if pd.isna(amenities_str) or amenities_str == '':
                return []
            
            # Clean the string if needed (remove escape characters, etc.)
            cleaned_str = amenities_str
            
            # If the string is already wrapped in quotes, remove them
            if cleaned_str.startswith('"') and cleaned_str.endswith('"'):
                cleaned_str = cleaned_str[1:-1]
            
            # Try to parse as JSON
            try:
                return json.loads(cleaned_str)
            except:
                # If direct parsing fails, try to fix the format
                # Replace single quotes with double quotes if necessary
                cleaned_str = re.sub(r"'", '"', cleaned_str)
                return json.loads(cleaned_str)
        except Exception as e:
            print(f"Error parsing amenities: {e}")
            print(f"Problematic string: {amenities_str[:100]}...")
            return []
    
    # Apply the function to parse all amenities
    print("Parsing amenities...")
    amenities_lists = df['amenities'].apply(parse_amenities)
    
    # Flatten the list of lists into a single list of all amenities
    all_amenities = []
    for amenities in amenities_lists:
        all_amenities.extend(amenities)
    
    # Count the frequency of each amenity
    amenity_counts = Counter(all_amenities)
    
    # Get the top N most common amenities
    top_amenities = amenity_counts.most_common(n)
    
    print(f"\nTop {n} most frequent amenities:")
    for i, (amenity, count) in enumerate(top_amenities, 1):
        print(f"{i}. {amenity}: {count} occurrences")
    
    return top_amenities

# Usage:
if __name__ == "__main__":
    # Replace with your path to the listings.csv file
    file_path = listings
    top_amenities = get_top_amenities(file_path)

TypeError: argument of type 'method' is not iterable