# Data Cleaning #


### Reading in the datasets 

In [2]:
import pandas as pd
import geopandas as gpd
import json

# Reading CSV file
def read_csv(file_path):
    """Load a CSV file into a pandas DataFrame.

    Problems are reported on stdout instead of being raised.

    Args:
        file_path: Location of the CSV file; handed to ``pd.read_csv`` as-is.

    Returns:
        The loaded DataFrame, or ``None`` when reading failed.
    """
    try:
        loaded = pd.read_csv(file_path)
        print(f"Data from {file_path} loaded successfully!")
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading {file_path}: {err}")
        loaded = None
    return loaded

# Reading Excel file
def read_excel(file_path):
    """Load an Excel workbook into a pandas DataFrame using the openpyxl engine.

    Problems are reported on stdout instead of being raised.

    Args:
        file_path: Location of the workbook; handed to ``pd.read_excel`` as-is.

    Returns:
        The loaded DataFrame, or ``None`` when reading failed.
    """
    try:
        loaded = pd.read_excel(file_path, engine='openpyxl')
        print(f"Data from {file_path} loaded successfully!")
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading {file_path}: {err}")
        loaded = None
    return loaded

# Reading JSON file
def read_json(file_path):
    """Load a JSON file into a pandas DataFrame.

    Problems are reported on stdout instead of being raised.

    Args:
        file_path: Location of the JSON file; handed to ``pd.read_json`` as-is.

    Returns:
        The loaded DataFrame, or ``None`` when reading failed.
    """
    try:
        loaded = pd.read_json(file_path)
        print(f"Data from {file_path} loaded successfully!")
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading {file_path}: {err}")
        loaded = None
    return loaded

# Reading GeoJSON file
def read_geojson(file_path):
    try:
        # First attempt: Try using geopandas
        try:
            gdf = gpd.read_file(file_path)
            print(f"Data from {file_path} loaded successfully using GeoPandas!")
            return gdf
        except ImportError:
            # If geopandas is not installed, fall back to manual JSON parsing
            with open(file_path, 'r', encoding='utf-8') as f:
                geojson_data = json.load(f)
            
            # Extract features and properties
            features = []
            for feature in geojson_data['features']:
                # Get properties
                properties = feature['properties']
                
                # Get geometry
                geometry = feature['geometry']
                
                # Combine properties and geometry into one dictionary
                feature_dict = {
                    **properties,
                    'geometry_type': geometry['type'],
                    'coordinates': str(geometry['coordinates'])  # Convert to string to avoid nested structure
                }
                features.append(feature_dict)
            
            # Convert to pandas DataFrame
            df = pd.DataFrame(features)
            print(f"Data from {file_path} loaded successfully using manual parsing!")
            return df
            
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading a dataset from any of these formats
def read_dataset(file_path):
    """Load a dataset, choosing the reader from the file extension.

    Supported extensions are ``.csv``, ``.xlsx``, ``.json`` and ``.geojson``.
    Matching ignores case, and ``file_path`` may be a string or a path-like
    object such as ``pathlib.Path``.

    Args:
        file_path: Location of the file; passed unchanged to the chosen reader.

    Returns:
        Whatever the chosen reader returns, or ``None`` (after printing
        "Unsupported file type") when the extension is not recognised.
    """
    # str() lets pathlib.Path objects through; lower() makes ".CSV" match ".csv".
    lowered = str(file_path).lower()

    if lowered.endswith('.csv'):
        return read_csv(file_path)
    if lowered.endswith('.xlsx'):
        return read_excel(file_path)
    if lowered.endswith('.json'):
        return read_json(file_path)
    if lowered.endswith('.geojson'):
        return read_geojson(file_path)

    print("Unsupported file type")
    return None

# Helper function to check if geopandas is installed
def is_geopandas_available():
    """Report whether the ``geopandas`` package can be imported.

    Returns:
        ``True`` if ``import geopandas`` succeeds, ``False`` if it raises
        ImportError.
    """
    try:
        import geopandas  # imported only to probe availability
    except ImportError:
        available = False
    else:
        available = True
    return available

# Helper function to install geopandas if needed
def install_geopandas():
    """Install ``geopandas`` with pip into the interpreter running this code.

    Returns:
        ``True`` when the pip command exits successfully, ``False`` otherwise;
        failures are printed, not raised.
    """
    try:
        import subprocess
        import sys

        # Run pip as a module of the current interpreter. A bare "pip" is
        # whichever executable comes first on PATH, which may belong to a
        # different environment than the one this kernel uses.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "geopandas"])
        print("GeoPandas installed successfully!")
        return True
    except Exception as e:
        print(f"Error installing GeoPandas: {e}")
        return False

In [3]:
# Load every input table. read_dataset returns None for a file it could not
# read, so each name below is either a frame or None.
# NOTE(review): these are absolute paths on one machine — consider a
# configurable data directory.
merged_calendar = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv")
listings = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv")
neighbourhoods = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.csv")
neighbourhoods_geojson = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.geojson")
reviews = read_dataset(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\reviews.csv")


# Preview each table that loaded, in the same order as above.
for _loaded in (merged_calendar, listings, neighbourhoods, neighbourhoods_geojson, reviews):
    if _loaded is not None:
        print(_loaded.head())
del _loaded  # keep the loop variable out of the notebook namespace

Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.geojson loaded successfully using GeoPandas!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\reviews.csv loaded successfully!
   listing_id        date  price
0        3109  2023-09-05  110.0
1        3109  2023-09-06  110.0
2        3109  2023-09-07  110.0
3        3109  2023-09-08  110.0
4        3109  2023-09-09  110.0
       id                          listing_url       scrape_id last_scraped  \
0    3109    https://www.airbnb.com/rooms/3109  20240906025355   2024-09-11   
1    5396

In [4]:
# Preview the merged calendar. The guard skips the print when the name holds
# None, which is what read_dataset returns after a failed load.
if merged_calendar is not None:
    print(merged_calendar.head())

   listing_id        date  price
0        3109  2023-09-05  110.0
1        3109  2023-09-06  110.0
2        3109  2023-09-07  110.0
3        3109  2023-09-08  110.0
4        3109  2023-09-09  110.0


In [5]:
# Preview the listings table. The guard skips the print when the name holds
# None, which is what read_dataset returns after a failed load.
if listings is not None:
    print(listings.head())

       id                          listing_url       scrape_id last_scraped  \
0    3109    https://www.airbnb.com/rooms/3109  20240906025355   2024-09-11   
1    5396    https://www.airbnb.com/rooms/5396  20240906025355   2024-09-13   
2    7397    https://www.airbnb.com/rooms/7397  20240906025355   2024-09-06   
3    7964    https://www.airbnb.com/rooms/7964  20240906025355   2024-09-10   
4  241715  https://www.airbnb.com/rooms/241715  20240906025355   2024-09-11   

            source                                               name  \
0      city scrape                                       zen and calm   
1      city scrape       Your perfect Paris studio on Île Saint-Louis   
2      city scrape                   MARAIS - 2ROOMS APT - 2/4 PEOPLE   
3  previous scrape                       Sunny apartment with balcony   
4      city scrape  Big Cosy Appartement with 100 m2 Terrace in Paris   

                                         description  \
0  Lovely Appartment with one 

In [6]:
# Preview the reviews table. The guard skips the print when the name holds
# None, which is what read_dataset returns after a failed load.
if reviews is not None:
    print(reviews.head())

   listing_id         id        date  reviewer_id reviewer_name  \
0        3109  207127433  2017-10-28     51636494      Patricia   
1        3109  208779822  2017-11-03      4142888      Patricia   
2        3109  295840159  2018-07-24      7415343       Laurent   
3        3109  553502638  2019-10-24     21159216     Anastasia   
4        5396       4824  2009-06-30        19995         Sarah   

                                            comments  
0            Tout s'est bien déroulé. Merci bien. PG  
1  Un petit nid fouiller douillet situé dans  app...  
2  Appartement spacieux, propre,clair, et calme à...  
3  Appartement totalement rénové, en parfait état...  
4  Perfect location!! Nasrine was a delight and m...  


In [7]:
# Preview the first 20 rows of the GeoJSON neighbourhoods. The guard skips the
# print when the name holds None (read_dataset's result after a failed load).
if neighbourhoods_geojson is not None:
    print(neighbourhoods_geojson.head(20))

          neighbourhood neighbourhood_group  \
0   Batignolles-Monceau                None   
1        Palais-Bourbon                None   
2       Buttes-Chaumont                None   
3                 Opéra                None   
4              Entrepôt                None   
5              Gobelins                None   
6             Vaugirard                None   
7               Reuilly                None   
8                Louvre                None   
9            Luxembourg                None   
10               Élysée                None   
11               Temple                None   
12         Ménilmontant                None   
13             Panthéon                None   
14                Passy                None   
15         Observatoire                None   
16           Popincourt                None   
17               Bourse                None   
18    Buttes-Montmartre                None   
19       Hôtel-de-Ville                None   

            

In [8]:
# Preview the CSV neighbourhoods table. The guard skips the print when the
# name holds None, which is what read_dataset returns after a failed load.
if neighbourhoods is not None:
    print(neighbourhoods.head())

   neighbourhood_group        neighbourhood
0                  NaN  Batignolles-Monceau
1                  NaN               Bourse
2                  NaN      Buttes-Chaumont
3                  NaN    Buttes-Montmartre
4                  NaN               Élysée


In [10]:
import pandas as pd

def clean_data(df):
    """Return a copy of ``df`` without the 70 listing columns named below.

    Every listed column must be present; ``DataFrame.drop`` raises KeyError
    otherwise. Columns not listed are kept.

    Note: later cells in this notebook define other functions that are also
    called ``clean_data``; whichever definition ran last is the one in effect.

    Args:
        df: Listings frame containing all of the columns to remove.

    Returns:
        A new DataFrame with the listed columns dropped.
    """
    unwanted = [
        'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
        'neighborhood_overview', 'description', 'picture_url',
        'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
        'host_about', 'host_response_time', 'host_response_rate',
        'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
        'host_picture_url', 'host_neighbourhood', 'host_total_listings_count',
        'host_listings_count', 'host_verifications', 'host_has_profile_pic',
        'host_identity_verified',
        'neighbourhood', 'neighbourhood_group_cleansed',
        'property_type', 'room_type', 'accommodates', 'bathrooms',
        'bathrooms_text', 'amenities', 'beds', 'bedrooms',
        'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
        'maximum_minimum_nights', 'minimum_maximum_nights',
        'maximum_maximum_nights', 'minimum_nights_avg_ntm',
        'maximum_nights_avg_ntm',
        'calendar_updated', 'has_availability', 'availability_30',
        'availability_60', 'availability_90', 'availability_365',
        'calendar_last_scraped',
        'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
        'first_review', 'last_review',
        'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin', 'license',
        'review_scores_communication', 'review_scores_location',
        'review_scores_value',
        'instant_bookable',
        'calculated_host_listings_count',
        'calculated_host_listings_count_entire_homes',
        'calculated_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms',
        'reviews_per_month',
    ]
    return df.drop(columns=unwanted)

# Loaded variable 'df' from URI: c:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv
# NOTE(review): this looks like the same listings.csv that the loading cell
# already read into `listings` (only the drive-letter case differs) — confirm
# whether this second read is needed.
df = pd.read_csv(r'c:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv')

# clean_data receives a copy of `df`; the cell then displays the first rows of
# the reduced frame.
cleaned_listings = clean_data(df.copy())
cleaned_listings.head()

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,price
0,3109,Observatoire,48.83191,2.3187,$113.00
1,5396,Hôtel-de-Ville,48.85247,2.35835,$95.00
2,7397,Hôtel-de-Ville,48.85909,2.35315,$145.00
3,7964,Opéra,48.87417,2.34245,
4,241715,Buttes-Chaumont,48.893464,2.378341,$450.00


In [11]:
def clean_data(cleaned_listings):
    """Return ``cleaned_listings`` with the 'id' column renamed to 'listing_id'.

    Note: this notebook defines several functions called ``clean_data``;
    whichever definition ran last is the one in effect.

    Args:
        cleaned_listings: Listings frame whose key column is named 'id'.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    column_renames = {'id': 'listing_id'}
    return cleaned_listings.rename(columns=column_renames)

# Apply the 'id' -> 'listing_id' rename defined in this cell to a copy of
# cleaned_listings, then display the first rows of the result.
cleaned_listings_clean = clean_data(cleaned_listings.copy())
cleaned_listings_clean.head()

Unnamed: 0,listing_id,neighbourhood_cleansed,latitude,longitude,price
0,3109,Observatoire,48.83191,2.3187,$113.00
1,5396,Hôtel-de-Ville,48.85247,2.35835,$95.00
2,7397,Hôtel-de-Ville,48.85909,2.35315,$145.00
3,7964,Opéra,48.87417,2.34245,
4,241715,Buttes-Chaumont,48.893464,2.378341,$450.00


In [12]:
# Attach the per-listing attributes to every calendar row. Both inputs carry a
# 'price' column, so the suffixes keep them apart as 'price_1' (calendar) and
# 'price_2' (listings).
merged_df = pd.merge(
    left=merged_calendar,
    right=cleaned_listings_clean,
    how='inner',
    on='listing_id',
    suffixes=('_1', '_2'),
)

# Report how the inner join changed the shapes
print("Original shapes:")
print(f"DataFrame 1: {merged_calendar.shape}")
print(f"DataFrame 2: {cleaned_listings_clean.shape}")
print(f"\nMerged shape: {merged_df.shape}")
print(f"Number of unique listings: {merged_df['listing_id'].nunique()}")

# Columns that picked up one of the merge suffixes
duplicate_cols = [col for col in merged_df.columns if col.endswith(('_1', '_2'))]
if duplicate_cols:
    print("\nColumns that were duplicated during merge:")
    print(duplicate_cols)

# Preview the merged result
print("\nFirst few rows of merged dataset:")
print(merged_df.head())

Original shapes:
DataFrame 1: (79418789, 3)
DataFrame 2: (95461, 5)

Merged shape: (64012312, 7)
Number of unique listings: 95461

Columns that were duplicated during merge:
['price_1', 'price_2']

First few rows of merged dataset:
   listing_id        date  price_1 neighbourhood_cleansed  latitude  \
0        3109  2023-09-05    110.0           Observatoire  48.83191   
1        3109  2023-09-06    110.0           Observatoire  48.83191   
2        3109  2023-09-07    110.0           Observatoire  48.83191   
3        3109  2023-09-08    110.0           Observatoire  48.83191   
4        3109  2023-09-09    110.0           Observatoire  48.83191   

   longitude  price_2  
0     2.3187  $113.00  
1     2.3187  $113.00  
2     2.3187  $113.00  
3     2.3187  $113.00  
4     2.3187  $113.00  


In [13]:
# Drop the listings-side price column left over from the merge.
# errors='ignore' keeps this cell re-runnable: merged_df is overwritten in
# place, so on a second run 'price_2' is already gone and a plain drop would
# raise KeyError.
merged_df = merged_df.drop(columns=['price_2'], errors='ignore')


In [14]:
# Show merged_df as the cell output.
merged_df

Unnamed: 0,listing_id,date,price_1,neighbourhood_cleansed,latitude,longitude
0,3109,2023-09-05,110.0,Observatoire,48.831910,2.318700
1,3109,2023-09-06,110.0,Observatoire,48.831910,2.318700
2,3109,2023-09-07,110.0,Observatoire,48.831910,2.318700
3,3109,2023-09-08,110.0,Observatoire,48.831910,2.318700
4,3109,2023-09-09,110.0,Observatoire,48.831910,2.318700
...,...,...,...,...,...,...
64012307,1239136641042729451,2025-09-06,110.0,Panthéon,48.839583,2.346317
64012308,1239136641042729451,2025-09-07,110.0,Panthéon,48.839583,2.346317
64012309,1239136641042729451,2025-09-08,110.0,Panthéon,48.839583,2.346317
64012310,1239136641042729451,2025-09-09,110.0,Panthéon,48.839583,2.346317


In [36]:
def clean_data(merged_df):
    """Return ``merged_df`` with the 'price_1' column renamed to 'price'.

    Note: this notebook defines several functions called ``clean_data``;
    whichever definition ran last is the one in effect.

    Args:
        merged_df: Merged frame whose price column is named 'price_1'.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    column_renames = {'price_1': 'price'}
    return merged_df.rename(columns=column_renames)

# Apply the 'price_1' -> 'price' rename defined in this cell to a copy of
# merged_df, then display the first rows of the result.
merged_df_clean = clean_data(merged_df.copy())
merged_df_clean.head()

Unnamed: 0,listing_id,date,price,neighbourhood_cleansed,latitude,longitude
0,3109,2023-09-05,110.0,Observatoire,48.83191,2.3187
1,3109,2023-09-06,110.0,Observatoire,48.83191,2.3187
2,3109,2023-09-07,110.0,Observatoire,48.83191,2.3187
3,3109,2023-09-08,110.0,Observatoire,48.83191,2.3187
4,3109,2023-09-09,110.0,Observatoire,48.83191,2.3187


In [1]:
# NOTE: merged_df_clean only exists once the cell that assigns it has run in
# the current kernel; on a fresh kernel this lookup raises NameError.
merged_df_clean

NameError: name 'merged_df_clean' is not defined