In [18]:
import os
import geopandas as gpd
import pandas as pd

# Data path (relative to the notebook's working directory)
file_path = "../data/raw/abidjan_pois.geojson"

# Fail fast when the input is missing. Merely printing a message would leave
# `gdf` undefined, and every later cell would then die with an unrelated
# NameError instead of pointing at the real problem (the path).
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"File not found. Check the path: {file_path}")

print("File found!")
# Load raw data
gdf = gpd.read_file(file_path)
# Inspect first few rows
print(gdf.head())


File found!
            name name:en           amenity man_made  shop tourism  \
0          Powex    None              fuel     None  None    None   
1           None    None            police     None  None    None   
2  ASA Formation    None           college     None  None    None   
3    Lavage Auto    None          car_wash     None  None    None   
4   Orange Money    None  bureau_de_change     None  None    None   

  opening_hours  beds rooms addr:full addr:housenumber addr:street addr:city  \
0          None  None  None      None             None        None      None   
1          None  None  None      None             None        None      None   
2          None  None  None      None             None        None      None   
3          None  None  None      None             None        None      None   
4          None  None  None      None             None        None      None   

  source name:fr       osm_id osm_type                  geometry  
0   None    None  1193538

In [4]:
# Row count of the raw frame, taken before any cleaning step has run.
print("Original rows:", gdf.shape[0])

Original rows: 55875


In [5]:
# List every column of the raw frame so the ones worth keeping can be chosen.
print("Columns:", gdf.columns)

Columns: Index(['name', 'name:en', 'amenity', 'man_made', 'shop', 'tourism',
       'opening_hours', 'beds', 'rooms', 'addr:full', 'addr:housenumber',
       'addr:street', 'addr:city', 'source', 'name:fr', 'osm_id', 'osm_type',
       'geometry'],
      dtype='object')


In [6]:
# Per-column count of missing entries in the raw frame.
print("Missing values per column:\n", gdf.isnull().sum())

Missing values per column:
 name                15605
name:en             55630
amenity             29305
man_made            53244
shop                29613
tourism             54151
opening_hours       54428
beds                55874
rooms               55863
addr:full           55844
addr:housenumber    55720
addr:street         54943
addr:city           55049
source              45858
name:fr             54995
osm_id                  0
osm_type                0
geometry                0
dtype: int64


In [7]:
# Columns carried forward into the cleaned frame; everything else is dropped.
columns_to_keep = [
    'name',
    'amenity',
    'shop',
    'tourism',
    'osm_id',
    'osm_type',
    'geometry',
]
# Select all rows, restricted to the columns listed above.
gdf_clean = gdf.loc[:, columns_to_keep]

In [8]:
# Keep only the rows where both `name` and `amenity` are present.
# NOTE(review): this also removes every POI that has no `amenity` value even
# if it carries a `shop` or `tourism` value -- confirm that is intended.
gdf_clean = gdf_clean[gdf_clean[['name', 'amenity']].notna().all(axis=1)]

In [9]:
# Remove exact duplicates, keeping the first occurrence. Rows count as
# duplicates only when they match on every column of the frame.
gdf_clean = gdf_clean[~gdf_clean.duplicated(keep='first')]

In [10]:
# Renumber the rows after the filtering steps; drop=True discards the old
# index labels instead of keeping them as a new column.
gdf_clean = gdf_clean.reset_index(drop=True)

In [11]:
# Cleaning summary: raw vs. cleaned row counts, then the remaining
# missing-value counts and the column set of the cleaned frame.
print("Number of rows before cleaning:", gdf.shape[0])
print("Number of rows after cleaning:", gdf_clean.shape[0])
print("Missing values per column:\n", gdf_clean.isna().sum())
print("Columns in cleaned data:\n", gdf_clean.columns)

Number of rows before cleaning: 55875
Number of rows after cleaning: 20734
Missing values per column:
 name            0
amenity         0
shop        20618
tourism     20726
osm_id          0
osm_type        0
geometry        0
dtype: int64
Columns in cleaned data:
 Index(['name', 'amenity', 'shop', 'tourism', 'osm_id', 'osm_type', 'geometry'], dtype='object')


In [12]:
# Replace missing `shop` entries with the literal string 'None' (a text
# placeholder, not the Python None object); present values are kept as-is.
gdf_clean['shop'] = gdf_clean['shop'].where(gdf_clean['shop'].notna(), 'None')



In [13]:
# Replace missing `tourism` entries with the literal string 'None' (a text
# placeholder, not the Python None object); present values are kept as-is.
gdf_clean['tourism'] = gdf_clean['tourism'].where(gdf_clean['tourism'].notna(), 'None')

In [14]:
# Count the missing entries left in the two columns that were just filled.
gdf_clean.loc[:, ['shop', 'tourism']].isnull().sum()

shop       0
tourism    0
dtype: int64

In [15]:
# Final check: per-column count of missing entries in the cleaned frame.
print("Missing values per column:\n", gdf_clean.isna().sum())

Missing values per column:
 name        0
amenity     0
shop        0
tourism     0
osm_id      0
osm_type    0
geometry    0
dtype: int64


In [16]:
import random

# Fixed seed so that "Restart Kernel -> Run All" regenerates exactly the same
# dummy values; without it every run produces a different dataset.
DUMMY_DATA_SEED = 42
# Dedicated generator: reproducible without touching the global `random` state.
dummy_rng = random.Random(DUMMY_DATA_SEED)
n_pois = len(gdf_clean)

# Add dummy reviews (number of reviews): synthetic integers in [0, 500]
gdf_clean['reviews'] = [dummy_rng.randint(0, 500) for _ in range(n_pois)]

# Add dummy ratings (1-5 stars, 1 decimal): synthetic placeholder values
gdf_clean['rating'] = [round(dummy_rng.uniform(1, 5), 1) for _ in range(n_pois)]

# Check
print(gdf_clean[['name', 'amenity', 'reviews', 'rating']].head())

            name           amenity  reviews  rating
0          Powex              fuel       73     3.1
1  ASA Formation           college      380     2.0
2    Lavage Auto          car_wash      173     1.6
3   Orange Money  bureau_de_change      142     2.7
4    Lavage Auto          car_wash      342     1.5
