In [2]:
import pandas as pd
import numpy as np
import re

In [12]:
# Read the "City_Development_Projects" CSV into a DataFrame and print it
# so the raw rows can be examined before any cleaning.
raw_csv_path = 'City_Development_Projects.csv'
df = pd.read_csv(raw_csv_path)
print(df)

   Project_ID District_ID                   Project_Name Project_Start_Date  \
0       P1001         D01          Riverfront Residences          11/1/2023   
1       P1002         D02              Riverside Commons          12/1/2023   
2       P1003         D03                Northgate Lofts          2/15/2024   
3       P1004         D04  Southridge Mall Redevelopment          10-Jan-24   
4       P1005         D05             Westfield EcoHomes           3/1/2024   
5       P1006         D06   Oldtown Heritage Restoration           3/5/2024   
6       P1007         D01           Downtown Transit Hub       April 1 2024   
7       P1008         D02       Riverside Park Extension          5/12/2024   
8       P1009         D03     Northgate Community Center         15/06/2024   
9       P1010         D04      Southridge Green Corridor          7/20/2024   
10      P1011         D05    Westfield Mixed-Use Complex          8/15/2024   
11      P1001         D01          Riverfront Reside

In [13]:
# Print the column dtypes and non-null counts to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Project_ID          15 non-null     object 
 1   District_ID         15 non-null     object 
 2   Project_Name        15 non-null     object 
 3   Project_Start_Date  15 non-null     object 
 4   Project_Status      15 non-null     object 
 5   Estimated_Units     11 non-null     float64
dtypes: float64(1), object(5)
memory usage: 852.0+ bytes


# Data issues
* Duplicates: Project ID P1001 appears in duplicate rows (the records are exactly the same)
* Inconsistent format: Inconsistent date formats in column Project_Start_Date
* Inconsistent text: The Project_Status column mixes capitalization for the same status (e.g. "Ongoing" vs. "ongoing")
* Missing values: The Estimated_Units column has missing values, and null values are represented inconsistently.

In [14]:
# Remove duplicates: drop every row that repeats an earlier row in all
# columns, keeping the first occurrence (the original index labels are kept).
df = df.drop_duplicates(keep='first')
df

Unnamed: 0,Project_ID,District_ID,Project_Name,Project_Start_Date,Project_Status,Estimated_Units
0,P1001,D01,Riverfront Residences,11/1/2023,Ongoing,120.0
1,P1002,D02,Riverside Commons,12/1/2023,completed,45.0
2,P1003,D03,Northgate Lofts,2/15/2024,ongoing,
3,P1004,D04,Southridge Mall Redevelopment,10-Jan-24,Planning,78.0
4,P1005,D05,Westfield EcoHomes,3/1/2024,ongoing,32.0
5,P1006,D06,Oldtown Heritage Restoration,3/5/2024,Ongoing,5.0
6,P1007,D01,Downtown Transit Hub,April 1 2024,planning,150.0
7,P1008,D02,Riverside Park Extension,5/12/2024,completed,0.0
8,P1009,D03,Northgate Community Center,15/06/2024,On Hold,
9,P1010,D04,Southridge Green Corridor,7/20/2024,ongoing,22.0


In [None]:
# Manage inconsistencies

# Standardize text in Project_Status: look values up by a trimmed, lower-cased
# key so that variants such as "ongoing" / "Ongoing" collapse to one label.
status_stripped = df['Project_Status'].str.strip()

status_map = {'ongoing': 'Ongoing',
              'completed': 'Completed',
              'planning': 'Planning',
              'on hold': 'On Hold'}

# Series.map() returns NaN for any key that is not in the dict, which would
# silently erase a status that is not listed above. Fall back to the trimmed
# original text so unexpected labels stay visible for review; values that were
# already missing remain NaN.
df['Project_Status'] = (status_stripped.str.lower()
                        .map(status_map)
                        .fillna(status_stripped))

print(df)

   Project_ID District_ID                   Project_Name Project_Start_Date  \
0       P1001         D01          Riverfront Residences          11/1/2023   
1       P1002         D02              Riverside Commons          12/1/2023   
2       P1003         D03                Northgate Lofts          2/15/2024   
3       P1004         D04  Southridge Mall Redevelopment          10-Jan-24   
4       P1005         D05             Westfield EcoHomes           3/1/2024   
5       P1006         D06   Oldtown Heritage Restoration           3/5/2024   
6       P1007         D01           Downtown Transit Hub       April 1 2024   
7       P1008         D02       Riverside Park Extension          5/12/2024   
8       P1009         D03     Northgate Community Center         15/06/2024   
9       P1010         D04      Southridge Green Corridor          7/20/2024   
10      P1011         D05    Westfield Mixed-Use Complex          8/15/2024   
12      P1012         D06     Oldtown Library Renova

In [16]:
# Manage inconsistencies
# Standardize the date text in the Project_Start_Date column before parsing

# Cast every value to a string, trim surrounding whitespace, then rewrite the
# four-letter month abbreviation "Sept" as "Sep" (the \b keeps longer words
# such as "September" untouched).
start_date_text = df['Project_Start_Date'].astype(str)
start_date_text = start_date_text.str.strip()
df['Project_Start_Date'] = start_date_text.str.replace(r'Sept\b', 'Sep', regex=True)

# Multiple strategies for parsing & error handling

def parse_mixed_dates(date_str):
    """Parse a single date value that may be written in one of several formats.

    The explicit formats are tried in the order listed and the first one that
    matches wins, so a day/month-ambiguous value such as "5/12/2024" is read
    month-first. If none of them match, pandas' own format inference is used
    as a last resort.

    Parameters
    ----------
    date_str : the raw date value, normally a string.

    Returns
    -------
    The result of ``pd.to_datetime`` for the first matching format, or
    ``pd.NaT`` when the input is missing, is the literal ``''`` / ``'nan'``,
    or cannot be parsed even by the fallback.
    """
    is_blank = pd.isna(date_str) or date_str in ('', 'nan')
    if is_blank:
        return pd.NaT

    known_formats = [
        "%m/%d/%Y",   # e.g. 11/1/2023
        "%d/%m/%Y",   # e.g. 15/06/2024
        "%d-%b-%y",   # e.g. 10-Jan-24
        "%b %d %Y",   # e.g. Apr 1 2024
        "%B %d %Y",   # e.g. April 1 2024
        "%d-%b-%Y",   # e.g. 10-Jan-2024
    ]
    for pattern in known_formats:
        try:
            parsed = pd.to_datetime(date_str, format=pattern)
        except ValueError:
            # This format does not fit the value; move on to the next one.
            continue
        return parsed

    # No explicit format matched: let pandas guess, yielding NaT on failure.
    return pd.to_datetime(date_str, errors="coerce")

# Parse every start-date value with parse_mixed_dates, then keep only the
# calendar date (this removes the time component).
parsed_start_dates = df["Project_Start_Date"].apply(parse_mixed_dates)
df["Project_Start_Date"] = parsed_start_dates.dt.date
df

Unnamed: 0,Project_ID,District_ID,Project_Name,Project_Start_Date,Project_Status,Estimated_Units
0,P1001,D01,Riverfront Residences,2023-11-01,Ongoing,120.0
1,P1002,D02,Riverside Commons,2023-12-01,Completed,45.0
2,P1003,D03,Northgate Lofts,2024-02-15,Ongoing,
3,P1004,D04,Southridge Mall Redevelopment,2024-01-10,Planning,78.0
4,P1005,D05,Westfield EcoHomes,2024-03-01,Ongoing,32.0
5,P1006,D06,Oldtown Heritage Restoration,2024-03-05,Ongoing,5.0
6,P1007,D01,Downtown Transit Hub,2024-04-01,Planning,150.0
7,P1008,D02,Riverside Park Extension,2024-05-12,Completed,0.0
8,P1009,D03,Northgate Community Center,2024-06-15,On Hold,
9,P1010,D04,Southridge Green Corridor,2024-07-20,Ongoing,22.0


In [17]:
# Handling missing values in Estimated_Units

# Treat the textual placeholders below as missing values
missing_markers = ['N/A', 'null', '']
estimated_units = df['Estimated_Units'].replace(missing_markers, np.nan)

# Convert values to numbers; anything that is not numeric becomes NaN
estimated_units = pd.to_numeric(estimated_units, errors='coerce')

# Replace empty cells with the average of the column's remaining values
column_mean = estimated_units.mean()
df['Estimated_Units'] = estimated_units.fillna(column_mean)
df


Unnamed: 0,Project_ID,District_ID,Project_Name,Project_Start_Date,Project_Status,Estimated_Units
0,P1001,D01,Riverfront Residences,2023-11-01,Ongoing,120.0
1,P1002,D02,Riverside Commons,2023-12-01,Completed,45.0
2,P1003,D03,Northgate Lofts,2024-02-15,Ongoing,63.2
3,P1004,D04,Southridge Mall Redevelopment,2024-01-10,Planning,78.0
4,P1005,D05,Westfield EcoHomes,2024-03-01,Ongoing,32.0
5,P1006,D06,Oldtown Heritage Restoration,2024-03-05,Ongoing,5.0
6,P1007,D01,Downtown Transit Hub,2024-04-01,Planning,150.0
7,P1008,D02,Riverside Park Extension,2024-05-12,Completed,0.0
8,P1009,D03,Northgate Community Center,2024-06-15,On Hold,63.2
9,P1010,D04,Southridge Green Corridor,2024-07-20,Ongoing,22.0


In [18]:
# Testing: review the dtypes / non-null counts and print the cleaned rows
df.info()
print(df)

# Export the cleaned data to CSV (the row index is not written)
output_csv_path = 'City_Development_Projects_output.csv'
df.to_csv(output_csv_path, index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 0 to 14
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Project_ID          14 non-null     object 
 1   District_ID         14 non-null     object 
 2   Project_Name        14 non-null     object 
 3   Project_Start_Date  14 non-null     object 
 4   Project_Status      14 non-null     object 
 5   Estimated_Units     14 non-null     float64
dtypes: float64(1), object(5)
memory usage: 784.0+ bytes
   Project_ID District_ID                   Project_Name Project_Start_Date  \
0       P1001         D01          Riverfront Residences         2023-11-01   
1       P1002         D02              Riverside Commons         2023-12-01   
2       P1003         D03                Northgate Lofts         2024-02-15   
3       P1004         D04  Southridge Mall Redevelopment         2024-01-10   
4       P1005         D05             Westfield EcoHomes   