In [8]:
import os
import pandas as pd
from PyPDF2 import PdfReader

# Function to extract data from the text of a PDF
def extract_data_from_text(text, neighborhood):
    data = {
        'Neighborhood': [neighborhood]*5,
        'Year': [1996, 2001, 2006, 2011, 2016],
        'Census Population': [],
        'English (as mother tongue)': [],
        'Unemployment Rate': [],
        'Public Transport': [],
        'Median Household Income': [],
        'Population in Low-Income Households': []
    }
    
    def extract_values(label, count, is_percentage=False, is_currency=False):
        try:
            values = text.split(label)[1].split()
            if is_percentage:
                return [float(val.split('%')[0]) for val in values[:count]]
            if is_currency:
                return [int(val.replace('$', '').replace(',', '')) for val in values[:count]]
            return [int(val.replace(',', '')) for val in values[:count]]
        except (IndexError, ValueError):
            return [None] * count
    
    # Extract data for each feature
    data['Census Population'] = extract_values('Census Population', 5)
    data['English (as mother tongue)'] = extract_values('English', 5, is_percentage=True)
    data['Unemployment Rate'] = extract_values('Unemployment rate', 5, is_percentage=True)
    data['Public Transport'] = extract_values('Public transit', 5, is_percentage=True)
    data['Median Household Income'] = extract_values('Median household income', 5, is_currency=True)
    data['Population in Low-Income Households'] = extract_values('Population in low income households', 5, is_percentage=True)
    
    return pd.DataFrame(data)

# Initialize an empty DataFrame to store all data
all_data = pd.DataFrame()

# Specify the directory containing the PDF files
directory = 'census data'

# Loop through all PDF files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.pdf'):
        file_path = os.path.join(directory, filename)
        neighborhood = filename.split('-')[0]  # Extract neighborhood name from filename
        
        pdf_reader = PdfReader(file_path)
        pdf_text = ''
        for page in pdf_reader.pages:
            pdf_text += page.extract_text()
        
        neighborhood_data = extract_data_from_text(pdf_text, neighborhood)
        all_data = pd.concat([all_data, neighborhood_data], ignore_index=True)

# Save the combined DataFrame to an Excel file
all_data.to_excel('panel_dataset.xlsx', index=False)


In [10]:
import pandas as pd

# Load the existing dataset from the provided Excel file
file_path = 'panel_dataset.xlsx'
df = pd.read_excel(file_path)

# Generate the list of years to add
additional_years = list(range(2008, 2024))

# Initialize an empty DataFrame to store the new data
new_data = pd.DataFrame()

# Process each neighborhood
for neighborhood in df['Neighborhood'].unique():
    # Extract data for the current neighborhood
    neighborhood_data = df[df['Neighborhood'] == neighborhood].copy()
    
    # Add the additional years
    for year in additional_years:
        if year not in neighborhood_data['Year'].values:
            new_row = pd.DataFrame({'Neighborhood': [neighborhood], 'Year': [year]})
            neighborhood_data = pd.concat([neighborhood_data, new_row], ignore_index=True)
    # Ensure all values are floats before interpolation
    for column in neighborhood_data.columns:
        if column not in ['Neighborhood', 'Year']:
            neighborhood_data[column] = neighborhood_data[column].astype(float)
    
    # Sort by year and interpolate the missing values
    neighborhood_data = neighborhood_data.sort_values('Year').reset_index(drop=True)
    neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
    
    # Append the interpolated data to the new DataFrame
    new_data = pd.concat([new_data, neighborhood_data], ignore_index=True)

# Save the updated dataset back to an Excel file
new_file_path = 'updated_panel_dataset.xlsx'
new_data.to_excel(new_file_path, index=False)

  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', limit_direction='both')
  neighborhood_data = neighborhood_data.interpolate(method='linear', l