# DATA PREPROCESSING

## Wczytanie pakietów i ustawień

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Show every column when a wide DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

### Wczytanie wszystkich plików CSV do jednego DataFrame

In [2]:
# Folder holding the raw CSV extracts (relative to the notebook location).
folder_path = Path('../data/raw')

# Sort the paths: glob() yields files in arbitrary, filesystem-dependent order,
# and the row order of df_all should be identical on every run.
all_files = sorted(folder_path.glob('*.csv'))

# Fail early with a readable message; pd.concat() on an empty list only
# reports "No objects to concatenate".
if not all_files:
    raise FileNotFoundError(f'No CSV files found in {folder_path.resolve()}')

# Read every file with the same parser settings and stack the results into a
# single frame with a fresh 0..n-1 index.
# NOTE(review): decimal=',' is combined with delimiter=',' -- confirm this is
# intended; the value columns are converted with pd.to_numeric further down.
df_all = pd.concat(
    [
        pd.read_csv(f, delimiter=',', decimal=',', low_memory=False)
        for f in all_files
    ],
    ignore_index=True
)

### Wstępna selekcja danych
Usuwam obserwacje z brakami w kolumnie OBS_VALUE oraz obserwacje, które wprawdzie mają dane, ale są oflagowane jako 'b' ("break in time series") - są one często nieprawidłowe lub ze złą jednostką.

In [3]:
# Discard observations flagged 'b' -- only when the flag column exists at all.
if 'OBS_FLAG' in df_all.columns:
    is_break = df_all['OBS_FLAG'] == 'b'
    df_all = df_all[~is_break]

# Discard observations that carry no value.
has_value = df_all['OBS_VALUE'].notna()
df_all = df_all[has_value]

### Wybór najnowszego roku dla każdego miasta i wskaźnika
Pomiary/badania były przeprowadzane w różnych miastach w różnym czasie. Do celów zadania wybieram zawsze najświeższe dostępne dane.

In [4]:
# Order the rows so that, within each (city, indicator) pair, the most recent
# TIME_PERIOD comes first.
df_all = df_all.sort_values(
    ['Geopolitical entity (declaring)', 'Urban audit indicator', 'TIME_PERIOD'],
    ascending=[True, True, False]
)

# Keep only that first (= latest) observation per city and indicator.
df_latest = df_all.drop_duplicates(
    subset=['Geopolitical entity (declaring)', 'Urban audit indicator'],
    keep='first'
)

# Exactly one metadata row per city name. De-duplicating on the merge key
# alone (not on the name/code pair) guarantees that the merge below cannot
# multiply rows when one city name occurs with more than one 'cities' code.
city_meta = (
    df_latest[['Geopolitical entity (declaring)', 'cities']]
    .drop_duplicates(subset='Geopolitical entity (declaring)', keep='first')
)

# Long -> wide: one row per city, one column per indicator.
df = df_latest.pivot(
    index='Geopolitical entity (declaring)',
    columns='Urban audit indicator',
    values='OBS_VALUE'
).reset_index()

# Attach the 'cities' code; validate= makes pandas raise if either side
# unexpectedly holds a duplicated city name.
df = df.merge(
    city_meta,
    on='Geopolitical entity (declaring)',
    how='left',
    validate='one_to_one'
)

### Zmiana nazw kolumn

In [5]:
# Short, code-friendly column names, grouped by theme. Keys are the raw column
# labels and must match them character for character.
_identifier_names = {
    'Geopolitical entity (declaring)': 'City',
    'cities': 'Country_code',
}

_demography_names = {
    'Population on the 1st of January, total': 'Population',
    'Age dependency ratio (population aged 0-19 and 65 and more to population aged 20-64)': 'Age_dependency_ratio',
    'Young-age dependency ratio (population aged 0-19 to population 20-64 years)': 'Young_dependency_ratio',
    'Old age dependency ratio (population 65 and over to population 20 to 64 years)': 'Old_dependency_ratio',
    'Median population age': 'Median_age',
    'Foreigners as a proportion of population': 'Share_foreigners',
}

_vital_statistics_names = {
    'Infant mortality rate (per 1000 live births)': 'Infant_mortality_rate',
    'Crude birth rate (per 1000 inhabitants)': 'Crude_birth_rate',
    'Number of deaths per year under 65 due to diseases of the circulatory or respiratory systems': 'Deaths_under_65_circulatory_respiratory',
    'Crude death rate (per 1000 inhabitants)': 'Crude_death_rate',
}

_living_conditions_names = {
    'Share of persons at risk of poverty or social exclusion -%': 'Share_poverty_risk',
    'Average area of living accommodation - m²/person': 'Avg_living_area_m2_per_person',
    'Average annual rent for housing per m² - EUR': 'Avg_annual_rent_per_m2',
    'Average price for buying an apartment  per m2 - EUR': 'Avg_apartment_price_per_m2',
}

_education_names = {
    'Share of students in higher education in the total population (per 1000 persons)': 'Share_students_higher_edu',
    'Share of early leavers from education and training, total -%': 'Share_early_leavers',
}

_tourism_and_labour_names = {
    'Number of available beds per 1000 residents': 'Beds_per_1000',
    'Total nights spent in tourist accommodation establishments per resident population': 'Tourist_nights_per_resident',
    'Unemployment rate': 'Unemployment_rate',
}

_transport_names = {
    'Share of journeys to work by public transport (rail, metro, bus, tram) -%': 'Share_public_transport',
    'Share of journeys to work by car or motor cycle -%': 'Share_car_motorcycle',
    'Average time of journey to work - minutes': 'Avg_journey_minutes',
    'Number of registered cars per 1000 population': 'Cars_per_1000',
    'People killed in road accidents per 10000 pop.': 'Road_deaths_per_10000',
}

_environment_names = {
    'Total number of hours of sunshine per day': 'Sunshine_hours_per_day',
    'Average temperature of warmest month - degrees': 'Temp_warmest_month',
    'Average temperature of coldest month - degrees': 'Temp_coldest_month',
    'Rainfall - litre/m²': 'Rainfall_l_per_m2',
    'Annual average concentration of NO2 (µg/m³)': 'NO2_avg_conc',
    'Municipal waste generated (domestic and commercial), total - 1000 t': 'Municipal_waste_1000t',
}

# Merge the groups into the single mapping; the unpacking order keeps the
# entries in the same sequence as one flat literal would.
rename_map = {
    **_identifier_names,
    **_demography_names,
    **_vital_statistics_names,
    **_living_conditions_names,
    **_education_names,
    **_tourism_and_labour_names,
    **_transport_names,
    **_environment_names,
}

# Labels without an entry in the mapping are left untouched by rename().
df = df.rename(columns=rename_map)

### Konwersja typów kolumn

In [6]:
# Lista wszystkich kolumn oprócz nazwy miasta
# Every column except the two identifier columns (city name, country code)
# is expected to hold indicator values.
numeric_cols = df.columns.drop(['City', 'Country_code'])

# Convert column by column; cells that cannot be parsed as numbers become NaN.
df[numeric_cols] = df[numeric_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Summary of the resulting dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926 entries, 0 to 925
Data columns (total 32 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   City                                     926 non-null    object 
 1   Age_dependency_ratio                     835 non-null    float64
 2   NO2_avg_conc                             1 non-null      float64
 3   Avg_annual_rent_per_m2                   371 non-null    float64
 4   Avg_living_area_m2_per_person            606 non-null    float64
 5   Avg_apartment_price_per_m2               261 non-null    float64
 6   Temp_coldest_month                       8 non-null      float64
 7   Temp_warmest_month                       8 non-null      float64
 8   Avg_journey_minutes                      149 non-null    float64
 9   Crude_birth_rate                         851 non-null    float64
 10  Crude_death_rate                         876 non-n

In [7]:
# Summary statistics, transposed so that each indicator is one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age_dependency_ratio,835.0,66.218443,8.854913,41.2,60.9,65.9,72.0,130.9
NO2_avg_conc,1.0,35.0,,35.0,35.0,35.0,35.0,35.0
Avg_annual_rent_per_m2,371.0,54.276011,53.804471,4.91,9.075,39.29,84.0,256.8
Avg_living_area_m2_per_person,606.0,37.385429,10.413019,14.1,29.0,39.13,46.0,56.5
Avg_apartment_price_per_m2,261.0,2275.468008,1122.187785,333.03,1559.14,2022.0,2647.0,9074.0
Temp_coldest_month,8.0,7.45,2.712932,4.2,5.725,6.85,9.4,12.2
Temp_warmest_month,8.0,28.1625,0.921082,27.0,27.425,28.05,29.05,29.3
Avg_journey_minutes,149.0,24.397651,4.094527,17.5,21.9,23.7,26.0,39.9
Crude_birth_rate,851.0,9.033173,2.778824,0.01,7.23,8.98,10.565,42.29
Crude_death_rate,876.0,10.448767,2.948422,0.38,8.54,10.415,12.27,19.61


### Poprawianie zmiennych
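- **Śmiertelność niemowląt:** wartości odstające (powyżej 1000) sugerują błąd w jednostkach – dane prawdopodobnie wprowadzono w przeliczeniu na 1 mln zamiast na 1 tys. urodzeń żywych, dlatego takie wartości dzielę przez 1000.
- **Zgony z powodu chorób układu krążenia lub oddechowego (osoby poniżej 65 lat):** liczbę zgonów zestawiam z całkowitą populacją, aby uzyskać wskaźnik relatywny na 100 tys. mieszkańców.
- **Odpady komunalne:** masę odpadów (podaną w tys. ton) przeliczam analogicznie na kilogramy na jednego mieszkańca.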

In [9]:
# Infant mortality is a rate per 1000 live births, so a value above 1000 cannot
# be valid; such values are divided by 1000. Vectorised mask() replaces the
# row-wise apply(); NaN compares as False and is therefore left unchanged.
if 'Infant_mortality_rate' in df.columns:
    infant_rate = df['Infant_mortality_rate']
    df['Infant_mortality_rate'] = infant_rate.mask(
        infant_rate > 1000, infant_rate / 1000
    )

# Absolute death count -> deaths per 100 000 inhabitants.
if 'Deaths_under_65_circulatory_respiratory' in df.columns and 'Population' in df.columns:
    df['Deaths_under_65_circ_resp_per_100k'] = (
        df['Deaths_under_65_circulatory_respiratory'] / df['Population'] * 100_000
    )

# Waste is reported in units of 1000 t; multiplying by 1 000 000 converts it
# to kilograms, so the result is kilograms per inhabitant.
if 'Municipal_waste_1000t' in df.columns and 'Population' in df.columns:
    df['Waste_prod_rate'] = (
        df['Municipal_waste_1000t'] * 1_000_000 / df['Population']
    )

# The absolute figures are no longer needed. errors='ignore' keeps this step
# consistent with the guards above: a column that was never present is
# skipped instead of raising KeyError.
df = df.drop(
    columns=['Deaths_under_65_circulatory_respiratory', 'Municipal_waste_1000t'],
    errors='ignore'
)

In [10]:
# Keep only the first two characters of the code (values such as 'NL', 'DE').
# Missing codes stay missing: a plain astype(str) would turn NaN into the
# string 'nan', which the slice then shortens to a bogus code 'na'.
raw_code = df['Country_code']
df['Country_code'] = raw_code.where(
    raw_code.isna(),
    raw_code.astype(str).str[:2]
)

### Zapisanie danych

In [11]:
# Write the preprocessed table without the index column. The target folder is
# created first because to_csv() does not create missing directories.
output_path = Path('../data/preprocessed/preprocessed_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

In [12]:
# Preview the first 20 rows of the final table.
df.head(20)

Unnamed: 0,City,Age_dependency_ratio,NO2_avg_conc,Avg_annual_rent_per_m2,Avg_living_area_m2_per_person,Avg_apartment_price_per_m2,Temp_coldest_month,Temp_warmest_month,Avg_journey_minutes,Crude_birth_rate,Crude_death_rate,Share_foreigners,Infant_mortality_rate,Median_age,Beds_per_1000,Cars_per_1000,Old_dependency_ratio,Road_deaths_per_10000,Population,Rainfall_l_per_m2,Share_early_leavers,Share_car_motorcycle,Share_public_transport,Share_poverty_risk,Share_students_higher_edu,Tourist_nights_per_resident,Sunshine_hours_per_day,Unemployment_rate,Young_dependency_ratio,Country_code,Deaths_under_65_circ_resp_per_100k,Waste_prod_rate
0,'s-Gravenhage (greater city),61.1,,,49.3,,,,,9.64,8.38,19.4,3.06,37.0,,367.25,26.0,0.17,813669.0,,,,,,64.8,23.06,,5.27,35.2,NL,21.138817,3915.597129
1,'s-Hertogenbosch,63.8,,,,,,,,9.88,9.45,6.5,1.89,40.0,,546.45,29.8,0.13,160757.0,,,,,,37.1,,,2.53,34.0,NL,28.614617,4049.590376
2,A Coruña (greater city),67.9,,85.28,37.12,,,,,5.57,10.58,7.6,4.35,48.3,16.21,427.69,40.9,0.12,247350.0,,,,,,98.3,1.76,,10.45,27.0,ES,27.895694,
3,Aachen,54.1,,9.2,46.5,2350.0,,,23.2,8.18,10.2,22.3,2.91,37.0,19.76,381.9,28.1,0.12,252136.0,,3.3,55.4,21.9,,229.6,1.93,,5.63,25.5,DE,39.661135,393.835073
4,Aalborg,63.5,,90.85,48.7,,,,,11.86,10.62,,2.61,39.7,,,27.6,,,,,,,,,,,,36.0,DK,,
5,Aberdeen,53.0,,,,,,,,10.24,9.51,16.5,1.25,37.1,,394.73,23.7,0.09,228180.0,,,,,,117.9,,,7.16,29.2,UK,53.466561,261.986151
6,Acireale (greater city),63.9,,,38.03,,,,,7.37,11.12,3.1,5.36,46.0,53.17,822.89,33.5,0.2,50608.0,,,,,,0.1,3.43,,18.32,30.4,IT,27.66361,484.903573
7,Adana,,,,,,,,,,3.69,,,,,,,,,,,,,,,,,,,TR,,
8,Aix-en-Provence,67.4,,,43.0,4962.0,,,,9.5,8.44,9.0,2.3,37.0,57.39,540.42,31.9,0.27,147933.0,,,70.0,15.4,,267.0,9.2,,11.41,35.5,FR,18.251506,601.623708
9,Alba Iulia,48.8,,,17.42,,,,,8.26,7.63,,3.24,,16.42,,21.4,,74659.0,,,,,,73.4,1.62,,,27.3,RO,69.650009,
