In [2]:
from modules.datakit import *
# Raise pandas' display limits to 500 columns and 500 rows so wide/long frames
# are not truncated when inspected in the notebook.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

__Read data__

In [4]:
# Load the processed main dataset that the rest of the notebook transforms.
# The literal has no placeholders, so it is a plain string rather than an f-string.
path = 'data_processed/main.csv'
df = pd.read_csv(path)

__Deduplicate records using a dedicated function__

In [6]:
# Replace df with its deduplicated version.
# NOTE(review): the deduplication criteria live in deduplicate_main, which is
# not defined in this notebook (presumably provided by the modules.datakit
# star import) — confirm there which columns define a duplicate.
df = deduplicate_main(df)

__Transform scraped data using a dedicated function__

In [8]:
# Run the shared transformation step on the deduplicated listings.
# NOTE(review): the meaning of each option (only_expired, duration_start/end,
# utilize_morf) is defined inside transform_data, which is not visible in this
# notebook — confirm there before changing any of these values.
df = transform_data(
    main=df,
    only_expired=True,
    duration_start=1,
    duration_end=28,
    utilize_morf=True,
)

__Correct misleading locations__

In [10]:
# Load the manually curated location overrides and keep only the rows flagged
# as having a misleading location. `.copy()` detaches the filtered frame so the
# column assignments below do not trigger SettingWithCopyWarning.
path = 'data_raw/manual_input/misleading_locations.csv'
df_locations = pd.read_csv(path)
df_locations = df_locations[df_locations.misleading_location.eq(True)].copy()

# Extract the first two comma-separated numbers that follow "/@" in each maps
# URL and use them as latitude and longitude. An explicit numeric pattern is
# used instead of a greedy `.*` so that trailing URL segments cannot leak into
# the captured coordinates.
# `astype(str)` lets missing URLs flow through as non-matching text instead of
# raising inside the string accessor.
real_coordinates = df_locations['maps_herf'].astype(str).str.extract(
    r'/@\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)')

# Fail loudly, naming the offending listings, rather than with an opaque
# AttributeError (or by writing NaN coordinates) when a URL cannot be parsed.
unparsed_rows = real_coordinates.isna().any(axis=1)
if unparsed_rows.any():
    raise ValueError(
        'Could not parse coordinates from maps_herf for links: '
        f"{df_locations.loc[unparsed_rows, 'link'].tolist()}")

df_locations['latitude'] = real_coordinates[0].astype(float)
df_locations['longitude'] = real_coordinates[1].astype(float)

In [11]:
# Index the manual corrections by listing link once, then expose one lookup
# Series per corrected attribute.
corrections_by_link = df_locations.set_index('link')
latitude_map = corrections_by_link['latitude']
longitude_map = corrections_by_link['longitude']
district_map = corrections_by_link['real_district']

# Overwrite a column only for listings whose link appears in the corresponding
# lookup; every other row keeps its current value.
listing_links = df['link']
for target_column, lookup in (
        ('latitude', latitude_map),
        ('longitude', longitude_map),
        ('district', district_map)):
    has_correction = listing_links.isin(lookup.index)
    df.loc[has_correction, target_column] = listing_links.map(lookup)

__Add hidden additional fees__

In [14]:
# Load the fee table stored under data_raw/large_language_model.
# The next cell reads its 'link' and 'real_additional_fees' columns.
path = 'data_raw/large_language_model/llm_output.csv'
df_fees = pd.read_csv(path)

In [15]:
# Lookup of replacement fee values keyed by listing link.
fees_map = df_fees.set_index('link')['real_additional_fees']

# Overwrite additional_fees only for listings present in the lookup; all other
# rows keep their current value.
listing_links = df['link']
has_fee_override = listing_links.isin(fees_map.index)
df.loc[has_fee_override, 'additional_fees'] = listing_links.map(fees_map)

__Calculate distance columns using dedicated functions__

In [17]:
# Load the stops table; later cells filter it by its stop_name column and pass
# it to distance_to_nearest_stop.
ztm_stops = pd.read_csv('geographic_data/ztm_stops/ztm_stops.csv')

In [18]:
# Add a per-listing distance to the nearest stop from the full stops table.
# NOTE(review): distance_to_nearest_stop is not defined in this notebook; the
# unit of the returned distance and the coordinate columns it reads should be
# confirmed in its definition.
df['distance_to_stop'] = distance_to_nearest_stop(df, ztm_stops)

In [19]:
# Select the stops treated as subway stations, matched purely on stop_name:
# names starting with 'Metro', names containing one of a few fragments, or
# names exactly equal to one of two values.
stop_names = ztm_stops.stop_name
is_subway = stop_names.str.startswith('Metro')
for name_fragment in ('Wilsona', 'Daszyńskiego', 'Nowy Świat', 'ONZ', 'Wileński'):
    is_subway = is_subway | stop_names.str.contains(name_fragment)
for exact_name in ('Dw. Gdański', 'Centrum'):
    is_subway = is_subway | stop_names.eq(exact_name)

# .copy() makes the selection an independent frame rather than a slice of ztm_stops.
subway_stations = ztm_stops[is_subway].copy()

In [20]:
# Same helper as for distance_to_stop, but restricted to the subway_stations
# subset, giving a per-listing distance to the nearest subway station.
# NOTE(review): unit of the result is defined by distance_to_nearest_stop — confirm.
df['distance_to_subway'] = distance_to_nearest_stop(df, subway_stations)

In [21]:
# Add a per-listing distance to the city center.
# NOTE(review): the center point and the distance unit are defined inside
# distance_to_center, which is not visible in this notebook — confirm there.
df['distance_to_center'] = distance_to_center(df)

__Calculate average rent price per square meter within a radius using historical data__

In [23]:
# Load the historical scraped listings.
# NOTE(review): presumably concat_csv_files reads and concatenates every CSV in
# the given folder — confirm in its definition (not visible in this notebook).
legacy_data = concat_csv_files(folder_path='data_raw/otodom_scraped_data_historical')

In [24]:
# Drop historical listings flagged as having approximate coordinates. Rows
# where the flag is missing are kept, because eq(True) is False for NaN.
# .copy() detaches the filtered result so that the column assignments made on
# legacy_data in the following cell operate on an independent frame instead of
# a slice (which would raise SettingWithCopyWarning).
legacy_data = legacy_data[~legacy_data.approximate_coordinates.eq(True)].copy()

In [25]:
# Parse the raw text of the price/area columns into numbers.
# NOTE(review): get_numbers is a project helper not visible here; it is assumed
# to return a number or a missing value for each input string — confirm.
# pd.to_numeric guarantees a numeric dtype for the arithmetic below.
columns_to_convert = ['rent_price', 'additional_fees', 'area']
for column in columns_to_convert:
    legacy_data[column] = pd.to_numeric(
        legacy_data[column].apply(lambda x: get_numbers(str(x))))

# Total rent = base rent + additional fees, where a missing fee counts as 0.
# Vectorised form of the former row-wise apply: same result, no Python-level
# loop over rows.
# NOTE: this overwrites rent_price, so re-running this cell without reloading
# legacy_data would add the fees a second time.
legacy_data['rent_price'] = (
    legacy_data['rent_price'] + legacy_data['additional_fees'].fillna(0))

legacy_data['price_per_square'] = legacy_data.rent_price / legacy_data.area

In [26]:
# Add a per-listing average price derived from the historical data.
# NOTE(review): presumably this averages legacy_data.price_per_square over
# historical listings within radius_km (0.5) of each listing — confirm in
# average_price_within_radius, which is not visible in this notebook, including
# what it returns when no historical listing falls inside the radius.
df['avg_price'] = average_price_within_radius(df, legacy_data, radius_km=0.5)

__Add apartment classes__

In [28]:
# Load the manually prepared apartment class table; it is merged into df on
# the 'link' column in the next cell.
additional_columns = pd.read_csv('data_raw/manual_input/apartment_class.csv')

In [29]:
# Attach the apartment class columns to each listing by link.
# how='inner' is pandas' default, made explicit here: listings without a row
# in additional_columns are dropped from df.
# validate='many_to_one' raises MergeError if additional_columns repeats a
# link, which would otherwise silently duplicate listings in the output.
df = df.merge(additional_columns, on='link', how='inner', validate='many_to_one')

__Save data__

In [31]:
# Columns excluded from the exported modeling dataset.
columns_to_drop = ['title', 'adv_description', 'link', 'last_update', 'location']

# Build the export frame without mutating df, so this cell can be re-run: the
# former in-place drop raised KeyError on a second run because the columns
# were already gone. A misspelled or missing column name still raises.
modeling_data = df.drop(columns=columns_to_drop)

path = 'data_processed/modeling_data.csv'
modeling_data.to_csv(path,
                     encoding='utf-8',
                     index=False)