In [30]:
from typing import Optional

import pandas as pd
# Show up to 500 columns when a DataFrame is rendered.
pd.options.display.max_columns = 500

In [32]:
# Warsaw district names accepted by find_warsaw_district. Built once as a
# frozenset so membership checks are O(1) and the collection is not
# re-created on every call.
_WARSAW_DISTRICTS = frozenset({
    "Bemowo", "Białołęka", "Bielany", "Mokotów", "Ochota",
    "Praga-Południe", "Praga-Północ", "Rembertów", "Śródmieście",
    "Targówek", "Ursus", "Ursynów", "Wawer", "Wesoła",
    "Wilanów", "Włochy", "Wola", "Żoliborz",
})


def find_warsaw_district(input_string) -> Optional[str]:
    """
    Return the first Warsaw district name found in a comma-separated string.

    The input is split on commas, each part is stripped of surrounding
    whitespace and compared against the known district names. The comparison
    is exact and case-sensitive.

    Parameters:
    - input_string (str): Text to analyse, e.g. a location field holding
      several comma-separated values.

    Returns:
    - str or None: The first part that is a Warsaw district name, or None
      when no part matches or when the input is not a string (None, NaN, ...).
    """
    # Non-string input cannot be split; treat it as "no district found".
    if not isinstance(input_string, str):
        return None

    for part in input_string.split(','):
        candidate = part.strip()
        if candidate in _WARSAW_DISTRICTS:
            return candidate

    return None

In [34]:
def extract_ad_dates(row):
    """
    Extract the "last update" and "added" dates from a row's announcement text.

    The value of row['announcement_date'] is split on the literal
    two-character sequence backslash + 'n' (not a real newline character).
    The first segment containing 'Aktualizacja:' yields the update date and
    the first segment containing 'Dodano:' yields the added date; the
    respective label prefix is removed and the remainder is stripped.
    NOTE(review): the "('Aktualizacja: " prefix suggests the field holds the
    textual repr of a tuple -- confirm against the scraper output.

    Parameters:
    - row: Mapping-like object (e.g. a DataFrame row) with an
      'announcement_date' entry.

    Returns:
    - pd.Series: Two elements, [last_update, ad_added]. An element is None
      when its label is not present; both are None when the field is not a
      string (e.g. a missing value).
    """
    ad_info = row['announcement_date']

    # Missing values (None / NaN) are not strings and cannot be split.
    if not isinstance(ad_info, str):
        return pd.Series([None, None])

    # Split once and reuse the segments for both lookups.
    segments = ad_info.split('\\n')

    update_line = next((seg for seg in segments if 'Aktualizacja:' in seg), None)
    added_line = next((seg for seg in segments if 'Dodano:' in seg), None)

    last_update = (
        update_line.replace("('Aktualizacja: ", '').strip()
        if update_line is not None else None
    )
    ad_added = (
        added_line.replace('Dodano: ', '').strip()
        if added_line is not None else None
    )

    return pd.Series([last_update, ad_added])

In [36]:
def process_data(data_draw):
    """
    Turn a raw scraped listings frame into the layout used for the main table.

    Steps:
    1. Work on a copy so the caller's frame is not modified.
    2. Derive 'district' from 'location' and keep only rows where a Warsaw
       district was recognised.
    3. Split 'announcement_date' into 'last_update' and 'added_dt', then drop
       the original column.
    4. Add 'expired' (0) and 'expired_date' (None) columns.
    5. Move 'added_dt', 'last_update', 'link', 'expired', 'expired_date' to
       the front, keeping the remaining columns in their existing order.

    Parameters:
    - data_draw (pd.DataFrame): Raw data; the code reads the 'location',
      'announcement_date' and 'link' columns.

    Returns:
    - pd.DataFrame: The filtered, re-ordered frame.
    """
    df = data_draw.copy()
    df['district'] = df['location'].apply(lambda x: find_warsaw_district(str(x)))

    # Take an explicit copy of the filtered rows: assigning new columns to a
    # boolean-indexed slice triggers pandas' SettingWithCopyWarning.
    df = df[df['district'].notna()].copy()

    if df.empty:
        # apply(axis=1) on an empty frame does not produce the two expected
        # columns, so create them directly.
        df['last_update'] = None
        df['added_dt'] = None
    else:
        df[['last_update', 'added_dt']] = df.apply(extract_ad_dates, axis=1)

    df = df.drop(columns=['announcement_date'])
    df['expired'] = 0
    df['expired_date'] = None

    leading = ['added_dt', 'last_update', 'link', 'expired', 'expired_date']
    trailing = [col for col in df.columns if col not in leading]

    return df[leading + trailing]

In [38]:
# Forward slashes work on every OS (including Windows) and avoid the invalid
# "\o" escape sequences the backslash-separated literal contained.
df = pd.read_csv('data_raw/otodom_scraped_data/otodom_2025_01_05.csv')

In [40]:
# NOTE: this rebinds `df` to the processed frame. process_data drops the
# 'announcement_date' column that extract_ad_dates reads, so running this cell
# a second time without re-loading the raw CSV raises a KeyError.
df = process_data(df)

In [4]:
import pandas as pd

In [6]:
# Location of the accumulated main table; reused when saving it back.
path = 'data_processed/main.csv'
main = pd.read_csv(path)

In [28]:
# Distinct string values of `expired_date`, sorted ascending; non-string
# entries are skipped.
sorted(value for value in main['expired_date'].unique() if isinstance(value, str))

['2024_12_29',
 '2025_01_02',
 '2025_01_03',
 '2025_01_04',
 '2025_01_06',
 '2025_01_07',
 '2025_01_08',
 '2025_01_09']

In [24]:
# Relabel expiry date '2025_01_10' as '2025_01_09'.
# NOTE(review): presumably a one-off data correction -- the reason is not recorded here.
main['expired_date'] = main['expired_date'].replace({'2025_01_10': '2025_01_09'})

In [44]:
# Keep only the listings whose link is not already present in `main`.
df = df.loc[~df['link'].isin(main['link'])]

In [26]:
# Append the new listings to the main table with a fresh 0..n-1 index.
# Requires `df` from the load / process / de-duplicate cells above; running
# this cell in a kernel where they have not been executed raises the NameError
# recorded in the output below.
main = pd.concat([main, df], ignore_index=True)

NameError: name 'df' is not defined

In [30]:
# Write the updated main table back to the file it was loaded from,
# without the index column.
main.to_csv(path, index=False, encoding='utf-8')

### ====