In [1]:
import os.path
import requests
from bs4 import BeautifulSoup
import re
import json
import html
import pandas as pd
import datetime
import time
import random
import warnings
import ast

from tqdm import tqdm

# Notebook-wide pandas settings.
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Turn off the chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Snapshot to process; the value is embedded in the raw file name (looks like YYYYMMDD).
date_to_run = '20230605'
raw_csv_path = os.path.join('..', 'Data', 'xrysi_eykairia', 'raw', f'xe_{date_to_run}.csv')
# Keep only the first row encountered for each listing id.
df = pd.read_csv(raw_csv_path).drop_duplicates(subset=['id'])

In [3]:
# Column dtypes and non-null counts of the de-duplicated raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5811 entries, 0 to 5816
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5811 non-null   int64  
 1   internal_id              5811 non-null   int64  
 2   item_type                5811 non-null   object 
 3   transaction_type         5811 non-null   object 
 4   address                  5811 non-null   object 
 5   geo_lat                  5811 non-null   float64
 6   geo_lng                  5811 non-null   float64
 7   ad_group_id              752 non-null    float64
 8   levels                   5811 non-null   object 
 9   bedrooms                 5527 non-null   float64
 10  bathrooms                5253 non-null   float64
 11  construction_year        4721 non-null   float64
 12  display_golden_border    5811 non-null   bool   
 13  owner_logo               4446 non-null   object 
 14  price_per_unit_area     

In [4]:
# Summary (count / unique / top / freq) restricted to the object and bool columns.
df.describe(include = ['object', 'bool'])

Unnamed: 0,item_type,transaction_type,address,levels,display_golden_border,owner_logo,company_title,is_commercial,sponsored,title,title_abbreviation,price,price_with_abbreviation,price_per_square_meter,size_with_square_meter,date,extra_seo_info,image_alt,image_gallery,url,unique_group_url,extraction_date
count,5811,5811,5811,5811,5811,4446,4447,5811,5811,5811,5811,5811,5811,5809,5811,2610,5811,5811,5811,5811,752,5811
unique,1,1,234,61,2,574,572,2,1,419,419,293,252,70,285,39,1,4855,5146,5811,752,1
top,re_residence,LET.NORMAL,Αθήνα (Κολωνάκι),['1ος'],False,https://a2.xe.gr/LOGO/1622120851959.jpg,Golden Home,True,False,Διαμέρισμα 50 τ.μ.,Διαμέρισμα 50 τ.μ.,450 €,450 €,8 €/τ.μ.,50 τ.μ.,πριν από 2 εβδομάδες,Ενοικίαση κατοικίας,Ενοικίαση κατοικίας Αθήνα (Κολωνάκι) Διαμέρισμ...,[],https://www.xe.gr/property/d/enoikiaseis-katoi...,https://www.xe.gr/property/u/460563,2023-06-05
freq,5811,5811,240,1237,5355,313,313,4518,5811,286,286,323,327,812,288,569,5811,7,666,1,1,5811


In [5]:
# Summary statistics with pandas' default column selection (the numeric columns here).
df.describe()

Unnamed: 0,id,internal_id,geo_lat,geo_lng,ad_group_id,bedrooms,bathrooms,construction_year,price_per_unit_area,account_id
count,5811.0,5811.0,5811.0,5811.0,752.0,5527.0,5253.0,4721.0,5811.0,5718.0
mean,795405900.0,46423490.0,37.986582,23.739821,428654.784574,1.86376,1.194555,1976.27007,11.668904,1208909.0
std,70181950.0,2097770.0,0.019584,0.020732,52509.42131,1.229862,0.736023,16.018148,50.271482,722332.9
min,576566.0,20092860.0,37.94883,23.687081,55188.0,1.0,1.0,1900.0,0.0,20034.0
25%,804523900.0,46570500.0,37.971644,23.726997,429311.25,1.0,1.0,1970.0,8.0,540452.0
50%,812525900.0,46999290.0,37.986432,23.739544,446462.5,2.0,1.0,1975.0,9.0,1306228.0
75%,814391000.0,47113450.0,38.000622,23.753879,455809.5,2.0,1.0,1980.0,12.0,1881380.0
max,815466300.0,47163980.0,38.032707,23.789568,461166.0,21.0,16.0,2023.0,3205.0,2268898.0


In [6]:
# Columns of the raw frame that are not carried into the cleaned dataset.
useless_columns = [
    'item_type',
    'transaction_type',
    'owner_logo',
    'title',
    'price_with_abbreviation',
    'extra_seo_info',
    'image_alt',
    'image_gallery',
    'unique_group_url',
    'price_per_square_meter',
    'display_golden_border',
    'is_commercial',
    'sponsored',
]

In [7]:
# Columns kept for the cleaned dataset; the list order is the column order used downstream.
useful_columns = [
    # identifiers and listing text
    'id', 'internal_id', 'title_abbreviation', 'address',
    # price and size
    'price', 'price_per_unit_area', 'size_with_square_meter',
    # property attributes
    'construction_year', 'levels', 'bedrooms', 'bathrooms',
    # dates
    'date', 'extraction_date',
    # coordinates
    'geo_lat', 'geo_lng',
    # advertiser / grouping / link
    'company_title', 'account_id', 'ad_group_id', 'url',
]

In [8]:
# Sanity check: the two lists together should account for all 32 columns reported by df.info().
len(useless_columns)+len(useful_columns)

32

In [9]:
# Independent working copy restricted to the columns of interest; later cells add columns to it.
df_useful = df[useful_columns].copy()

In [10]:
# An address looks like "<dimos> (<perioxi>)"; the parenthesised part is optional.
# Capture group 0 is the text before the parenthesis, group 1 the text inside it.
address_parts = df_useful['address'].str.extract(r'([\w\s]+)(?:\(([\w\s]+)\))?')
df_useful['dimos'] = address_parts[0].str.strip()  # drop the blank left before "("
df_useful['perioxi'] = address_parts[1]

In [11]:
# Lookup table that maps each raw dimos spelling ('Raw') to its canonical name ('Fixed').
dimoi_df = pd.read_csv(os.path.join('..', 'Data', 'other', 'mappings', 'dimoi.csv'))
dimoi_dict = dict(zip(dimoi_df['Raw'], dimoi_df['Fixed']))

# Report spellings that the lookup table does not know yet.
missing_dimoi = set(df_useful['dimos'].unique()).difference(dimoi_dict)
if len(missing_dimoi) > 0:
    warnings.warn(f'The following dimoi are new: {missing_dimoi}')

# Values absent from the table become NaN after the mapping.
df_useful['dimos'] = df_useful['dimos'].map(dimoi_dict)

In [12]:
def _first_number(text_series):
    """Return the first run of digits of each string as a float.

    '.' and ',' are removed first, so a value written with thousands
    separators (e.g. "1.200 €") is read as one number.
    """
    digits_only = text_series.str.replace('[.,]', '', regex=True)
    return digits_only.str.extract(r'(\d+)', expand=False).astype('float')


# Property type is the first word of the abbreviated title.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
df_useful['type'] = df_useful['title_abbreviation'].str.extract(r'(\w+)', expand=False)

df_useful['price'] = _first_number(df_useful['price'])
# Read from df_useful (not df) so every derived column comes from the same frame.
df_useful['area'] = _first_number(df_useful['size_with_square_meter'])
df_useful['price_per_area'] = df_useful['price'] / df_useful['area']

# Listings without a relative-age label get the placeholder "πριν από 2 μήνες" (2 months ago).
df_useful['date'] = df_useful['date'].fillna('πριν από 2 μήνες')

In [13]:
# Floor labels and the numeric text that replaces each one. They are applied in this order;
# the bare suffix 'ος' comes last so that 'Ημιώροφος' is replaced as a whole word first.
level_replacements = [
    ('Υπόγειο', '-1'),
    ('Ισόγειο', '0'),
    ('Υπερυψωμένο', '0.5'),
    ('Ημιυπόγειο', '-0.5'),
    ('Ημιώροφος', '0.5'),
    ('ος', ''),
]
normalized_levels = df_useful['levels']
for level_label, numeric_text in level_replacements:
    normalized_levels = normalized_levels.str.replace(level_label, numeric_text)
df_useful['levels'] = normalized_levels

# Each value is now the text of a Python list of numeric strings; parse it into a list of floats.
df_useful['level_list'] = (
    df_useful['levels']
    .apply(ast.literal_eval)
    .apply(lambda parsed: [float(item) for item in parsed])
)

def get_smallest_level(level_list):
    """Pick the lowest level of a listing, preferring levels at or above ground.

    Returns None for an empty list. Otherwise returns the smallest value that
    is >= 0 when at least one exists, and the overall minimum when it does not.
    """
    if not level_list:
        return None

    lowest = min(level_list)
    if not lowest < 0:
        # Nothing below ground, so the overall minimum is the answer.
        return lowest

    at_or_above_ground = [level for level in level_list if level >= 0]
    return min(at_or_above_ground) if at_or_above_ground else lowest
# Per-listing level features derived from the parsed level list.
level_lists = df_useful['level_list']
df_useful['min_level'] = level_lists.apply(get_smallest_level)
df_useful['level_length'] = level_lists.apply(len)
# Top level computed as: lowest level + number of listed levels - 1.
# NOTE(review): this treats the listed levels as consecutive whole floors counted up from
# min_level — confirm that is intended for listings that include below-ground or half levels.
df_useful['max_level'] = df_useful['min_level'] + (df_useful['level_length'] - 1)


In [14]:
# NOTE(review): this is the same expression that is assigned to 'max_level', so column 'a'
# duplicates it (both show the same null count in the isnull() summary). It looks like scratch
# work, but the column is still present when df_clean is written to CSV — confirm nothing
# reads it before removing this cell.
df_useful['a'] = df_useful['min_level'] + df_useful['level_length'] - 1

In [15]:
# Number of missing values in each column of the working frame.
df_useful.isnull().sum()

id                           0
internal_id                  0
title_abbreviation           0
address                      0
price                        0
price_per_unit_area          0
size_with_square_meter       0
construction_year         1090
levels                       0
bedrooms                   284
bathrooms                  558
date                         0
extraction_date              0
geo_lat                      0
geo_lng                      0
company_title             1364
account_id                  93
ad_group_id               5059
url                          0
dimos                        0
perioxi                    304
type                         0
area                         0
price_per_area               0
level_list                   0
min_level                    2
level_length                 0
max_level                    2
a                            2
dtype: int64

In [16]:
# Pull the relative-age text apart: the number, and the first two letters of the unit after it.
relative_age = df_useful['date'].str.extract(r'(\d+)\s+(\w{2})')
df_useful['temp_no'] = relative_age[0]
df_useful['temp_date_unit'] = relative_age[1]

In [17]:
# Convert the relative age ("<number> <unit>") into an approximate number of days.
# Keys are the two-letter unit prefixes captured in 'temp_date_unit'; the multipliers are the
# days counted per unit (1, 7 and 30). Rows with any other prefix, or no match, stay at 0.
days_per_unit = {"ημ": 1, "εβ": 7, "μή": 30}

df_useful['days_passed'] = 0
for unit_prefix, unit_days in days_per_unit.items():
    in_unit = df_useful['temp_date_unit'] == unit_prefix
    # One .loc assignment instead of df[col][mask] = ...: the chained form writes to a
    # temporary object under pandas Copy-on-Write and would silently leave the column at 0.
    df_useful.loc[in_unit, 'days_passed'] = unit_days * df_useful.loc[in_unit, 'temp_no'].astype(float)

In [18]:
# Attach the per-dimos attributes stored in kwd_ypes.csv.
ypes_csv_path = os.path.join('..', 'Data', 'other', 'mappings', 'kwd_ypes.csv')
df_ypes = pd.read_csv(ypes_csv_path)
# Inner join (the pandas default): listings whose dimos has no row in df_ypes are dropped here.
df_useful = df_useful.merge(df_ypes, on='dimos', how='inner')

In [19]:
# Row-level plausibility filters; a listing has to pass every one of them to be kept.
price_per_area = df_useful['price_per_area']
rate_in_range = (price_per_area >= 4) & (price_per_area <= 25)
price_high_enough = df_useful['price'] >= 100
not_a_building = df_useful['type'] != "Κτίριο"
dimos_is_valid = df_useful['dimos'] != "WRONG"
at_most_three_levels = df_useful['level_length'] <= 3
# Only rows of type "Διαμέρισμα" may list more than one level.
apartment_or_single_level = (df_useful['type'] == "Διαμέρισμα") | (df_useful['level_length'] == 1)

df_clean = df_useful[
    rate_in_range
    & price_high_enough
    & not_a_building
    & dimos_is_valid
    & at_most_three_levels
    & apartment_or_single_level
]

In [20]:
# Persist the filtered listings next to the raw data, one file per run date.
# 'utf-8-sig' writes a byte-order mark at the start of the file; the index is not written.
clean_csv_path = os.path.join('..', 'Data', 'xrysi_eykairia', 'clean', f'xe_clean_{date_to_run}.csv')
df_clean.to_csv(clean_csv_path, encoding='utf-8-sig', index=False)

In [21]:
# Report how many rows the plausibility filters removed.
rows_before_filter = len(df_useful)
rows_after_filter = len(df_clean)
print(f"Original: {rows_before_filter} rows.\nFiltered: {rows_after_filter} rows.\nRemoved: {rows_before_filter - rows_after_filter} rows.")

Original: 5771 rows.
Filtered: 5418 rows.
Removed: 353 rows.
