In [9]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re
import time
import datetime

In [10]:
# Raw listings as read from disk; the training file's 'w' column is discarded immediately.
train_data = pd.read_csv('train.csv').drop(columns='w')
test_data = pd.read_csv('test.csv')
# The cleaning cells write into these copies, so the raw frames stay untouched.
train, test = train_data.copy(), test_data.copy()

### Functions for data cleaning

In [11]:
def f_square_meters(value):
    """
    Clean a raw ``square_meters`` value.

    The value is truncated to an integer. Surfaces of 13 or less are treated
    as outliers and replaced by the missing marker, and so is anything that
    cannot be read as a number (NaN, None, infinity, non-numeric text).

    Parameters
    ----------
    value : number or numeric string

    Returns
    -------
    int or str
        The surface as an ``int`` when it is greater than 13, otherwise '--'.
    """
    try:
        area = int(value)
    except (TypeError, ValueError, OverflowError):
        # int() raises on NaN/None/inf/garbage: report it as missing instead of
        # aborting the whole ``.apply``
        return '--'
    return area if area > 13 else '--'

def f_bathrooms_number(label):
    """
    Encode the bathrooms label as a number.

    '1', '2' and '3' map to the matching integer and '3+' maps to 4.
    Any other value yields the missing marker '--'.
    """
    bathroom_codes = {'1': 1, '2': 2, '3': 3, '3+': 4}
    return bathroom_codes.get(label, '--')

def f_lift(value):
    """
    Encode the lift column.

    'yes' -> 1 (there is an elevator), 'no' -> 0 (there is none),
    anything else -> '--' (missing).
    """
    lift_codes = {'yes': 1, 'no': 0}
    return lift_codes.get(value, '--')

def f_rooms_number(value):
    """
    Encode the rooms label as an integer.

    The open-ended label '5+' becomes 6; every other value is passed to
    ``int`` as is (so a value ``int`` cannot parse raises).
    """
    return 6 if value == '5+' else int(value)

def f_total_floors_in_building(value):
    """
    Encode the number of floors of the building.

    '1 floor' becomes 1 and any string made only of decimal digits becomes
    the matching integer. Everything else (NaN, other text) yields the
    missing marker '--'.

    The previous implementation compared against a hand-written list of the
    values '2'..'26', so a building with a floor count outside that list
    (e.g. '27') was silently reported as missing.

    Parameters
    ----------
    value : str or NaN

    Returns
    -------
    int or str
        The floor count, or '--' when it cannot be determined.
    """
    if value == '1 floor':
        return 1
    # only plain digit strings are accepted; numbers that are not strings were
    # never matched by the original list either
    if isinstance(value, str) and value.isdecimal():
        return int(value)
    return '--'

"""
def f_car_parking(label): # first fill null with .fillna('--')
    '''
    if there is a private parking 2
    if it is shared 1
    if there is no parking 0
    '''
    if label == '--':
        return '--'
    
    if label == 'no':
        return 0
    elif label in ['1 in garage/box','1 in garage/box, 1 in shared parking', '1 in garage/box, 5 in shared parking', '1 in garage/box, 2 in shared parking',
                  '1 in garage/box, 3 in shared parking', '2 in garage/box', '2 in garage/box, 1 in shared parking', '2 in garage/box, 2 in shared parking',
                   '2 in garage/box, 3 in shared parking','2 in garage/box, 8 in shared parking', '7 in garage/box, 3 in shared parking', '5 in garage/box',
                  '2 in garage/box, 1 in shared parking', '1 in garage/box, 4 in shared parking','2 in garage/box, 16 in shared parking']:
        return 2
    else:
        return 1
"""

def f_car_parking(label):
    """
    Flag whether the listing has any parking.

    'no' -> 0, the missing marker '--' is passed through unchanged, and
    every other label -> 1.
    """
    if label == '--':
        return '--'
    return 0 if label == 'no' else 1
    
def _parking_spaces(text, keyword):
    """
    Count the parking spaces of one kind described by a ``car_parking`` label.

    Labels look like '2 in garage/box, 1 in shared parking': one
    comma-separated segment per kind of parking. The segment containing
    ``keyword`` is located and the first run of digits in it is returned.

    Returns an ``int`` count (0 when the label is 'no' or does not mention
    ``keyword``), or '--' when the label is missing or carries no number.
    """
    if text == 'no':
        return 0
    if text == '--' or not isinstance(text, str):
        return '--'
    for segment in text.split(','):
        if keyword in segment:
            digits = re.search(r'\d+', segment)
            # always an int, so the column no longer mixes 0 with '1', '2', ...
            return int(digits.group()) if digits else '--'
    return 0

def f_private_car(text):
    """
    Number of private (garage/box) parking spaces in a ``car_parking`` label.

    Expects NaN to have been replaced by '--' beforehand. Returns an ``int``,
    or '--' when the information is missing.
    """
    return _parking_spaces(text, 'garage/box')

def f_shared_car(text):
    """
    Number of shared parking spaces in a ``car_parking`` label.

    Expects NaN to have been replaced by '--' beforehand. Returns an ``int``,
    or '--' when the information is missing.
    """
    return _parking_spaces(text, 'shared')
    
    
def f_availability(label):
    """
    Encode the availability of a listing as a number: the later, the bigger.

    'available' and any non-string value (e.g. NaN) give 0. Any other string
    must carry a ``dd/mm/YYYY`` date starting at character 15; the result is
    the time elapsed between 1 January 2023 and that date, expressed in units
    of 1,000,000 seconds and truncated to one decimal.

    The difference is computed on naive ``datetime`` objects, so it does not
    depend on the timezone or daylight-saving rules of the machine running
    the notebook (``time.mktime`` interprets dates in local time).

    Raises
    ------
    ValueError
        If the text after the prefix is not a ``dd/mm/YYYY`` date.
    """
    if label == 'available' or not isinstance(label, str):
        return 0
    # assumes the label reads 'available from dd/mm/YYYY' (15-character
    # prefix) - TODO confirm against the raw data
    available_on = datetime.datetime.strptime(label[15:], "%d/%m/%Y")
    reference = datetime.datetime(2023, 1, 1)
    elapsed_seconds = (available_on - reference).total_seconds()
    return int(elapsed_seconds / 100000) / 10


def f_condominium_fees(value): # first fill none with '--'
    """
    Encode the condominium fees.

    The missing marker '--' is passed through, 'No condominium fees'
    becomes 1, and any other value is converted with ``int``.

    NOTE(review): the earlier docstring said "no fees" should give 0, but
    the code returns 1 - confirm which one is intended.
    """
    if value == '--':
        return '--'
    return 1 if value == 'No condominium fees' else int(value)

def f_year_of_construction(value): # first fill nan with '--'
    """
    Encode the year of construction.

    The missing marker '--' is passed through; any other value is converted
    with ``int``. The year is returned as is: the grouping into decades
    (``// 10``) mentioned by the earlier docstring is not applied.
    """
    return '--' if value == '--' else int(value)


def f_condition(label):
    """
    Encode the state of the property on an ordinal scale.

    'new / under construction'   -> 4
    'excellent / refurbished'    -> 3
    'good condition / liveable'  -> 2
    'to be refurbished'          -> 1
    anything else                -> '--' (missing)
    """
    condition_rank = {
        'new / under construction': 4,
        'excellent / refurbished': 3,
        'good condition / liveable': 2,
        'to be refurbished': 1,
    }
    return condition_rank.get(label, '--')

# Distinct zone labels of the raw test frame (zone__) and raw train frame (zone___).
zone__ = test_data['zone'].unique()
zone___ = train_data['zone'].unique()
def f_zone(value):
    """
    Keep a zone only when it occurs in both the test and the train data.

    Returns ``value`` unchanged when it is found in both ``zone__`` and
    ``zone___``, otherwise the missing marker '--'.
    """
    seen_in_both = value in zone__ and value in zone___
    return value if seen_in_both else '--'

def f_floor(value):
    """
    Encode the floor of the property as a number (it could also be used as
    a qualitative variable).

    'ground floor' -> 0, 'mezzanine' -> 1.5, 'semi-basement' -> -1; any
    other value is converted with ``int``.

    NOTE(review): 1.5 ranks a mezzanine above the first floor - confirm
    this is the intended position on the scale.
    """
    named_floors = {'ground floor': 0, 'mezzanine': 1.5, 'semi-basement': -1}
    if value in named_floors:
        return named_floors[value]
    return int(value)

def f_heating_centralized(value):
    """
    Flag centralized heating: 1 for 'central', 0 for every other value
    ('independent' and missing values alike).
    """
    return 1 if value == 'central' else 0


def f_energy_efficiency_class(label):
    """
    Encode the energy class on an ordinal scale, best class highest:
    'a' -> 8, 'b' -> 7, ... 'g' -> 2. Any other value yields '--' (missing).
    """
    class_rank = {'a': 8, 'b': 7, 'c': 6, 'd': 5, 'e': 4, 'f': 3, 'g': 2}
    return class_rank.get(label, '--')
    
def f_other_features(label):
    """
    Split the raw ``other_features`` text into a list of feature names.

    The missing marker '--' gives an empty list. Otherwise every space is
    removed and the text is split on '|'.
    """
    return [] if label == '--' else label.replace(" ", "").split('|')

def f_def_other(label, element):
    """
    Dummy value for one feature: 1 when ``element`` is contained in
    ``label`` (the feature list of a listing), 0 otherwise.
    """
    return 1 if element in label else 0

def f_exp_finder(row):
    """
    Add the four exposure dummies 'est', 'north', 'west' and 'south' to a row.

    Every entry of ``row.other_features`` is searched for the words north,
    south, east and west. Each dummy is set to 1 when its word appears in at
    least one entry and to 0 otherwise.

    The east dummy keeps its column name 'est', but it is now driven by the
    word 'east', which is what the regular expression actually yields; the
    previous lookup compared the matches against 'est', so that column was
    always 0.

    Parameters
    ----------
    row : mapping with an ``other_features`` attribute (a DataFrame row)

    Returns
    -------
    The same ``row`` object with the four dummy entries set.
    """
    # (output column, word searched in the feature text)
    cardinal_columns = (('est', 'east'), ('north', 'north'),
                        ('west', 'west'), ('south', 'south'))
    found = set()
    for feature in row.other_features:
        found.update(re.findall(r'north|south|east|west', feature))
    for column, word in cardinal_columns:
        row[column] = 1 if word in found else 0
    return row

def f_other_features_dummy_test(df, cardinals = False):
    """
    One-hot encode the ``other_features`` lists of ``df``.

    A 0/1 column is added to ``df`` for every distinct feature name, except
    names starting with 'w' or 'ex' (presumably the window and exposure
    descriptions - confirm) and empty names left behind by a stray '|'.
    Columns are added in alphabetical order so the layout of the frame does
    not depend on set iteration order.

    Parameters
    ----------
    df : DataFrame whose 'other_features' column holds lists of names;
        the dummy columns are added to it in place.
    cardinals : bool
        When true, ``f_exp_finder`` is also applied to every row to add the
        'est', 'north', 'west' and 'south' dummies.

    Returns
    -------
    (DataFrame, set)
        The encoded frame and the set of feature names that got a column.
    """
    distinct = {name for names in df['other_features'] for name in names}
    others = sorted(
        name for name in distinct
        # startswith() is safe on '', where indexing with [0] would raise
        if name and not name.startswith('w') and not name.startswith('ex')
    )

    for thing in others:
        df[thing] = df['other_features'].apply(f_def_other, args = [thing])
    if cardinals:
        df = df.apply(f_exp_finder, axis = 1) # adds the north, south, est and west dummies
    return df, set(others)

def f_other_features_dummy_train(df, others ,cardinals = False):
    """
    One-hot encode ``other_features`` using a given list of feature names.

    A 0/1 column is added to ``df`` for every name in ``others`` (typically
    the names returned by ``f_other_features_dummy_test``), whether or not
    the name occurs in ``df``. Columns are added in alphabetical order so the
    layout of the frame does not depend on set iteration order.

    The feature names found in ``df`` itself are not used: the loop that
    classified them only filled unused lists and could raise on an empty
    name, so it has been removed.

    Parameters
    ----------
    df : DataFrame whose 'other_features' column holds lists of names;
        the dummy columns are added to it in place.
    others : iterable of str
        Feature names to encode.
    cardinals : bool
        When true, ``f_exp_finder`` is also applied to every row to add the
        'est', 'north', 'west' and 'south' dummies.

    Returns
    -------
    DataFrame
        The encoded frame.
    """
    for thing in sorted(others):
        df[thing] = df['other_features'].apply(f_def_other, args = [thing])
    if cardinals:
        df = df.apply(f_exp_finder, axis = 1) # adds the north, south, est and west dummies
    return df

#### Sub areas of Milano

In [12]:
# Lookup table from the lowercase zone label found in the listings (key) to a
# coarser area name (value). It is read by zones_smaller() to build the
# 'zone_second' column; labels missing from this table end up as '--' there.
# NOTE(review): 'porta vittoria' is assigned to 'Centro Storico' although a
# 'Porta Vittoria' area is used for other labels - confirm this is intended.
dic_to_zone = {
    'certosa': 'Gallaratese',
    'carrobbio': 'Centro Storico',
    'brera': 'Brera-Garibaldi',
    'paolo sarpi': 'Brera-Garibaldi',
    'frua': 'Portello',
    'corvetto': 'Corvetto-Rogoredo',
    'turro': 'Bicocca-Greco',
    'san siro': 'San Siro',
    'bruzzano': 'Affori-Bruzzano',
    'rubattino': 'Lambrate',
    'città studi': 'Lambrate',
    'comasina': 'Affori-Bruzzano',
    'porta vittoria': 'Centro Storico',
    'dergano': 'Affori-Bruzzano',
    'vigentino - fatima': 'Porta Romana-Crocetta',
    'tre castelli - faenza': 'Brera-Garibaldi',
    'bande nere': 'San Siro',
    'vialba': 'Gallaratese',
    'affori': 'Affori-Bruzzano',
    'monte rosa - lotto': 'San Siro',
    'bologna - sulmona': 'Porta Romana-Crocetta',
    'centrale': 'Centrale',
    'quartiere forlanini': 'Porta Vittoria',
    'martini - insubria': 'Porta Vittoria',
    'sempione': 'City Life-Sempione',
    'maggiolina': 'Bicocca-Greco',
    'corso san gottardo': 'Navigli',
    'navigli - darsena': 'Navigli',
    'pasteur': 'Bicocca-Greco',
    'cimiano': 'Lambrate',
    'gambara': 'Vercelli-Washington',
    "sant'ambrogio": 'Brera-Garibaldi',
    'quartiere adriano': 'Bicocca-Greco',
    'figino': 'Gallaratese',
    'giambellino': 'Lorenteggio-Giambellino',
    'famagosta': 'Navigli',
    'ponte nuovo': 'Lambrate',
    'ghisolfa - mac mahon': 'Bovisa-Ghisolfa',
    'cermenate - abbiategrasso': 'Navigli',
    'cantalupa - san paolo': 'Lorenteggio-Giambellino',
    'plebisciti - susa': 'Porta Vittoria',
    'dezza': 'Porta Vittoria',
    'baggio': 'San Siro',
    'piave - tricolore': 'Centro Storico',
    'piazzale siena': 'San Siro',
    'quintosole - chiaravalle': 'Quinto Sole',
    'solari': 'Brera-Garibaldi',
    'lodi - brenta': 'Porta Romana-Crocetta',
    'bisceglie': 'Lorenteggio-Giambellino',
    'cascina dei pomi': 'Centrale',
    'palestro': 'Centro Storico',
    'piazza napoli': 'Vercelli-Washington',
    'amendola - buonarroti': 'Portello',
    'quarto oggiaro': 'Affori-Bruzzano',
    'ticinese': 'Navigli',
    'buenos aires': 'Corso Buenos Aires',
    'argonne - corsica': 'Lambrate',
    'repubblica': 'Turati-Repubblica',
    'gallaratese': 'Gallaratese',
    'bovisa': 'Bovisa-Ghisolfa',
    'farini': 'Isola',
    'morgagni': 'Corso Buenos Aires',
    'vercelli - wagner': 'Vercelli-Washington',
    'barona': 'Navigli',
    'corso genova': 'Porta Genova',
    'chiesa rossa': 'Navigli',
    'cenisio': 'Porta Genova',
    'crescenzago': 'Lambrate',
    'quadronno - crocetta': 'Porta Romana-Crocetta',
    'guastalla': 'Porta Romana-Crocetta',
    'bocconi': 'Porta Romana-Crocetta',
    'cascina merlata - musocco': 'Gallaratese',
    'porta nuova': 'Brera-Garibaldi',
    'muggiano': 'Gallaratese',
    'duomo': 'Centro Storico',
    'ripamonti': 'Gratosoglio-Ripamonti',
    'greco - segnano': 'Bicocca-Greco',
    'bicocca': 'Bicocca-Greco',
    'molise - cuoco': 'Porta Vittoria',
    'cadore': 'Centro Storico',
    'villa san giovanni': 'Bicocca-Greco',
    'pezzotti - meda': 'Navigli',
    'indipendenza': 'Centro Storico',
    'precotto': 'Bicocca-Greco',
    'washington': 'Vercelli-Washington',
    'santa giulia': 'Porta Vittoria',
    'cadorna - castello': 'Centro Storico',
    'niguarda': 'Bicocca-Greco',
    'lambrate': 'Lambrate',
    'missori': 'Centro Storico',
    'gorla': 'Bicocca-Greco',
    'bignami - ponale': 'Bicocca-Greco',
    'melchiorre gioia': 'Centrale',
    'rovereto': 'Bicocca-Greco',
    'san vittore': 'Brera-Garibaldi',
    'city life': 'City Life-Sempione',
    'zara': 'Isola',
    'casoretto': 'Lambrate',
    "ca' granda": 'Bicocca-Greco',
    'ascanio sforza': 'Navigli',
    'inganni': 'Lorenteggio-Giambellino',
    'prato centenaro': 'Bovisa-Ghisolfa',
    'garibaldi - corso como': 'Brera-Garibaldi',
    'viale ungheria - mecenate': 'Porta Vittoria',
    'isola': 'Isola',
    'quartiere feltre': 'Lambrate',
    'istria': 'Isola',
    'primaticcio': 'Vercelli-Washington',
    'portello - parco vittoria': 'Portello',
    'quinto romano': 'Quinto Romano',
    "porta romana - medaglie d'oro": 'Porta Romana-Crocetta',
    'moscova': 'Brera-Garibaldi',
    'bovisasca': 'Affori-Bruzzano',
    'porta venezia': 'Corso Buenos Aires',
    'montenero': 'Centro Storico',
    'monte stella': 'Portello',
    'turati': 'Turati-Repubblica',
    'de angeli': 'Vercelli-Washington',
    'tripoli - soderini': 'Lorenteggio-Giambellino',
    'arena': 'Brera-Garibaldi',
    'vincenzo monti': 'City Life-Sempione',
    'san carlo': 'Centro Storico',
    'parco trotter': 'Bicocca-Greco',
    'rogoredo': 'Corvetto-Rogoredo',
    'quartiere olmi': 'Quinto Romano',
    'lorenteggio': 'Lorenteggio-Giambellino',
    'borgogna - largo augusto': 'Centro Storico',
    'gratosoglio': 'Gratosoglio-Ripamonti',
    'ponte lambro': 'Corvetto-Rogoredo',
    'udine': 'Lambrate',
    'arco della pace': 'City Life-Sempione',
    'pagano': 'Portello',
    'roserio': 'Gallaratese',
    'trenno': 'Gallaratese',
    'via canelli': 'Portello',
    'lanza': 'Brera-Garibaldi',
    'ortica': 'Lambrate',
    'quarto cagnino': 'Gallaratese',
    'parco lambro': 'Lambrate',
    'qt8': 'Portello',
    'san babila': 'Centro Storico',
    'quadrilatero della moda': 'Centro Storico',
    'via calizzano': 'Portello'
}

def zones_smaller(zone):
    """
    Translate a zone label into its area through ``dic_to_zone``.

    Returns the mapped area name, or the missing marker '--' when the label
    is not a key of the table.
    """
    return dic_to_zone.get(zone, '--')

### Set Test data

In [13]:
# Clean the test frame column by column: each raw column of test_data goes
# through its f_* function and the result is stored in the working copy `test`.
test['square_meters'] = test_data['square_meters'].apply(f_square_meters)
test['bathrooms_number'] = test_data['bathrooms_number'].apply(f_bathrooms_number)
test['lift'] = test_data['lift'].apply(f_lift)
test['rooms_number'] = test_data['rooms_number'].apply(f_rooms_number)
test['total_floors_in_building'] = test_data['total_floors_in_building'].apply(f_total_floors_in_building)
# Parking: NaN becomes '--' first because the parking functions expect that marker.
test['car_parking'] = test_data['car_parking'].fillna('--')

# Both counts must be derived from the raw label BEFORE 'car_parking' itself is
# overwritten by its f_car_parking encoding on the third line.
test['shared_parking'] = test['car_parking'].apply(f_shared_car)
test['private_parking'] = test['car_parking'].apply(f_private_car)
test['car_parking'] = test['car_parking'].apply(f_car_parking)

# Columns whose cleaner expects '--' are NaN-filled first, then encoded.
test['availability'] = test_data['availability'].apply(f_availability)
test['condominium_fees'] = test_data['condominium_fees'].fillna('--')
test['condominium_fees'] = test['condominium_fees'].apply(f_condominium_fees)
test['year_of_construction'] = test_data['year_of_construction'].fillna('--')
test['year_of_construction'] = test['year_of_construction'].apply(f_year_of_construction)
test['conditions'] = test_data['conditions'].apply(f_condition)

# 'zone' keeps only labels present in both raw frames (f_zone); 'zone_second'
# is the area looked up in dic_to_zone (zones_smaller).
test['zone'] = test_data['zone'].apply(f_zone)
test['zone_second'] = test_data['zone'].apply(zones_smaller)
test['floor'] = test_data['floor'].apply(f_floor)
test['heating_centralized'] = test_data['heating_centralized'].apply(f_heating_centralized)
test['energy_efficiency_class'] = test_data['energy_efficiency_class'].apply(f_energy_efficiency_class)
# other_features: NaN -> '--' -> list of names -> one dummy column per name.
test['other_features'] = test_data['other_features'].fillna('--')
test['other_features'] = test['other_features'].apply(f_other_features)
# True is the `cardinals` flag (exposure dummies are added too). others_test,
# the set of encoded feature names, is reused by the train cell below.
test, others_test = f_other_features_dummy_test(test ,True)

### Set Train data

In [14]:
# Clean the train frame with the same steps as the test frame: each raw column
# of train_data goes through its f_* function and is stored in `train`.
train['square_meters'] = train_data['square_meters'].apply(f_square_meters) 
train['bathrooms_number'] = train_data['bathrooms_number'].apply(f_bathrooms_number) 
train['lift'] = train_data['lift'].apply(f_lift) 
train['rooms_number'] = train_data['rooms_number'].apply(f_rooms_number) 
train['total_floors_in_building'] = train_data['total_floors_in_building'].apply(f_total_floors_in_building) 
# Parking: NaN becomes '--' first because the parking functions expect that marker.
train['car_parking'] = train_data['car_parking'].fillna('--')

# Both counts must be derived from the raw label BEFORE 'car_parking' itself is
# overwritten by its f_car_parking encoding on the third line.
train['shared_parking'] = train['car_parking'].apply(f_shared_car)
train['private_parking'] = train['car_parking'].apply(f_private_car)
train['car_parking'] = train['car_parking'].apply(f_car_parking)

# Columns whose cleaner expects '--' are NaN-filled first, then encoded.
train['availability'] = train_data['availability'].apply(f_availability)
train['condominium_fees'] = train_data['condominium_fees'].fillna('--')
train['condominium_fees'] = train['condominium_fees'].apply(f_condominium_fees)
train['year_of_construction'] = train_data['year_of_construction'].fillna('--')
train['year_of_construction'] = train['year_of_construction'].apply(f_year_of_construction)
train['conditions'] = train_data['conditions'].apply(f_condition)
# 'zone' keeps only labels present in both raw frames (f_zone); 'zone_second'
# is the area looked up in dic_to_zone (zones_smaller).
train['zone'] = train_data['zone'].apply(f_zone)
train['zone_second'] = train_data['zone'].apply(zones_smaller)
train['floor'] = train_data['floor'].apply(f_floor)
train['heating_centralized'] = train_data['heating_centralized'].apply(f_heating_centralized)
train['energy_efficiency_class'] = train_data['energy_efficiency_class'].apply(f_energy_efficiency_class)
# other_features: NaN -> '--' -> list of names -> one dummy column per name.
train['other_features'] = train_data['other_features'].fillna('--')
train['other_features'] = train['other_features'].apply(f_other_features)
# The dummy columns are built from others_test, the names collected while
# encoding the test frame, so the test cell above must have been run first.
train = f_other_features_dummy_train(train, others_test,True)