# Deep Learning practice

### Transformación de los datos

In [1]:
import numpy as np
import pandas as pd

# Read the three splits with identical parsing options:
# ';' as field separator and '.' as decimal mark.
train, val, test = (
    pd.read_csv(csv_path, sep=';', decimal='.')
    for csv_path in ('./train.csv', './val.csv', './test.csv')
)

# NOTE(review): image loading and the removal of 'Unnamed: 0' are disabled
# below; presumably they belong to another stage of the pipeline -- confirm,
# then delete this commented-out code.
#images = np.load('images.npy')
#train_img = images[train['Unnamed: 0']]
#val_img = images[val['Unnamed: 0']]
#test_img = images[test['Unnamed: 0']]

#train = train.drop(columns=['Unnamed: 0'])
#val = val.drop(columns=['Unnamed: 0'])
#test = test.drop(columns=['Unnamed: 0'])

Ahora separo las variables descriptivas (texto libre) de las variables numéricas y categóricas para tratarlas aparte.

In [2]:
# Free-text / descriptive columns, handled separately from the numeric and
# categorical features. 'Unnamed: 0' (the row id coming from the original
# dataset) is kept in both subsets.
desc_columns = ['Unnamed: 0','Name','Summary','Space','Description','Neighborhood Overview','Notes','Transit','Access',
                'Interaction','House Rules','Host About','Host Verifications','Street','Amenities','Features']

# Everything that is not descriptive, with the row id first.
numcat_columns = ['Unnamed: 0']+[c for c in train.columns if c not in desc_columns]

# The numeric/categorical subsets get new columns assigned in later cells, so
# take an explicit copy: assigning into a plain column slice raises
# SettingWithCopyWarning.
train_desc = train[desc_columns]
train = train[numcat_columns].copy()

val_desc = val[desc_columns]
val = val[numcat_columns].copy()

test_desc = test[desc_columns]
test = test[numcat_columns].copy()

Imputamos los valores ausentes con la media (variables numéricas) o la moda (variables categóricas), calculadas sobre train.

In [3]:
def get_input_data(data):
    """Compute one imputation value per column of ``data``.

    Numeric columns use their mean; every other column uses its most frequent
    value (the first one in sorted order when several values tie).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame the fill values are derived from.

    Returns
    -------
    list
        Scalar fill values, in the same order as ``data.columns``. A
        non-numeric column without any non-null value yields ``np.nan``.
    """
    input_values = []

    for col in data.columns:
        series = data[col]
        if pd.api.types.is_numeric_dtype(series):
            input_values.append(series.mean())
        else:
            # mode() returns a Series (several entries on ties, empty when the
            # column is all-null). Handing that Series to fillna would align
            # it on the index and fill almost nothing, so reduce it to a
            # single scalar here.
            modes = series.mode()
            input_values.append(modes.iloc[0] if len(modes) else np.nan)

    return input_values

def apply_input_data(data, input_values):
    """Fill the missing values of ``data`` column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is left untouched; a filled copy is returned.
    input_values : sequence
        One fill value per column, paired positionally with ``data.columns``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose nulls are replaced by the matching fill value.
    """
    # Work on a copy: the argument may be a slice of another frame, and writing
    # into it mutates the caller's data and raises SettingWithCopyWarning.
    filled = data.copy()

    for col, fill_value in zip(filled.columns, input_values):
        # A Series fill value (e.g. a raw ``mode()`` result) would be aligned
        # on the index by fillna instead of being applied to every row, so use
        # its first entry as the scalar fill value.
        if isinstance(fill_value, pd.Series):
            if fill_value.empty:
                continue
            fill_value = fill_value.iloc[0]
        filled[col] = filled[col].fillna(fill_value)

    return filled

In [4]:
# The fill values come from the training split only and are reused for every
# split, so validation and test data never influence the imputation.
input_data = get_input_data(train)

train, val, test = [apply_input_data(split, input_data) for split in (train, val, test)]

Genero valores numéricos para las variables categóricas aprovechando y readaptando el trabajo que realicé para el módulo de ML

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Keep only the first comma-separated component of the host location
# (a value that is still missing at this point becomes the string 'nan').
for frame in (train, val, test):
    frame['Host Location'] = frame['Host Location'].apply(lambda x: str(x).split(',')[0])

# One encoder per nominal column, fitted on the training split only.
# handle_unknown='ignore' encodes a category that only appears in val/test as
# an all-zero row; with the default ('error') transform() raises on it.
encoders = {
    column: OneHotEncoder(handle_unknown='ignore').fit(train[column].values.reshape(-1,1))
    for column in (
        'Host Location',
        'Calendar last Scraped',
        'Neighbourhood Cleansed',
        'Neighbourhood Group Cleansed',
        'Zipcode',
        'Property Type',
    )
}

In [6]:
# Ordinal codes for the ordered categorical columns. A value that is missing
# from a mapping becomes null.
# NOTE(review): 'strict' shares code 4 with the super_strict policies and no
# policy uses 3 -- confirm this is intentional.
cancellation_policy_codes = {
    'super_strict_60': 4,
    'super_strict_30': 4,
    'strict': 4,
    'moderate': 2,
    'flexible': 1,
}

room_type_codes = {
    'Shared room': 1,
    'Private room': 2,
    'Entire home/apt': 3
}

host_response_time_codes = {
    'a few days or more': 1,
    'within a day': 2,
    'within a few hours': 3,
    'within an hour': 4
}


def _calendar_updated_score(value):
    """Score how recently the calendar was updated (5 = today ... 0 = no keyword found)."""
    text = str(value)
    # Order matters: "today" and "yesterday" both contain "day".
    for keyword, score in (('today', 5), ('yesterday', 4), ('day', 3), ('week', 2), ('month', 1)):
        if keyword in text:
            return score
    return 0


dfs = []
for df in (train,val,test):

    for column, encoder in encoders.items():
        encoded = encoder.transform(df[column].values.reshape(-1,1))
        # Label each indicator column '<source column>_<category>' so the
        # labels are unique and traceable (previously every one was 0.0), and
        # concatenate frames instead of stacking arrays so the remaining
        # columns keep their dtypes.
        onehot = pd.DataFrame(
            encoded.toarray(),
            columns=['{}_{}'.format(column, category) for category in encoder.categories_[0]],
            index=df.index,
        )
        df = pd.concat([df.drop(columns=[column]), onehot], axis=1)

    df['Cancellation Policy'] = df['Cancellation Policy'].map(cancellation_policy_codes)

    df['Room Type'] = df['Room Type'].map(room_type_codes)

    # Unknown or missing response times are coded as 0.
    df['Host Response Time'] = df['Host Response Time'].map(host_response_time_codes).fillna(0)

    df['Calendar Updated'] = df['Calendar Updated'].apply(_calendar_updated_score)

    dfs.append(df)

train, val, test = dfs

Compruebo que no quedan valores nulos en mi dataset.

In [7]:
# True only when none of the three frames contains a null value.
# The whole frame is checked at once: this stops at the first frame with a
# null and stays correct when column labels repeat (df[col] then returns a
# DataFrame, and summing it adds up column labels instead of counting nulls).
v = not any(df.isnull().values.any() for df in (train, val, test))

if v: print('success!')
else: print('null found!')

success!


Reescribo mis datos para pasar a la siguiente fase.

Esta vez no necesito guardarme los índices, ya que los índices correctos son los que proceden del dataset original y ahora están en la columna 'Unnamed: 0'

In [8]:
# NOTE(review): the first three paths are the same files this notebook reads
# in its first cell, so the inputs are overwritten and a second
# "Restart & Run All" starts from already-transformed data.
for path, frame in (
    ('./train.csv', train),
    ('./val.csv', val),
    ('./test.csv', test),
    ('./train_desc.csv', train_desc),
    ('./val_desc.csv', val_desc),
    ('./test_desc.csv', test_desc),
):
    # index=False: the row id is already stored in the 'Unnamed: 0' column.
    frame.to_csv(path, sep=';', decimal='.', index=False)