# Apartment Rent Data
Dataset source: https://www.kaggle.com/datasets/shashanks1202/apartment-rent-data/data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

from sklearn.model_selection import train_test_split

import re

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Load data

In [None]:
# Load the raw listings: the file is ';'-separated and cp1252-encoded.
# low_memory=False makes pandas read the file in one pass instead of chunks.
df = pd.read_csv(
    '../../Datasets/Apartement Rent Data/apartments_for_rent_classified_100K/apartments_for_rent_classified_100K.csv',
    encoding='cp1252',
    sep=';',
    low_memory=False,
)

In [None]:
df.drop(['time','price_display'],axis=1,inplace=True)

In [None]:
df.shape

In [None]:
df.dropna(subset=['price'],axis=0,inplace=True)

In [None]:
train,test = train_test_split(df,shuffle=True,test_size=0.1,random_state=42)

In [None]:
# Second split: carve a 20% evaluation subset out of `train`. The `test`
# frame produced by the first split is not touched here.
features = train.drop(columns='price')
target = train['price']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Data Preprocessing

In [None]:
train.info()

## Handling Missing Data

In [None]:
def extract_address(address):
    """Strip a leading house number from a street address.

    Removes an optional run of digits at the very start of ``address``
    together with the single space that follows it. Strings that do not
    start that way are returned unchanged.
    """
    leading_number = re.compile(r'^\d* ')
    return leading_number.sub('', address)

In [None]:
address_dict = X_train.address.dropna().apply(extract_address).value_counts().to_dict()

In [None]:
city_dict = (X_train.cityname.dropna().value_counts()-1).to_dict()

In [None]:
na_mean = train.isna().mean()
na_cat = na_mean[na_mean > 0]

In [None]:
na_cat

In [None]:
X_train.select_dtypes(include='number').hist(bins=100,figsize=(10,10))

In [None]:
# Column groups fed to the two imputers of the ColumnTransformer.
# 'latitude' and 'longitude' are numeric in dtype but are routed to the
# categorical group; 'id' is left out of both groups.
# The lists are built in DataFrame column order (not via a set) so the
# resulting feature order is identical on every run.
excluded_from_numeric = {'latitude', 'longitude', 'id'}
feature_numerical = [
    column
    for column in X_train.select_dtypes(include='number').columns
    if column not in excluded_from_numeric
]
feature_categorical = list(X_train.select_dtypes(exclude='number').columns) + ['latitude', 'longitude']

In [None]:
# Impute numeric features with the median and categorical features with the
# most frequent value. add_indicator=True also emits missing-value indicator
# columns. Columns not listed in either group are passed through untouched.
numeric_imputer = SimpleImputer(strategy='median', add_indicator=True)
categorical_imputer = SimpleImputer(strategy='most_frequent', add_indicator=True)

preprocessor = ColumnTransformer(
    transformers=(
        ('numeric_imputer', numeric_imputer, feature_numerical),
        ('categorical_imputer', categorical_imputer, feature_categorical),
    ),
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [None]:
df.select_dtypes(include='number').columns

In [None]:
feature_categorical

In [None]:
preprocessor.set_output(transform='pandas')

In [None]:
preprocessor.fit(X_train)

In [None]:
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
X_train.shape,X_test.shape

In [None]:
X_train.info()

## Id

In [None]:
X_train.drop('id',axis=1,inplace=True)

In [None]:
X_test.drop('id',axis=1,inplace=True)

## Category

In [None]:
unique_categories = df.category.unique()

In [None]:
# Flatten every '/'-separated category string into a single list of its
# segments (duplicates are kept at this stage).
all_categories = [
    segment
    for path in unique_categories
    for segment in path.split('/')
]

In [None]:
all_categories

In [None]:
unique_categories = list(set(all_categories))

In [None]:
unique_categories

In [None]:
def encode_category(df, categories=None):
    """Multi-hot encode the '/'-separated ``category`` column.

    For every name in ``categories`` a column ``cat_<name>`` is added that is
    1 when the name is one of the '/'-separated segments of the row's
    ``category`` value and 0 otherwise. Matching is done on whole segments,
    so a name never matches merely because it is a substring of a longer
    segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``category`` column. It is modified in place.
    categories : iterable of str, optional
        Segment names to encode. Defaults to the module-level
        ``unique_categories``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added and
        ``category`` dropped.
    """
    if categories is None:
        categories = unique_categories
    # Split once; each row becomes the list of its path segments.
    segments = df['category'].str.split('/')
    for name in categories:
        df[f'cat_{name}'] = segments.apply(
            lambda parts, name=name: 1 if name in parts else 0
        )
    df.drop('category', axis=1, inplace=True)
    return df

In [None]:
X_train = encode_category(X_train)

In [None]:
X_test = encode_category(X_test)

In [None]:
X_train.info()

## Amenities

In [None]:
unique_amenities = X_train.amenities.unique()

In [None]:
unique_amenities

In [None]:
# Flatten every ','-separated amenities string into a single list of
# individual amenity names (duplicates are kept at this stage).
all_amenities = [
    item
    for listing in unique_amenities
    for item in listing.split(',')
]

In [None]:
unique_amenities = list(set(all_amenities))

In [None]:
unique_amenities

In [None]:
def encode_amenities(df, amenity_names=None):
    """Multi-hot encode the ','-separated ``amenities`` column.

    For every name in ``amenity_names`` a column ``ame_<name>`` is added that
    is 1 when the name is one of the ','-separated items of the row's
    ``amenities`` value and 0 otherwise. Matching is done on whole items, so
    a name never matches merely because it is a substring of a longer item.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``amenities`` column. It is modified in place.
    amenity_names : iterable of str, optional
        Amenity names to encode. Defaults to the module-level
        ``unique_amenities``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added and
        ``amenities`` dropped.
    """
    if amenity_names is None:
        amenity_names = unique_amenities
    # Split once; each row becomes the list of its individual amenities.
    items = df['amenities'].str.split(',')
    for name in amenity_names:
        df[f'ame_{name}'] = items.apply(
            lambda listed, name=name: 1 if name in listed else 0
        )
    df.drop('amenities', axis=1, inplace=True)
    return df

In [None]:
X_train = encode_amenities(X_train)
X_test = encode_amenities(X_test)

In [None]:
X_train.info()

## Bathrooms

In [None]:
X_train.bathrooms.head(5)

In [None]:
# Cast the bathroom counts to integers in both splits. astype(int) discards
# any fractional part.
# NOTE(review): confirm no fractional values (e.g. 1.5) need to be preserved.
for frame in (X_train, X_test):
    frame['bathrooms'] = frame['bathrooms'].astype(int)

## Bedrooms

In [None]:
X_train.bedrooms.head(5)

In [None]:
# Cast the bedroom counts to integers in both splits.
for frame in (X_train, X_test):
    frame['bedrooms'] = frame['bedrooms'].astype(int)

## Currency

In [None]:
X_train.currency.head(5)

In [None]:
X_train.currency.unique()

In [None]:
X_test.currency.unique()

In [None]:
X_train.drop('currency',axis=1,inplace=True)
X_test.drop('currency',axis=1,inplace=True)

## Fee

In [None]:
X_train.fee.head(5)

In [None]:
X_train.fee.unique()

In [None]:
X_train.fee = X_train.fee.map({'No':0,'Yes':1})
X_test.fee = X_test.fee.map({'No':0,'Yes':1})

In [None]:
X_train.fee.unique()

## Has Photo

In [None]:
X_train.has_photo.unique()

In [None]:
def encode_has_photo(df, categories=None):
    """One-hot encode the ``has_photo`` column.

    For every value in ``categories`` a column ``has_photo_<value>`` is added
    that is 1 where ``has_photo`` equals that value and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``has_photo`` column. It is modified in place.
    categories : iterable, optional
        Values to encode. Pass the values seen in the training frame when
        encoding any other frame, so that every frame gets the same columns.
        When omitted, the distinct values of ``df['has_photo']`` are used in
        sorted order, which keeps the column order independent of row order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added and
        ``has_photo`` dropped.
    """
    if categories is None:
        categories = sorted(df['has_photo'].unique(), key=str)
    for value in categories:
        # Equality, not substring containment: each row matches one value.
        df[f'has_photo_{value}'] = (df['has_photo'] == value).astype('int64')
    df.drop('has_photo', axis=1, inplace=True)
    return df

In [None]:
X_train = encode_has_photo(X_train)

In [None]:
X_test = encode_has_photo(X_test)

## Pets Allowed

In [None]:
X_train.pets_allowed.unique()

In [None]:
pets_allowed_unique = ['Cats','Dogs']

In [None]:
def encode_pets_allowed(df):
    """Expand ``pets_allowed`` into one indicator column per animal.

    For each name in the module-level ``pets_allowed_unique`` a column
    ``pets_allowed_<name>`` is added, set to 1 when the name occurs inside
    the row's ``pets_allowed`` string and 0 otherwise. The original column
    is then dropped. ``df`` is modified in place and returned.
    """
    def allows(animal):
        # Build the per-row test for one animal name.
        return lambda entry: 1 if animal in entry else 0

    for animal in pets_allowed_unique:
        df[f'pets_allowed_{animal}'] = df['pets_allowed'].apply(allows(animal))
    df.drop(columns='pets_allowed', inplace=True)
    return df

In [None]:
X_train = encode_pets_allowed(X_train)
X_test = encode_pets_allowed(X_test)

In [None]:
X_train.info()

In [None]:
X_train.pets_allowed_Cats.sum(),X_train.pets_allowed_Dogs.sum()

## Price Type

In [None]:
X_train.price_type.unique()

In [None]:
price_type_unique = ['Monthly','Weekly']

In [None]:
def encode_price_type(df):
    """Expand ``price_type`` into one indicator column per billing period.

    For each label in the module-level ``price_type_unique`` a column
    ``pt_<label>`` is added, set to 1 when the label occurs inside the row's
    ``price_type`` string and 0 otherwise. The original column is then
    dropped. ``df`` is modified in place and returned.
    """
    def mentions(label):
        # Build the per-row test for one price-type label.
        return lambda text: 1 if label in text else 0

    for label in price_type_unique:
        df[f'pt_{label}'] = df['price_type'].apply(mentions(label))
    df.drop(columns='price_type', inplace=True)
    return df

In [None]:
X_train = encode_price_type(X_train)
X_test = encode_price_type(X_test)

In [None]:
X_train[['pt_Monthly','pt_Weekly']].sum()

In [None]:
# drop?

## Address

In [None]:
address_dict

In [None]:
def encode_address(df):
    """Replace ``address`` with a frequency taken from ``address_dict``.

    Each address is normalised with ``extract_address`` before the lookup so
    that it matches the keys of the module-level ``address_dict``, which is
    built from addresses normalised the same way. Addresses whose normalised
    form is not a key of ``address_dict`` are encoded as 0.

    ``df`` is modified in place (``address_encoded`` is added, ``address`` is
    dropped) and is also returned.
    """
    df['address_encoded'] = df['address'].apply(
        # Looking up the raw address would miss the normalised keys.
        lambda raw: address_dict.get(extract_address(raw), 0)
    )
    df.drop('address', axis=1, inplace=True)
    return df

In [None]:
X_train =  encode_address(X_train)
X_test = encode_address(X_test)

In [None]:
sample = '8215 S.W 72nd Avenue' 

In [None]:
re.findall(r'^\d*',sample)[0]

In [None]:
re.sub(r'^\d* ','',sample)

In [None]:
X_test.shape[0] - (X_test.address_encoded == 0).sum()

In [None]:
X_train.drop('address_encoded',axis=1,inplace=True)
X_test.drop('address_encoded',axis=1,inplace=True)

## City Name

In [None]:
(X_train.cityname.value_counts() == 1).sum()

In [None]:
city_dict

In [None]:
def encode_cityname(df):
    """Replace ``cityname`` with the value stored for it in ``city_dict``.

    Cities that are not keys of the module-level ``city_dict`` are encoded
    as 0. ``df`` is modified in place (``cityname_encoded`` is added,
    ``cityname`` is dropped) and is also returned.
    """
    def stored_count(city):
        return city_dict.get(city, 0)

    df['cityname_encoded'] = df['cityname'].apply(stored_count)
    df.drop(columns='cityname', inplace=True)
    return df

In [None]:
X_train = encode_cityname(X_train)
X_test = encode_cityname(X_test)

In [None]:
X_test.cityname_encoded.value_counts()

## State

In [None]:
(X_train.state.value_counts() == 1).sum()

In [None]:
state_unique = X_train.state.unique()

In [None]:
def encode_state(df, states=None):
    """One-hot encode the ``state`` column.

    For every value in ``states`` a column ``state_<value>`` is appended that
    is 1 where ``state`` equals that value and 0 elsewhere. The comparison is
    an exact, vectorised equality test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``state`` column. It is not modified.
    states : iterable, optional
        Values to encode. Defaults to the module-level ``state_unique``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` without ``state``, followed by the indicator
        columns.
    """
    if states is None:
        states = state_unique
    indicators = pd.DataFrame(
        {f'state_{s}': (df['state'] == s).astype('int64') for s in states},
        index=df.index,
    )
    # Build all indicator columns first and attach them in one concat.
    return pd.concat([df.drop('state', axis=1), indicators], axis=1)

In [None]:
X_train = encode_state(X_train)
X_test = encode_state(X_test)

In [None]:
X_train.shape

## Source

In [None]:
source_unique = X_train.source.unique()

In [None]:
def encode_source(df, sources=None):
    """One-hot encode the ``source`` column.

    For every value in ``sources`` a column ``source_<value>`` is appended
    that is 1 where ``source`` equals that value and 0 elsewhere. Exact
    equality is used, so a source whose name is contained in another
    source's name does not set both indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``source`` column. It is not modified.
    sources : iterable, optional
        Values to encode. Defaults to the module-level ``source_unique``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` without ``source``, followed by the indicator
        columns.
    """
    if sources is None:
        sources = source_unique
    indicators = pd.DataFrame(
        {f'source_{s}': (df['source'] == s).astype('int64') for s in sources},
        index=df.index,
    )
    # Build all indicator columns first and attach them in one concat.
    return pd.concat([df.drop('source', axis=1), indicators], axis=1)

In [None]:
X_train = encode_source(X_train)
X_test = encode_source(X_test)

In [None]:
X_train.shape,X_test.shape