# Apartement Rent Data
Dataset source: https://www.kaggle.com/datasets/shashanks1202/apartment-rent-data/data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Load data

In [7]:
df = pd.read_csv('../../Datasets/Apartement Rent Data/apartments_for_rent_classified_100K/apartments_for_rent_classified_100K.csv',
                 sep=';',
                 encoding='cp1252',
                low_memory=False)

# Data Preprocessing

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99492 entries, 0 to 99491
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             99492 non-null  int64  
 1   category       99492 non-null  object 
 2   title          99492 non-null  object 
 3   body           99492 non-null  object 
 4   amenities      83448 non-null  object 
 5   bathrooms      99429 non-null  float64
 6   bedrooms       99368 non-null  float64
 7   currency       99492 non-null  object 
 8   fee            99492 non-null  object 
 9   has_photo      99492 non-null  object 
 10  pets_allowed   39068 non-null  object 
 11  price          99491 non-null  float64
 12  price_display  99491 non-null  object 
 13  price_type     99492 non-null  object 
 14  square_feet    99492 non-null  int64  
 15  address        7943 non-null   object 
 16  cityname       99190 non-null  object 
 17  state          99190 non-null  object 
 18  latitu

In [13]:
df.head(5)

Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,price_display,price_type,square_feet,address,cityname,state,latitude,longitude,source,time
0,5668640009,housing/rent/apartment,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",,1.0,1.0,USD,No,Thumbnail,...,"$2,195",Monthly,542,507 509 Esplanade,Redondo Beach,CA,33.852,-118.3759,RentLingo,1577360355
1,5668639818,housing/rent/apartment,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",,1.5,3.0,USD,No,Thumbnail,...,"$1,250",Monthly,1500,146 Lochview Dr,Newport News,VA,37.0867,-76.4941,RentLingo,1577360340
2,5668639686,housing/rent/apartment,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,,2.0,3.0,USD,No,Thumbnail,...,"$1,395",Monthly,1650,3101 Morningside Dr,Raleigh,NC,35.823,-78.6438,RentLingo,1577360332
3,5668639659,housing/rent/apartment,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",,1.0,2.0,USD,No,Thumbnail,...,"$1,600",Monthly,820,209 Aegean Way,Vacaville,CA,38.3622,-121.9712,RentLingo,1577360330
4,5668639374,housing/rent/apartment,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",,1.0,1.0,USD,No,Thumbnail,...,$975,Monthly,624,4805 Marquette NE,Albuquerque,NM,35.1038,-106.611,RentLingo,1577360308


## Category

In [18]:
unique_categories = df.category.unique()

In [19]:
all_categories = []
for categories in unique_categories:
    categories = categories.split('/')
    for c in categories:
        all_categories.append(c)

In [20]:
unique_categories = list(set(all_categories))

In [21]:
unique_categories

['commercial',
 'home',
 'rent',
 'apartment',
 'other',
 'housing',
 'retail',
 'condo',
 'short_term']

In [22]:
for i in unique_categories:
    df[f'cat_{i}'] = df.category.apply(lambda x : 1 if i in x else 0)

In [29]:
df.drop('category',axis=1,inplace=True)

## Amenities

In [46]:
df.amenities.fillna('Missing',inplace=True)

In [47]:
unique_amenities = df.amenities.unique()

In [48]:
unique_amenities

array(['Missing', 'Fireplace,Gym,Parking,Pool,Storage,Wood Floors',
       'Gated,Pool', ..., 'Dishwasher,Doorman',
       'AC,Elevator,Parking,Patio/Deck,Pool,Washer Dryer',
       'Dishwasher,Doorman,Patio/Deck,Refrigerator,Storage,Washer Dryer,Wood Floors'],
      dtype=object)

In [49]:
all_amenities = []
for amenities in unique_amenities:
    amenities = amenities.split(',')
    for a in amenities:
        all_amenities.append(a)

In [50]:
unique_amenities = list(set(all_amenities))

In [51]:
unique_amenities

['Pool',
 'Patio/Deck',
 'Missing',
 'Alarm',
 'View',
 'Gated',
 'Washer Dryer',
 'Luxury',
 'Doorman',
 'Internet Access',
 'Golf',
 'Basketball',
 'Dishwasher',
 'Gym',
 'Tennis',
 'Elevator',
 'Hot Tub',
 'Wood Floors',
 'Fireplace',
 'Garbage Disposal',
 'AC',
 'Cable or Satellite',
 'Clubhouse',
 'Playground',
 'Storage',
 'Parking',
 'TV',
 'Refrigerator']

In [52]:
for i in unique_amenities:
    df[f'ame_{i}'] = df.amenities.apply(lambda x : 1 if i in x else 0)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99492 entries, 0 to 99491
Data columns (total 58 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      99492 non-null  int64  
 1   title                   99492 non-null  object 
 2   body                    99492 non-null  object 
 3   amenities               99492 non-null  object 
 4   bathrooms               99429 non-null  float64
 5   bedrooms                99368 non-null  float64
 6   currency                99492 non-null  object 
 7   fee                     99492 non-null  object 
 8   has_photo               99492 non-null  object 
 9   pets_allowed            39068 non-null  object 
 10  price                   99491 non-null  float64
 11  price_display           99491 non-null  object 
 12  price_type              99492 non-null  object 
 13  square_feet             99492 non-null  int64  
 14  address                 7943 non-null 

In [62]:
df.drop('amenities',axis=1,inplace=True)