# Data Science Regression Project: Predicting Apartment Prices in Germany


In [297]:
from os import listdir
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
import re

In [298]:
# finding json files in givin path 
def find_csv_filenames(path_to_dir, suffix=".csv" ):
    filenames = listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

In [299]:
def concatenate_path_csv_file():
    path_with_csv_files = []
    for csv_file in csv_files:
        path_with_csv_files.append(path+csv_file)
    return path_with_csv_files

## Data Load: Load germany apartment data, to see features


In [300]:
path = "./apartment_data/"
csv_files = find_csv_filenames(path)
df = pd.concat(map(pd.read_csv, concatenate_path_csv_file()), ignore_index=True)

In [301]:
df.head(5)

Unnamed: 0,sales_price,livable_surface,number_of_rooms,location
0,"335,000€",73m²,5,-Perl
1,Priceonrequest,112m²,6,EchternacherbrückTheexactadressisnotindicated
2,"128,000€",134m²,4,KölnerStraße-Burscheid
3,"95,000€",86m²,3,Goethestraße-Remscheid
4,"120,000€",66m²,3,Friedrich-Schiller-Straße-Ummendorf


In [302]:
# we have 831 apartmens and 4 features
df.shape

(831, 4)

In [303]:
# sales_price is the dependent feature, the rest are independet. We try to predict the sales_price
df.columns

Index(['sales_price', 'livable_surface', 'number_of_rooms', 'location'], dtype='object')

In [304]:
# unique method is used to get an overview of unique elements in specific index.
df['location'].unique()

array(['-Perl', 'EchternacherbrückTheexactadressisnotindicated',
       'KölnerStraße-Burscheid', 'Goethestraße-Remscheid',
       'Friedrich-Schiller-Straße-Ummendorf',
       'UnterdemKönigsberg-BadPyrmont', 'ImTonloch-Ahnsen',
       'Krokusweg-Dortmund', 'AmWald-Stolberg', 'Bertramstraße-Köln',
       'Hauptstraße-Meckenbeuren', '-Trier', 'PlackenEllern-Belm',
       'Usingerstraße-Köln', 'JülicherStraße-Eschweiler',
       'Taunusstraße-Kerpen', 'Hansengasse-Geislingen',
       'ObererWeg-Tübingen', 'Hutbergstraße-Nürnberg',
       '-Echternacherbrück', 'WeißenburgerStraße-Ludwigshafen',
       'Yorckstr.-Ludwigshafen', 'Immenweg-Berlin',
       'Jugendheimstraße-Dormagen', 'Harkortstraße-Castrop-Rauxel',
       'Bahnhofstraße-Castrop-Rauxel', 'Siemensstraße-Bamberg',
       'BitburgerStrasse-Echternacherbrück', 'Wilhelmstraße-Nürnberg',
       'Kazmairstraße-München', 'SchmidenerStraße-Stuttgart',
       'Hauptstraße-Hofgeismar', 'Saturnstraße-Bielefeld',
       'Zugspitzstraße-L

In [305]:
# removing streets name from location
df['location'] = df['location'].apply(lambda x: x.split('-')[-1])

In [306]:
df.head()

Unnamed: 0,sales_price,livable_surface,number_of_rooms,location
0,"335,000€",73m²,5,Perl
1,Priceonrequest,112m²,6,EchternacherbrückTheexactadressisnotindicated
2,"128,000€",134m²,4,Burscheid
3,"95,000€",86m²,3,Remscheid
4,"120,000€",66m²,3,Ummendorf


In [307]:
# first location is the most frequently-occurring location. 
df['location'].value_counts()

Trier             51
Duisburg          38
Berlin            30
Saarbrücken       22
Mettlach          21
                  ..
Auerbach           1
Haßfurt            1
SaalfelderHöhe     1
Thalheim           1
Eppelborn          1
Name: location, Length: 295, dtype: int64

## Drop features that are not required to build our model

In [308]:
# we will work also with location to show how to handle categorical data
df_draft = df.drop(['location'],axis='columns')
df_draft.shape

(831, 3)

## Handle missing data if its NaN. 

In [309]:
'''
We have no NaN values but something like: sales_price: Priceonrequest
'''
df = df.dropna()
df.isnull().sum()

sales_price        0
livable_surface    0
number_of_rooms    0
location           0
dtype: int64

In [310]:
index_sales_price = df.columns.get_loc('sales_price')
price_pattern = r'([0-9]*(,|.)?[0-9]*(,|.)?[0-9]*\€)'

for row in range(0, len(df)-1):
    try:
        price = re.search(price_pattern, df.iat[row, index_sales_price]).group()
    except Exception as e:
        # drops sales_prices values which does not contains numerical value e.g. Priceonrequest
        df = df.drop(df.index[[row]])
    price = 'None'

In [311]:
df.head()

Unnamed: 0,sales_price,livable_surface,number_of_rooms,location
0,"335,000€",73m²,5,Perl
2,"128,000€",134m²,4,Burscheid
3,"95,000€",86m²,3,Remscheid
4,"120,000€",66m²,3,Ummendorf
5,"68,000€",59m²,2,BadPyrmont


In [312]:
df.head()

Unnamed: 0,sales_price,livable_surface,number_of_rooms,location
0,"335,000€",73m²,5,Perl
2,"128,000€",134m²,4,Burscheid
3,"95,000€",86m²,3,Remscheid
4,"120,000€",66m²,3,Ummendorf
5,"68,000€",59m²,2,BadPyrmont


In [313]:
# remove € sign and ',' from entire sales_prices...
df['sales_price'] = df['sales_price'].replace({r'\€':''}, regex = True)
df['sales_price'] = df['sales_price'].replace({r',':'.'}, regex = True)
df['livable_surface'] = df['livable_surface'].replace({r'm\u00b2':''}, regex = True)
df.rename(columns={'sales_price': 'sales_price', 'livable_surface': 'livable_surface in m\u00b2', 'number_of_rooms': 'number_of_rooms', 'location':'location'}, inplace=True)

In [295]:
df.head()

Unnamed: 0,sales_price,livable_surface in m²,number_of_rooms,location
0,335.0,73,5,Perl
2,128.0,134,4,Burscheid
3,95.0,86,3,Remscheid
4,120.0,66,3,Ummendorf
5,68.0,59,2,BadPyrmont
