# Data preprocessing
### Input:

In [1]:
import os

# Connection settings are read from the environment so that credentials never
# live inside the notebook. Non-secret settings fall back to the previous
# local-development values.
host = os.environ.get('ROOMEK_DB_HOST', 'localhost')
database = os.environ.get('ROOMEK_DB_NAME', 'roomekbot$offers')
user = os.environ.get('ROOMEK_DB_USER', 'root')
# Deliberately no default secret: export ROOMEK_DB_PASSWORD before starting the kernel.
password = os.environ.get('ROOMEK_DB_PASSWORD', '')
# Whole table by default; append e.g. "where city = 'Wrocław' and business_type = 'buy'"
# to narrow the selection.
sql_query = "select * from offers"

### Import database

In [2]:
import mysql.connector
from mysql.connector import Error

# Start from a known state: a failed run must not silently reuse rows left over
# from an earlier execution of this cell, and the cleanup below must be able to
# tell whether the connection/cursor were ever created.
connection = None
cursor = None
records, titles = [], []

try:
    connection = mysql.connector.connect(host=host, database=database, user=user, password=password)
    cursor = connection.cursor()
    cursor.execute(sql_query)
    records = cursor.fetchall()
    # Take the column names from the result set itself: they always match the
    # columns returned by sql_query, and sql_query is left untouched so the cell
    # can be re-run safely.
    titles = [column[0] for column in cursor.description]
except Error as e:
    print("Error reading data from MySQL table", e)
finally:
    # Close the cursor before its connection; either may still be None when
    # connect() itself failed.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()

### Create dataframe

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build the working frame: one row per fetched record, one column per title.
df = pd.DataFrame(records, columns=titles)
df.head(5)

Unnamed: 0,offer_url,city,housing_type,business_type,offer_name,offer_thumbnail_url,price,total_price,street,district,...,terrace,balcony,separate_kitchen,basement,virtual_walk,two_level_apartment,connecting_room,pet_friendly,creation_time,modification_time
0,https://www.olx.pl/oferta/-CID3-IDC1jJs.html,Bydgoszcz,room,rent,Комната,https://apollo-ireland.akamaized.net:443/v1/fi...,450,450.0,,,...,,,,,,,,,2019-10-31 15:22:41,
1,https://www.olx.pl/oferta/-CID3-IDCcc96.html,Lublin,room,rent,Комфортный хостел,https://apollo-ireland.akamaized.net:443/v1/fi...,40,40.0,,,...,,,,,,,,,2019-10-31 15:22:02,
2,https://www.olx.pl/oferta/0-prowizji-pokoj-nr-...,Szczecin,room,rent,0% Prowizji - Pokój Nr 4 w komfortowym Mikro-A...,https://apollo-ireland.akamaized.net:443/v1/fi...,650,650.0,,Centrum,...,,,,,,,,,2019-10-31 15:16:31,
3,https://www.olx.pl/oferta/0nowoczesna-kawalerk...,Lodz,apartment,rent,"0Nowoczesna kawalerka na Bałutach, Młynarska 47",https://apollo-ireland.akamaized.net:443/v1/fi...,1000,10300.0,Mlynarska,Bałuty,...,,,,,,,,,2019-10-31 15:33:38,
4,https://www.olx.pl/oferta/1-2-osobowy-duzy-pok...,Gdansk,room,rent,1-2 osobowy duży pokój na wynajem Gdańsk-Wrzes...,https://apollo-ireland.akamaized.net:443/v1/fi...,510,510.0,,Wrzeszcz,...,,,,,,,,,2019-10-31 15:23:16,


### Statistics - show percentage of empty cells per column

In [6]:
# Share of missing values per column, as a percentage with one decimal place.
(
    df.isna().sum()
    .div(len(df))
    .mul(1000)
    .round()
    .div(10)
)

offer_url                      0.0
city                           0.0
housing_type                   0.0
business_type                  0.0
offer_name                     0.0
offer_thumbnail_url            4.6
price                          0.0
total_price                    0.6
street                        54.1
district                      23.9
date_of_the_offer              0.0
offer_id                       0.0
offer_text                     0.0
offer_from                    81.4
apartment_level               34.1
furniture                     15.4
type_of_building              45.8
area                          30.8
amount_of_rooms                8.9
additional_rent               57.7
price_per_m2                  71.8
type_of_market                71.8
security_deposit              89.0
building_material             87.8
windows                       86.1
heating                       83.4
building_year                 84.3
fit_out                       67.3
ready_from          

### Replace None and NaN with the mean value or 0

In [7]:
from sklearn.impute import SimpleImputer
# Mean imputer for NaN values; it is created here but not applied to the frame yet.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# To use it, fit and transform a chosen set of columns, e.g.:
# df[cols] = imputer.fit_transform(df[cols])
#
# A subset of columns could instead be zero-filled:
# df[cols] = df[cols].fillna(0)

# Current choice: every missing value in the frame becomes 0.
df.fillna(value=0, inplace=True)
df.head(5)

Unnamed: 0,offer_url,city,housing_type,business_type,offer_name,offer_thumbnail_url,price,total_price,street,district,...,terrace,balcony,separate_kitchen,basement,virtual_walk,two_level_apartment,connecting_room,pet_friendly,creation_time,modification_time
0,https://www.olx.pl/oferta/-CID3-IDC1jJs.html,Bydgoszcz,room,rent,Комната,https://apollo-ireland.akamaized.net:443/v1/fi...,450,450.0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0.0,2019-10-31 15:22:41,0
1,https://www.olx.pl/oferta/-CID3-IDCcc96.html,Lublin,room,rent,Комфортный хостел,https://apollo-ireland.akamaized.net:443/v1/fi...,40,40.0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0.0,2019-10-31 15:22:02,0
2,https://www.olx.pl/oferta/0-prowizji-pokoj-nr-...,Szczecin,room,rent,0% Prowizji - Pokój Nr 4 w komfortowym Mikro-A...,https://apollo-ireland.akamaized.net:443/v1/fi...,650,650.0,0,Centrum,...,0.0,0.0,0.0,0.0,0,0,0.0,0.0,2019-10-31 15:16:31,0
3,https://www.olx.pl/oferta/0nowoczesna-kawalerk...,Lodz,apartment,rent,"0Nowoczesna kawalerka na Bałutach, Młynarska 47",https://apollo-ireland.akamaized.net:443/v1/fi...,1000,10300.0,Mlynarska,Bałuty,...,0.0,0.0,0.0,0.0,0,0,0.0,0.0,2019-10-31 15:33:38,0
4,https://www.olx.pl/oferta/1-2-osobowy-duzy-pok...,Gdansk,room,rent,1-2 osobowy duży pokój na wynajem Gdańsk-Wrzes...,https://apollo-ireland.akamaized.net:443/v1/fi...,510,510.0,0,Wrzeszcz,...,0.0,0.0,0.0,0.0,0,0,0.0,0.0,2019-10-31 15:23:16,0


### Select only valuable data

In [13]:
# https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
# from sklearn.cross_validation import train_test_split

# Feature matrix (independent variables): every column except the ones listed
# below. An explicit df.loc[:, [...]] column list would be the alternative.
excluded_columns = [
    'price', 'total_price', 'parsed_fields', 'offer_from', 'offer_url', 'offer_name',
    'offer_thumbnail_url', 'offer_id', 'offer_text', 'creation_time', 'modification_time', 'street',
]
X = df.drop(columns=excluded_columns).values

# Target (dependent variable): the price column, kept as a single-column 2-D array.
y = df[['price']].values
X

array([['Bydgoszcz', 'room', 'rent', ..., 0, 0.0, 0.0],
       ['Lublin', 'room', 'rent', ..., 0, 0.0, 0.0],
       ['Szczecin', 'room', 'rent', ..., 0, 0.0, 0.0],
       ..., 
       ['Lublin', 'apartment', 'buy', ..., 0, 0.0, 0.0],
       ['Poznan', 'apartment', 'buy', ..., 0, 0.0, 0.0],
       ['Bydgoszcz', 'apartment', 'buy', ..., 0, 0.0, 0.0]], dtype=object)

### Encode labels for categorical data - text into numbers

In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Map each distinct value of the first feature column (city) to an integer code.
labelencoder_X = LabelEncoder()
city_codes = labelencoder_X.fit_transform(X[:, 0])
X[:, 0] = city_codes

# Only column 0 is encoded here; the other text columns are left unchanged.
# One-hot encoding of that column was tried but is not applied:
# enc = OneHotEncoder(categorical_features = [0])
# X = enc.fit_transform(X).toarray()
X

array([[1, 'room', 'rent', ..., 0, 0.0, 0.0],
       [6, 'room', 'rent', ..., 0, 0.0, 0.0],
       [8, 'room', 'rent', ..., 0, 0.0, 0.0],
       ..., 
       [6, 'apartment', 'buy', ..., 0, 0.0, 0.0],
       [7, 'apartment', 'buy', ..., 0, 0.0, 0.0],
       [1, 'apartment', 'buy', ..., 0, 0.0, 0.0]], dtype=object)

### Split dataset into training and test sets

In [38]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Feature Scaling

In [40]:
# from sklearn.preprocessing import StandardScaler

# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)