In [1]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from logistic_classification import LogisticClassification
import numpy as np
from collections import Counter
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("immoscout_cleaned_lat_lon_fixed_v7.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13378 entries, 0 to 13377
Columns: 105 entries, Unnamed: 0 to type
dtypes: float64(51), int64(3), object(51)
memory usage: 10.7+ MB


  df = pd.read_csv("immoscout_cleaned_lat_lon_fixed_v7.csv")


In [3]:
len(df["type"].unique())

21

In [4]:
df["type"].unique()

array(['penthouse', 'terrace-house', 'detached-house', 'flat',
       'stepped-house', 'farmhouse', 'semi-detached-house',
       'stepped-apartment', 'duplex-maisonette', 'attic-flat', 'loft',
       'chalet', 'villa', 'attic-room', 'secondary-suite', 'castle',
       'detached-secondary-suite', 'studio',
       'furnished-residential-property', 'rustico', 'single-room'],
      dtype=object)

In [5]:
# array to dict with python built in
types_nrs = dict(enumerate(df["type"].unique()))
types_name = {v: k for k, v in types_nrs.items()}
types_name

{'penthouse': 0,
 'terrace-house': 1,
 'detached-house': 2,
 'flat': 3,
 'stepped-house': 4,
 'farmhouse': 5,
 'semi-detached-house': 6,
 'stepped-apartment': 7,
 'duplex-maisonette': 8,
 'attic-flat': 9,
 'loft': 10,
 'chalet': 11,
 'villa': 12,
 'attic-room': 13,
 'secondary-suite': 14,
 'castle': 15,
 'detached-secondary-suite': 16,
 'studio': 17,
 'furnished-residential-property': 18,
 'rustico': 19,
 'single-room': 20}

In [6]:
# rename column type with type_dict
df["type"] = df["type"].map(types_name)

In [7]:
# keep only numeric columns
df = df.select_dtypes(include=np.number)
df.head()

Unnamed: 0.1,Unnamed: 0,lat,lon,index,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_population,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,type
0,0,47.4171,8.0856,16620,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,1545.0,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,0
1,1,47.4195,8.0827,16620,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,1545.0,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,1
2,2,47.4033,8.033,17812,0.163362,0.095877,0.001911,47.397416,8.04315,0.0,...,21036.0,10149.0,3.54901,6.05,37.0,3092.0,30364.0,33493.0,720000.0,0
3,3,47.415643,8.085423,16620,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,1545.0,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,2
4,4,47.403824,8.048288,12716,0.333865,0.279276,0.145835,47.40487,8.052781,0.0,...,6081.0,2638.0,1.708126,6.3,65.0,349.0,941.0,1355.0,995000.0,3


# DROP NAN

In [8]:
# drop rows with nan
df = df.dropna()

In [9]:
# make Matrix y as target with column type
y = df["type"].values
# make Matrix X as features with all columns except type
X = df.drop(["type"], axis=1).values

In [10]:
X.shape

(3516, 54)

In [11]:
y.shape

(3516,)

In [12]:
X = (X - X.mean(axis = 0)) / X.std(axis = 0) # standardization of features

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# print all shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2109, 54) (1407, 54) (2109,) (1407,)


In [14]:
# create 20 models in one line
models = [LogisticClassification(lr=0.0001, n_iters=5000) for _ in range(20)]
models_pred = dict.fromkeys(range(0, 21), 0)
accuracy_dict = dict.fromkeys(range(0, 21), 0)
print(accuracy_dict)

{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0}


In [15]:
X_train.shape, y_train.shape

((2109, 54), (2109,))

In [16]:
# fit one model
models[0].fit(X_train, np.where(y_train == 0, 1, 0))
# predict one model
y_pred = models[0].predict(X_test)
# calculate accuracy
accuracy = accuracy_score(np.where(y_test == 0, 1, 0), y_pred)
# count values in y_pred
print(Counter(y_pred), Counter(np.where(y_test == 0, 1, 0)))
print(accuracy)

Counter({0: 1356, 1: 51}) Counter({0: 1352, 1: 55})
0.9260838663823738


In [17]:
# fit the models
for idx, model in enumerate(models):
    model.fit(X_train, np.where(y_train == idx, 1, 0))

In [18]:
# predict the models
for idx, model in enumerate(models):
    models_pred[idx] = model.predict(X_test)

In [19]:
# get the accuracy
for idx in range(len(models)):
    accuracy_dict[idx] = accuracy_score(np.where(y_test == idx, 1, 0), models_pred[idx])
    print("Accuracy of model {0:<30s} is {1:10.2f}".format(types_nrs[idx], accuracy_dict[idx]))

Accuracy of model penthouse                      is       0.93
Accuracy of model terrace-house                  is       0.91
Accuracy of model detached-house                 is       0.75
Accuracy of model flat                           is       0.71
Accuracy of model stepped-house                  is       0.98
Accuracy of model farmhouse                      is       0.99
Accuracy of model semi-detached-house            is       0.93
Accuracy of model stepped-apartment              is       0.98
Accuracy of model duplex-maisonette              is       0.93
Accuracy of model attic-flat                     is       0.93
Accuracy of model loft                           is       0.99
Accuracy of model chalet                         is       0.95
Accuracy of model villa                          is       0.91
Accuracy of model attic-room                     is       1.00
Accuracy of model secondary-suite                is       1.00
Accuracy of model castle                         is    

In [20]:
# get first row of df without column type
from pprint import pprint
x_feat = df.iloc[0, :-1].values.reshape(1, -1)
print(x_feat.shape, y_test.shape)
pred_test = dict.fromkeys(range(0, 21), 0)
accuracy_test = dict.fromkeys(range(0, 21), 0)
for idx, model in enumerate(models):
    pred_test[idx] = model.predict_proba(x_feat)
pprint(pred_test)


(1, 54) (1407,)
{0: array([-32953.50164288]),
 1: array([-40974.93578823]),
 2: array([817690.97224673]),
 3: array([-1207820.17489207]),
 4: array([-14131.89195639]),
 5: array([-12888.94958588]),
 6: array([-24139.44569301]),
 7: array([-15655.4145675]),
 8: array([-32668.25148255]),
 9: array([-24056.80137322]),
 10: array([-19576.83436239]),
 11: array([-4430.85060309]),
 12: array([53352.73913472]),
 13: array([-18777.22959075]),
 14: array([-16336.37408679]),
 15: array([-15846.14066192]),
 16: array([-16336.37408679]),
 17: array([-21293.97335184]),
 18: array([-16336.37408679]),
 19: array([-16792.28887771]),
 20: 0}


In [21]:
def sigmoid(x, w, b):
    return 1 / (1 + np.exp(-(np.dot(x, w) + b)))

In [22]:
# calc sigmoid for first model
sigmoid(x_feat, models[0].weights, models[0].bias)

  return 1 / (1 + np.exp(-(np.dot(x, w) + b)))


array([0.])

In [23]:
# print first weights and bias
print(models[0].weights, models[0].bias)

[-0.08288764 -0.22718178 -0.31726389  0.0083078   0.07459457 -0.02478321
 -0.00254197  0.27503665  0.0744312   0.13024455 -0.12022889  0.01695864
  0.02782622 -0.08486151  0.09317863  0.13081436 -0.11776112  0.00992385
  0.10672941 -0.07067804 -0.01142572  0.0846718  -0.12811986  0.11634054
  0.47441586  0.06112201  0.05948219 -0.01800001  0.02785342 -0.04603688
  0.03988076  0.01143221 -0.02889388  0.01901128  0.39904291  0.26060442
  0.07895971  0.76656445  0.07053983  0.22855829  0.10968101  0.12773631
  0.34310883  0.576203    0.17317234 -0.38154349  0.3876961   0.01977001
  0.18975303  0.02026564  0.06765521 -0.04349697 -0.03401115 -0.06216459] -0.4046948832705351
