In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import nltk
import math
from sklearn import linear_model, metrics
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
%matplotlib inline

In [2]:
# Part 1

hackernews_items = pd.read_csv('hn_items.csv',delimiter=',',encoding='latin-1')

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bepis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
model = SentimentIntensityAnalyzer()

hn_text = hackernews_items.dropna(subset=['text'], how='all')['text'].values

df = pd.DataFrame(columns=['text','neg','pos'])

data = []

for text in hn_text:
    score = model.polarity_scores(text)
    
    data.append({'text':text,'neg':score['neg'],'pos':score['pos']})

df = df.append(data)

df.head(5)

Unnamed: 0,text,neg,pos
0,&#34;the rising star of venture capital&#34; -...,0.0,0.0
1,Is there anywhere to eat on Sandhill Road?,0.0,0.0
2,It's kind of funny that Sevin Rosen is giving ...,0.0,0.218
3,"This is interesting, but the limitations becom...",0.0,0.149
4,Stay tuned...,0.0,0.0


In [4]:
print('5 most positive: ')
print(df.nlargest(5,'pos'))

print('5 most negative: ')
print(df.nlargest(5,'neg'))

5 most positive: 
                text  neg  pos
605             sure  0.0  1.0
999              ha!  0.0  1.0
1307      Beautiful.  0.0  1.0
1628  Great, thanks!  0.0  1.0
1902            True  0.0  1.0
5 most negative: 
             text  neg  pos
512          dupe  1.0  0.0
1797         spam  1.0  0.0
2962          No.  1.0  0.0
3008       dupe.   1.0  0.0
3014  desperation  1.0  0.0


In [5]:
# Part 2

negatives = df['neg']
positives = df['pos']
posts = hackernews_items.dropna(subset=['text'], how='all')['text']

In [6]:
from sklearn.cluster import KMeans
    
folds = KFold(n_splits=10)
kmeans = KMeans(n_clusters=10)

X = negatives.values
Y = positives.values
XY = np.stack((X,Y),axis=1)

kmeans.fit(XY)
    
clusters = kmeans.cluster_centers_
    
pred = kmeans.predict(clusters)

kmeans.score(XY)

-17.487684198291639

In [7]:
# Part 3
import folium
from folium.plugins import HeatMap

boliga = pd.read_csv('boliga_zealand.csv').drop(['Index', '_1', 'Unnamed: 0'], axis=1)
zip_df = pd.DataFrame(boliga['zip_code'].str.split(' ',1).tolist(), columns = ['zip','city'])
boliga = boliga.assign(zip_int=zip_df['zip'])
boliga = boliga[boliga['zip_int'].astype(int) <= 2999]
heatmap_df = boliga[['lon','lat','price']].dropna()

In [8]:
boliga_map = folium.Map(location=[55.676098, 12.568337], zoom_start=11)

folium.Marker(location=[55.676098, 12.568337], icon=folium.Icon(color='red',icon='home')).add_to(boliga_map)
heat_data = [(e.lat,e.lon,float(e.price)) for e in heatmap_df.itertuples()]

In [9]:
HeatMap(heat_data, radius=7).add_to(boliga_map)

<folium.plugins.heat_map.HeatMap at 0x114e2c90>

In [10]:
# Jupyter seems unable to properly load the Folium file, so it's saved as a separate file.
boliga_map.save('heatmap.html')

# If price and proximity are the only factors, then western Copenhagen seems to have a significant clustering of low-price housing.
# Particularly, Brønshøj, Husum, Rødovre and Hvidovre have fairly low prices, along with housing along the eastern side of Kalvebod Fælled.
# Northwestern Copenhagen is a generally low-income area, and it seems there is a clear divide in pricing in western copenhagen when moving past Frederiksberg and Valby.
# So while pricing in these areas is low, desirability is also on the lower end. As such, somewhere in Amager south of Kastrup might be more desirable.

In [22]:
# Part 4

haversine_df = boliga[['lon','lat','price','size_in_sq_m']].dropna()

import math
def haversine_distance(origin, destination):

    lat_orig, lon_orig = origin
    lat_dest, lon_dest = destination
    radius = 6371

    dlat = math.radians(lat_dest-lat_orig)
    dlon = math.radians(lon_dest-lon_orig)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) + math.cos(math.radians(lat_orig)) 
        * math.cos(math.radians(lat_dest)) * math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c

    return d

In [23]:
cph_center = (55.676098, 12.568337)

haversine_df = haversine_df.assign(km_to_cph=haversine_df.apply(lambda row: haversine_distance((row['lat'],row['lon']),cph_center),axis=1))
haversine_df[:5]

Unnamed: 0,lon,lat,price,size_in_sq_m,km_to_cph
0,12.593629,55.671769,4000000,91.0,1.657329
1,12.573689,55.676839,4895000,105.0,0.345533
2,12.590441,55.687079,250000,135.0,1.846924
3,12.591287,55.683439,7375000,98.0,1.65427
4,12.588744,55.687623,5825000,101.0,1.81079


In [35]:
regr = linear_model.LinearRegression()

X_variables = haversine_df['km_to_cph'].astype(float).values.reshape(-1,1)
Y_variables = haversine_df['price'].astype(int).values.reshape(-1,1)
Z_variables = haversine_df['size_in_sq_m'].astype(float).values.reshape(-1,1)

In [36]:
from math import sqrt

folds = KFold(n_splits=10)

metrics_df = pd.DataFrame(columns=['MAE','RMSE','Pearson','Coefficients','Intercept'])

metrics_list = []

for train_idx, test_idx in folds.split(X_variables, Y_variables, Z_variables):
    XZ_variables = np.stack([X_variables,Z_variables], axis=1).reshape(-1,2)
    XZ_train, XZ_test = XZ_variables[train_idx], XZ_variables[test_idx]
    Y_train, Y_test = Y_variables[train_idx], Y_variables[test_idx]
    
    regr = linear_model.LinearRegression()
    
    regr.fit(XZ_train, Y_train)
    
    pred = regr.predict(XZ_test)
    
    MAE = str(metrics.mean_absolute_error(Y_test,pred))
    RMSE = str(sqrt(metrics.mean_squared_error(Y_test,pred)))
    Pearson = str(metrics.r2_score(Y_test, pred))
    coef = str(regr.coef_)
    intercept = str(regr.intercept_)
    
    metrics_list.append({'MAE':MAE,'RMSE':RMSE,'Pearson':Pearson,'Coefficients':coef,'Intercept':intercept})

metrics_df = metrics_df.append(metrics_list)
    
metrics_df

Unnamed: 0,MAE,RMSE,Pearson,Coefficients,Intercept
0,1174037.38883,2144552.476786836,0.211113686177,[[-50144.1468458 21173.90529618]],[ 498250.87960243]
1,1100120.04932,1918685.848375606,0.237796142476,[[-54502.22830107 21614.21966869]],[ 512827.04731026]
2,1036518.49321,1878553.5156161697,0.25252708392,[[-54640.54807139 21551.22354509]],[ 519284.99086517]
3,766123.214978,1293033.7661722663,0.328076739947,[[-55365.40297733 21340.49817169]],[ 559535.83784652]
4,769638.417343,1249136.3929791877,0.355233416161,[[-55421.10077292 21330.05818676]],[ 564876.98863496]
5,843071.068122,1579055.9428374353,0.0670818180413,[[-57500.9406537 21894.24749629]],[ 557734.33834066]
6,886109.701979,1388383.0180807484,-0.289723668799,[[-51126.71813158 22408.71421293]],[ 475862.06201267]
7,989510.690701,1973721.228571908,-0.0814055288237,[[-56006.40772878 22601.03956962]],[ 464041.60604401]
8,1149840.48351,1694459.9511825277,0.22881455553,[[-59155.94246881 21941.8961982 ]],[ 525654.91765968]
9,1611217.43888,3187611.1353421817,0.178544735339,[[-66764.29094814 19669.18416474]],[ 737235.23392789]


In [None]:
# As is plainly clear, the model can't help us untuit anything - in some cases it might actually be worse to follow the model than not.
# Size and proximity to the city center are not exactly fully representative - the price variance between several homes of equal size is too huge to be a reliable metric.
# Bispebjerg and Amager Øst are of approximately equal distance to Nørreport, where the former is notoriously low-income, while the latter is notoriously high-income.
# Also, prices have changed drastically over time, as gentrification of certain parts of the city has greatly increased prices of homes in recent years.
# We would need both a greater dimensionality, as well as data more representative of current prices to accurately intuit prices of homes in the city.