In [56]:
import pandas as pd
import geopandas
import geopandas as gpd
import shapely.wkt
from sqlalchemy import create_engine
from shapely.geometry import shape
from scipy.stats import pearsonr
import geopandas as gpd
import numpy as np
import matplotlib
import matplotlib.pyplot as pltimport
import spacy
import en_core_web_sm
import contextualSpellCheck
from pyvis.network import Network
from IPython.core.display import display, HTML

# nlp = en_core_web_sm.load()
nlp = spacy.load('en_core_web_lg')

db_connection_string = 'postgresql://postgres:postgres@localhost/path_safety'
engine = create_engine(db_connection_string)

# Get Text from DB

In [57]:
table = 'frankfurt.location_rates_users'
sql = "SELECT * FROM {table_name}".format(table_name=table)
gdf = gpd.GeoDataFrame.from_postgis(sql, engine,geom_col='geometry')

reasons = gdf.copy()
# reasons.dropna(subset=['safety_reason'], inplace=True)
reasons.dropna(subset=['attractiveness_reason'], inplace=True)

reasons = reasons[reasons.attractiveness_avg>2.5]
text = ' '.join(reasons['attractiveness_reason'])

# Form Text into sentences per participant

In [58]:
text_edit = text.replace(';','.')
text_edit =text_edit.replace('|','\n')
doc = nlp(text_edit)
# doc

# Manual steps
####  Store doc as a txt file (one for each safe, unsafe, attr, unattr)
####  Correct mispellings
#### Result 4 txt files 

# Create word-head graph and identify nodes with highest betweeness

In [127]:

# # create subgraph with all the neighbors of nodes in "nodes"
# def check_neighbor_weights(g,nodes):
#   subg=nx.Graph() #Create subgraph
#   for n in nodes:
#     subg.add_node(n)
#     neighbors=g.neighbors(n) #Find all neighbors of node n
#     for neighs in neighbors:
#       # if g[n][neighs]['count']<3: #Check if the weigh t is below 50
#       subg.add_edge(n,neighs,count=g[n][neighs]['count'])
#   return subg

# for each text file
for file in ['text_safe','text_unsafe','text_attr','text_unattr']:
  with open('text_data/' + file +".txt") as f:
      text_edit = f.read()

  # additional adaptations
  text_edit = text_edit.replace('road', 'street')
  text_edit = text_edit.replace('look', '')
  text_edit = text_edit.replace(' s ', '')
  text_edit = text_edit.lower()

  # create combos of word-head
  doc = nlp(text_edit)
  text_list = []
  head_list = []
  for token in doc:
      if token.is_alpha:
          if not token.is_stop:
              text_list.append(token.lemma_)
              head_list.append(token.head.lemma_.lower())
  df = pd.DataFrame(list(zip(text_list, head_list)), columns =['text', 'head'])
  combos = df.groupby(['text','head']).size().reset_index().rename(columns={0:'count'}).sort_values('count', ascending=False)
  # remove self-loops
  combos = combos.query("text != head")
  # inverse weighting so more counts mean smaller edge weight
  combos['count'] = 1/combos['count']

  # make graph
  import networkx as nx
  G = nx.from_pandas_edgelist(combos, source='text', target='head', edge_attr='count')
  G.remove_edges_from(nx.selfloop_edges(G))
  # manual adaptations of nodes without meaning
  G.remove_node('of')
  G.remove_node('be')
  G.remove_node('for')
  G.remove_node('in')
  G.remove_node('lot')
  G.remove_node('to')
  G.remove_node('on')
  G.remove_node('s')


  # calculate centrality
  degCent = nx.degree_centrality(G)
  #Descending order sorting centrality
  degCent_sorted=dict(sorted(degCent.items(), key=lambda item: item[1],reverse=True))
  #Computing betweeness
  # betCent = nx.betweenness_centrality(G, weight='count', normalized=True, endpoints=True)
  #Descending order sorting betweeness
  # betCent_sorted=dict(sorted(betCent.items(), key=lambda item: item[1],reverse=True))
  N_top = 10
  # keys_deg_top=list(degCent_sorted)[0:N_top]
  keys_bet_top=list(degCent_sorted)[0:N_top]
  # top = list(set(keys_bet_top) & set(keys_deg_top))

  top = keys_bet_top
  #manual adaptation to only keep street elements
  if 'area' in top:
    top.remove('area')
  if 'street' in top:
    top.remove('street')
  if 'place' in top:
    top.remove('place')

  print(file, top)

  # for node in top[0:3]:
  for node in ['building']:
    subg=nx.Graph()
    #Find all neighbors of node
    neighbors=G.neighbors(node) 
    edges = []
    for neighs in neighbors:
        edges.append((neighs,G[node][neighs]['count']))
    edges_sorted = sorted(edges, key=lambda tup: tup[1])
    for i in edges_sorted[0:10]:
        subg.add_edge(node,i[0],count=i[1])

    net = Network(height='1000px', width='90%')
    
    net.from_nx(subg)
    net.save_graph('../img/word_graphs/' + file + '_' + node + '_subg.html')

      # net = Network(notebook=True)
      # subg = check_neighbor_weights(G, [node])
      # net.from_nx(subg)
      # net.show('word_graph' + file + '_' + node + '_subg.html')

text_safe ['traffic', 'building', 'car', 'people', 'residential', 'space', 'safe', 'open']
text_unsafe ['car', 'construction', 'traffic', 'work', 'building', 'graffiti', 'site']
text_attr ['building', 'tree', 'shop', 'house', 'architecture', 'clean', 'space']
text_unattr ['building', 'tree', 'graffiti', 'ugly', 'construction', 'work', 'house']


In [132]:
safe = ['traffic', 'building', 'car', 'people', 'residential']
unsafe = ['car', 'construction', 'traffic', 'work',  'building']
attr = ['building', 'tree', 'shop', 'house', 'architecture']
unattr = ['building', 'tree', 'graffiti', 'ugly', 'construction']

print(set(safe) & set(unsafe))
print(set(attr) & set(unattr))
print(set(safe) & set(unsafe)& set(attr) & set(unattr))

{'car', 'traffic', 'building'}
{'tree', 'building'}
{'building'}


# Wordclouds

In [118]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

# Define a function to plot word cloud
def plot_cloud(wordcloud):
    # Set figure size
    plt.figure(figsize=(40, 50))
#     plt.figure(figsize=(24,14), dpi=1200)
    # Display image
    plt.imshow(wordcloud) 
    # No axis details
    plt.axis("off");
# Import package

mask = np.array(Image.open('../img/black_Circle.jpeg'))

# reasons = gdf.copy()
# reasons.dropna(subset=['attractiveness_reason'], inplace=True)
# reasons = reasons[reasons.attractiveness_avg>3.5]
# text = ' '.join(reasons['attractiveness_reason'])
for file in ['text_safe','text_unsafe','text_attr','text_unattr']:
    with open('text_data/' + file +".txt") as f:
        text_edit= f.read()

    text_edit = text_edit.lower()
    # additional adaptations
    text_edit = text_edit.replace('road', 'street')
    text_edit = text_edit.replace(' look ', '')
    # text_edit = text_edit.replace("'s", '')
    # text_edit = text_edit.replace(' lots ', '')
    # text_edit = text_edit.replace(' lot ', '')
    text_edit = text_edit.replace(' ing ', '')
    text_edit = text_edit.replace(' s ', '')

    # Generate word cloud
    # could remove stopwords as well
    wordcloud = WordCloud(width=1000, height=600, 
                        random_state=42, background_color='white', max_words=30,
                        font_path='../lemon_milk/LEMONMILK-Regular.otf',
                        mask=mask,
                        colormap='Dark2', collocations=False,
                        ).generate(text_edit)


    # Plot
    wordcloud.to_file('../img/wordclouds/wc_'+ file +'.png' )

    # plot_cloud(wordcloud)

False
False
False
False


In [124]:
doc

dislike flat buildings .run down
the street is ing dirty
sad area, cube houses are not comfortable to me
foliage is nice but housing standards are low
big buildings. gray. similar
it isn't very chic lots of maintenance work
beautiful houses
ugly scaffolding but otherwise aesthetically pleasing
construction
the works are less pretty
under reconstruction. view blocked by scaffold
very unattractive parts  baron not very attractive with all the shops
no greenery
ugly.dull
a little run down. not extremely well ed after
so tight street, bad parking plan
streamlined buildings. some variety. good upkeep. 
inside the ally high buildings. constructions. noisy
very unkempt and neglected ing part of the street
next to modern building
maybe the shop of klimatisier give me less attraction. 
no greeness greenery, nice buildings
bins. bollards
not a nice place weird
bland
architecture is too basic and not homogenous
it is ugly there is graffiti too
graffiti. boring buildings. trees.
there's not people

In [126]:
for token in doc:
    if token.is_alpha:
        if not token.is_stop:
            print(token.lemma_, token.head.lemma_.lower())
            text_list.append(token.lemma_)

flat building
building ﻿dislike
street be
e be
dirty area
sad area
area be
cube house
house be
comfortable be
foliage to
nice be
housing standard
standard be
low building
big building
building be
gray gray
similar be
chic lot
lot be
maintenance work
work house
beautiful house
house scaffolding
ugly scaffolding
scaffolding of
aesthetically pleasing
pleasing be
construction pleasing
work be
pretty under
reconstruction under
view attractive
block view
scaffold part
unattractive part
part baron
baron by
attractive attractive
shop with
greenery shop
little run
run attractive
extremely well
ed plan
tight street
street after
bad plan
parking plan
plan plan
streamlined building
building plan
variety variety
good upkeep
upkeep upkeep
inside inside
ally building
high building
building inside
construction construction
noisy unkempt
unkempt unkempt
neglect unkempt
ing part
street of
modern building
building to
maybe give
shop give
klimatisier of
attraction give
greeness greenery
greenery greenery
