Skip to content

Commit

Permalink
Added a script which uses geonames to connect a user with a geoname e…
Browse files Browse the repository at this point in the history
…ntry in the identified_via_geonames table. Also added a second column in the new table for country_name, which is not present in the original dataset. The script is throttled to 1950 queries every 62 minutes. Also added database methods to support this functionality.
  • Loading branch information
Oscar-Rydh committed Apr 11, 2017
1 parent 3dae4b0 commit 2669117
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 51 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -10,3 +10,4 @@ twitter-geo.dump
allCountries.txt
twitter-with-geonames.dump
twitter-with-preprocessed.dump
twitter-before-geoname-api-identification.dump
8 changes: 8 additions & 0 deletions database/create_tables.sql
Expand Up @@ -110,3 +110,11 @@ CREATE TABLE IF NOT EXISTS filtered_user_locations(
FOREIGN KEY(user_id) REFERENCES users(user_id),
FOREIGN KEY(geonameid) REFERENCES geonames(geonameid)
);

-- Links a user to the GeoNames entry that the GeoNames search API matched
-- for that user's free-text location.  country_name is stored here because,
-- per the commit description, it is not present in the imported geonames
-- dataset.  NOTE(review): no PRIMARY KEY / NOT NULL constraints, so duplicate
-- (user_id, geonameid) rows are possible -- confirm that is intended.
CREATE TABLE IF NOT EXISTS identified_via_geonames (
    geonameid    INT,
    user_id      BIGINT,
    country_name VARCHAR (200),
    FOREIGN KEY (geonameid) REFERENCES geonames(geonameid),
    FOREIGN KEY (user_id)   REFERENCES users(user_id)
);
66 changes: 63 additions & 3 deletions database/database.py
Expand Up @@ -339,9 +339,7 @@ def select_user_locations(self):
statement = """
SELECT user_id, user_location
FROM users
WHERE user_location IS NOT NULL
AND user_id NOT IN (SELECT user_id from filtered_user_locations)
LIMIT 100;
WHERE user_location IS NOT NULL;
"""
cur.execute(statement)
self.conn.commit()
Expand All @@ -353,6 +351,23 @@ def select_user_locations(self):

return result_array


def get_all_users_with_location(self):
cur = self.conn.cursor()

statement = """
SELECT user_id, user_location
FROM users
WHERE user_location IS NOT NULL
"""

cur.execute(statement)
self.conn.commit()
result_tuple = cur.fetchall()
cur.close()

return result_tuple

def select_database_locations(self):
cur = self.conn.cursor()
statement = """
Expand Down Expand Up @@ -382,3 +397,48 @@ def set_filtered_location(self, user_id, geonameid, ratio):
cur.execute(statement, (user_id, geonameid, ratio))
self.conn.commit()
cur.close()


def insert_into_preprocessed(self, location, user_id, rest=None):
cur = self.conn.cursor()
if rest != None:
statement = """
UPDATE users
SET preprocessed_location = %s,
preprocessed_rest = %s
WHERE user_id = %s
"""
cur.execute(statement, (location, rest, user_id))
else:
statement = """
UPDATE users
SET preprocessed_location = %s,
preprocessed_rest = DEFAULT
WHERE user_id = %s
"""
cur.execute(statement, (location, user_id))
self.conn.commit()
cur.close()

def select_preprocessed_data_from_user_id(self, user_id):
cur = self.conn.cursor()
statement = '''
SELECT preprocessed_location, preprocessed_rest FROM users WHERE user_id = %s;
'''

cur.execute(statement, ([user_id]))
self.conn.commit()
preprocessed_data = cur.fetchone()
cur.close()
return preprocessed_data

def insert_into_identified_via_geonames(self, user_id, geonameid, country_name):
cur = self.conn.cursor()
statement = '''
INSERT INTO identified_via_geonames(user_id, geonameid, country_name)
VALUES (%s, %s, %s)
'''

cur.execute(statement, ([user_id, geonameid, country_name]))
self.conn.commit()
cur.close()
118 changes: 118 additions & 0 deletions geonames_api_application.py
@@ -0,0 +1,118 @@
import requests
import time
from database.database import Database

def make_search_request(query):
    """Run one GeoNames full-text search and return the decoded JSON body.

    Sends *query* as the ``q`` parameter against the searchJSON endpoint,
    using a hard-coded account name and relevance ordering.
    """
    payload = {
        'username': 'Svenskjefel',
        'order_by': 'relevance',
        'q': query,
    }
    response = requests.get("http://api.geonames.org/searchJSON", payload)
    return response.json()

def split_on_common_characters(query):
    """Split *query* on the first separator character found in it.

    Separators are tried in a fixed priority order: '/', '|', ';', ':',
    '-', '.'.  Only the first one present is used for the split.

    :param query: string to split.
    :return: list of parts, or an empty list when no separator occurs.
    """
    # Idiom fix: a loop over the separator priority list replaces the
    # original six-way elif chain; behavior is unchanged.
    for separator in ('/', '|', ';', ':', '-', '.'):
        if separator in query:
            return query.split(separator)
    return []

def split_to_words(query):
    """Split *query* on single spaces.

    :param query: string to split.
    :return: list of words; a string with no spaces yields a one-element list.
    """
    # Simplification: str.split(" ") already returns [query] when no space
    # is present, so the original explicit branch was unnecessary.
    return query.split(" ")

db = Database("twitter-geo")

# GeoNames free accounts are limited to roughly 2000 requests per hour, so
# the loop sleeps after every 1950 processed locations.
# NOTE(review): the counter tracks *locations*, but one location can trigger
# several API requests through the fallback queries below, so the real
# request rate can still exceed the quota -- verify against account limits.
some_user_locations = db.select_user_locations()

throttle_count = 0

no_result_count = 0

for location in some_user_locations:
    throttle_count += 1

    # Bug fix: the counter was never reset, so the `== 1950` sleep could fire
    # at most once per run; it also slept only 60 *seconds*, which does not
    # keep the rate under the hourly quota.  Sleep ~62 minutes (the schedule
    # stated in the commit description) and restart the count.
    if throttle_count >= 1950:
        time.sleep(62 * 60)
        throttle_count = 0

    user_id = location[0]
    user_location = location[1]

    # Bug fix: fetchone() returns None when the user row is missing; the old
    # code crashed on preprocessed_data[0] in that case.
    preprocessed_data = db.select_preprocessed_data_from_user_id(user_id)
    preprocessed_location = None
    preprocessed_rest = None
    if preprocessed_data is not None:
        preprocessed_location = preprocessed_data[0]
        if len(preprocessed_data) > 1:
            preprocessed_rest = preprocessed_data[1]

    # 1) Try the raw user-supplied location string.
    query = user_location
    print ("Sending query for: ", query)
    result = make_search_request(query)
    print (result)
    total_results = result['totalResultsCount']

    if total_results > 0:
        print("FOUND RESULT")
        # Renamed from `id` to avoid shadowing the builtin.
        geoname_id = result['geonames'][0]['geonameId']
        country_name = result['geonames'][0].get('countryName', None)
        db.insert_into_identified_via_geonames(user_id, geoname_id, country_name)
    elif preprocessed_location is None:
        # No preprocessed data for this user -- nothing further to try.
        print ("Still no result")
        no_result_count += 1
    else:
        # 2) Fall back to the preprocessed location.
        print ("Failed.... Checking Preprocessed locations")
        query = preprocessed_location
        print ("Sending query for: ", query)
        result = make_search_request(query)
        if result['totalResultsCount'] > 0:
            print ("FOUND RESULT")
            geoname_id = result['geonames'][0]['geonameId']
            country_name = result['geonames'][0].get('countryName', None)
            db.insert_into_identified_via_geonames(user_id, geoname_id, country_name)
        else:
            # 3) Fall back to the preprocessed "rest", when it exists.
            print('Failed.... Checking Rest of preprocess if exists')
            if (preprocessed_rest):
                query = preprocessed_rest
                result = make_search_request(query)
                print ("Sending query for: ", query)
            if (preprocessed_rest) and (result['totalResultsCount'] > 0):
                geoname_id = result['geonames'][0]['geonameId']
                country_name = result['geonames'][0].get('countryName', None)
                db.insert_into_identified_via_geonames(user_id, geoname_id, country_name)
            else:
                # 4) Last resort: query every individual word.
                print('Failed.... Checking Every Word')
                list_words = split_to_words(preprocessed_location)
                if (preprocessed_rest):
                    list_words += split_to_words(preprocessed_rest)
                for word in list_words:
                    query = word
                    print ("Sending query for: ", word)
                    result = make_search_request(query)
                    if result['totalResultsCount'] > 0:
                        geoname_id = result['geonames'][0]['geonameId']
                        country_name = result['geonames'][0].get('countryName', None)
                        db.insert_into_identified_via_geonames(user_id, geoname_id, country_name)
                        break
                else:
                    print ("Still no result")
                    no_result_count += 1

# MAYBE DO A FALLOVER TO LOCATIONS IN DBPEDIA SPOTLIGHT?
# Seems hard since its politics, though might be possible with locations


print('No result for: ', no_result_count, " queries")
55 changes: 7 additions & 48 deletions preprocess_user_location_in_db.py
@@ -1,45 +1,11 @@
import psycopg2
from database.database import Database

conn = psycopg2.connect('dbname={}'.format("twitter-geo"))
db = Database("twitter-geo")

def get_all_users_with_location():
cur = conn.cursor()
print("Starts the preprocess, This takes about an hour. Consider loading a dump instead.")

statement = """
SELECT user_id, user_location
FROM users
WHERE user_location IS NOT NULL
"""

cur.execute(statement)
conn.commit()
result_tuple = cur.fetchall()
cur.close()

return result_tuple

def insert_into_preprocessed(location, user_id, rest=None):
cur = conn.cursor()
if rest != None:
statement = """
UPDATE users
SET preprocessed_location = %s,
preprocessed_rest = %s
WHERE user_id = %s
"""
cur.execute(statement, (location, rest, user_id))
else:
statement = """
UPDATE users
SET preprocessed_location = %s,
preprocessed_rest = DEFAULT
WHERE user_id = %s
"""
cur.execute(statement, (location, user_id))
conn.commit()
cur.close()

all_users = get_all_users_with_location()
all_users = db.get_all_users_with_location()

i = 0

Expand All @@ -54,19 +20,12 @@ def insert_into_preprocessed(location, user_id, rest=None):
rest = rest_split[1][1:]
else:
rest = ",".join(rest_split)
insert_into_preprocessed(location, user[0], rest)
db.insert_into_preprocessed(location, user[0], rest)
else:
insert_into_preprocessed(user[1], user[0])
db.insert_into_preprocessed(user[1], user[0])
i+= 1

if (i%10000 == 0):
print('We have completed: ', i)
#print(location)
#print (rest)
#print(user[0])
#



# Save in DB
# ALTER TABLE IN DB
print ("Done preprocessing")

0 comments on commit 2669117

Please sign in to comment.