Skip to content

Commit

Permalink
Added table and code to filter possible locations
Browse files Browse the repository at this point in the history
  • Loading branch information
Oscar-Rydh committed Apr 10, 2017
1 parent d822122 commit 93d83ec
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 4 deletions.
4 changes: 4 additions & 0 deletions README.md
Expand Up @@ -55,3 +55,7 @@ SELECT count(*) FROM tweets WHERE retweeted_id IS NULL AND in_reply_to_status_id
SELECT count(*) FROM tweets WHERE retweeted_id IS NULL AND in_reply_to_status_id IS NOT NULL AND in_reply_to_user_id IS NULL;
SELECT count(*) FROM tweets WHERE retweeted_id IS NULL AND in_reply_to_status_id IS NULL AND in_reply_to_user_id IS NOT NULL;
SELECT count(*) FROM tweets WHERE retweeted_id IS NULL AND in_reply_to_status_id IS NULL AND in_reply_to_user_id IS NULL;


# SELECT ALL FILTERED LOCATIONS:
select user_location, name, ratio, country_code from users inner join filtered_user_locations using (user_id) inner join geonames using(geonameid);
10 changes: 10 additions & 0 deletions database/create_tables.sql
Expand Up @@ -101,3 +101,13 @@ CREATE TABLE IF NOT EXISTS geonames (

CREATE VIEW trumps_tweets AS
SELECT * FROM tweets WHERE user_id = 25073877;

-- Best geonames match found for a user's free-text location.
CREATE TABLE IF NOT EXISTS filtered_user_locations(
    -- Auto-generated surrogate key: the application inserts only
    -- (user_id, geonameid, ratio), so id must be filled in by the database.
    id SERIAL,
    user_id BIGINT,
    geonameid INT,
    -- Similarity ratio between the user's location text and the matched
    -- geonames name (1 means an exact match).
    ratio DECIMAL,
    PRIMARY KEY (id),
    FOREIGN KEY(user_id) REFERENCES users(user_id),
    FOREIGN KEY(geonameid) REFERENCES geonames(geonameid)
);
49 changes: 49 additions & 0 deletions database/database.py
Expand Up @@ -333,3 +333,52 @@ def loadCountries(self,
population, elevation, dem, timezone, modification_date))
self.conn.commit()
cur.close()

def select_user_locations(self):
    """Fetch up to 100 users whose location has not been matched yet.

    Selects users that have a non-NULL ``user_location`` and whose
    ``user_id`` does not yet appear in ``filtered_user_locations``.

    Returns:
        A list of ``[user_id, user_location]`` lists, one per selected row.
    """
    # NOTE(review): ``NOT IN`` selects nothing at all if the subquery ever
    # yields a NULL user_id -- confirm that column can never be NULL.
    statement = """
        SELECT user_id, user_location
        FROM users
        WHERE user_location IS NOT NULL
        AND user_id NOT IN (SELECT user_id from filtered_user_locations)
        LIMIT 100;
    """
    cur = self.conn.cursor()
    try:
        cur.execute(statement)
        # Fetch before committing so the rows are read while the
        # transaction that produced them is still open.
        rows = cur.fetchall()
        self.conn.commit()
    finally:
        # Release the cursor even when the query or the fetch raises.
        cur.close()
    return [[row[0], row[1]] for row in rows]

def select_database_locations(self):
    """Load every distinct geonames row whose feature code is ``'PPL'``.

    A progress line is printed for every 100000th row converted
    (including row 0).

    Returns:
        A list of ``[geonameid, name, asciiname, latitude, longitude]``
        lists, with latitude and longitude converted to ``float``.
    """
    statement = """
        SELECT DISTINCT geonameid, name, asciiname, latitude, longitude
        FROM geonames
        WHERE feature_code = 'PPL';
    """
    cur = self.conn.cursor()
    try:
        cur.execute(statement)
        # Fetch before committing so the rows are read while the
        # transaction that produced them is still open.
        rows = cur.fetchall()
        self.conn.commit()
    finally:
        # Release the cursor even when the query or the fetch raises.
        cur.close()

    locations = []
    for index, row in enumerate(rows):
        if index % 100000 == 0:
            print("We have selected: ", index)
        # Coordinates are converted to plain floats for the caller.
        locations.append([row[0], row[1], row[2], float(row[3]), float(row[4])])
    return locations

def set_filtered_location(self, user_id, geonameid, ratio):
    """Store the geonames match chosen for a user.

    Inserts one row into ``filtered_user_locations`` and commits it.

    Args:
        user_id: Id of the user whose location was matched.
        geonameid: Id of the matched ``geonames`` row.
        ratio: Similarity ratio of the match.

    Raises:
        Whatever the database driver raises when the insert or commit
        fails; the pending transaction is rolled back first.
    """
    statement = """
        INSERT INTO filtered_user_locations(user_id, geonameid, ratio)
        VALUES (%s, %s, %s);
    """
    cur = self.conn.cursor()
    try:
        cur.execute(statement, (user_id, geonameid, ratio))
        self.conn.commit()
    except Exception:
        # Leave the connection usable for later statements, then let
        # the caller see the original error.
        self.conn.rollback()
        raise
    finally:
        cur.close()
45 changes: 45 additions & 0 deletions place_with_contains.py
@@ -0,0 +1,45 @@
# This script matches each user's location text against the names in the "geonames" table using difflib similarity ratios.
from database.database import Database
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of sequences *a* and *b*.

    The result is a float in the range [0, 1]; 1 means the two
    sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()


db = Database("twitter-geo")

# Rows of [user_id, user_location] for users that have no stored match yet.
user_locations = db.select_user_locations()
# Rows of [geonameid, name, asciiname, latitude, longitude].
db_locations = db.select_database_locations()


def _best_match(location_text, candidates):
    """Return ``(geonameid, ratio)`` of the candidate closest to *location_text*.

    Both the name and the asciiname of every candidate are compared,
    skipping values that are None. The search stops at the first exact
    match (ratio 1). ``(None, 0)`` is returned when nothing scored above 0.
    """
    best_ratio = 0
    best_geoid = None
    for candidate in candidates:
        geonameid = candidate[0]
        for candidate_name in (candidate[1], candidate[2]):
            if candidate_name is None:
                continue
            ratio = similar(location_text, candidate_name)
            if ratio == 1:
                # Exact match: nothing can score higher, stop searching.
                return geonameid, ratio
            if ratio > best_ratio:
                best_ratio = ratio
                best_geoid = geonameid
    return best_geoid, best_ratio


for user_location in user_locations:
    print ("Parsing: ", user_location)
    geonameid, ratio = _best_match(user_location[1], db_locations)
    # Store exactly one row per user: the exact match when one exists,
    # otherwise the best partial match.
    if geonameid is not None:
        db.set_filtered_location(user_location[0], geonameid, ratio)

print ("WE ARE DONE")
1 change: 0 additions & 1 deletion placement_methods/place_with_contains.py

This file was deleted.

7 changes: 4 additions & 3 deletions web/database_relation_searcher.py
@@ -1,7 +1,7 @@
from database import Database

class Database(Database):

#DO NOT USE THIS ITS NOT WORKING!!!!!!!!!!!!!!!!!

#retweeted_id --> the tweet is a retweet
#in_reply_to_user_id --> someone has been mentioned
Expand All @@ -19,7 +19,7 @@ class Database(Database):
#SELECT count(*) FROM tweets WHERE retweeted_id IS NULL AND in_reply_to_status_id IS NOT NULL AND in_
#SELECT count(*) FROM tweets WHERE retweeted_id IS NULL AND in_reply_to_status_id IS NULL AND in_repl
#SELECT count(*) FROM tweets WHERE retweeted_id IS NULL AND in_reply_to_status_id IS NULL AND in_repl

'''
def get_total_clean_retweet_count(self):
cur = self.conn.cursor()
Expand Down Expand Up @@ -77,4 +77,5 @@ def get_total_mention_count(self):
commented_retweet_count = cur.fetchone()
cur.close()
return commented_retweet_count[0]
return comme nted_retweet_count[0]
'''

0 comments on commit 93d83ec

Please sign in to comment.