In [1]:
import csv
import xml.etree.ElementTree as ET
import json
import csv
import pyodbc
from geopy.geocoders import Nominatim
import requests
from datetime import datetime

In [2]:
# Load participant age, status, and type dictionaries
with open('dict_partecipant_age.json') as f1:
    dict_partecipant_age = json.load(f1)

with open('dict_partecipant_status.json') as f2:
    dict_partecipant_status = json.load(f2)

with open('dict_partecipant_type.json') as f3:
    dict_partecipant_type = json.load(f3)

In [3]:
# Function to compute additional date-related data
def compute_date_data(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    date = date_obj.date()
    day = date_obj.day
    month = date_obj.month
    year = date_obj.year
    quarter = (date_obj.month - 1) // 3 + 1
    day_of_week = date_obj.strftime('%A')
    return date, day, month, year, quarter, day_of_week

In [4]:
# Function to parse dates.xml and create a mapping of date_fk to real date
def parse_dates_xml(xml_file):
    date_mapping = {}
    tree = ET.parse(xml_file)
    root = tree.getroot()

    for row in root.findall('.//row'):
        date = row.find('date').text
        date_pk = int(row.find('date_pk').text)
        date_mapping[date_pk] = date

    return date_mapping


In [5]:
# Function to compute crime gravity using provided dictionaries
def compute_crime_gravity(x):
    gravity = dict_partecipant_age.get(x['participant_age_group'], 1) * \
              dict_partecipant_type.get(x['participant_type'], 1) * \
              dict_partecipant_status.get(x['participant_status'], 1)
    return gravity

In [6]:
def get_continent_by_country_code(country_code):
    if country_code:
        try:
            response = requests.get(f'https://restcountries.com/v3/alpha/{country_code.lower()}')
            data = response.json()
            continent = data[0]['region']
            return continent
        except Exception as e:
            print(f"Error fetching continent information: {e}")
    
    return None

In [7]:
def get_location_info(latitude, longitude):
    geolocator = Nominatim(user_agent="Lab_DSS_Group_ID_200")
    location = geolocator.reverse((latitude, longitude), language='en')

    if location is not None:
        address = location.address
        city = location.raw.get('address', {}).get('city') or location.raw.get('address', {}).get('town') or location.raw.get('address', {}).get('village') or location.raw.get('address', {}).get('county', None)

        state = location.raw.get('address', {}).get('state', None)
        #country = location.raw.get('address', {}).get('country', None) (Not calculated as the country is USA for all records, and the Continent is North America)

        #continent = get_continent_by_country_code(location.raw.get('address', {}).get('country_code', None)) 

        return {
            'city': city,
            'state': state,
            #'country': country,
            #'continent': continent
        }
    else:
        return None


In [8]:
date_mapping = parse_dates_xml('dates.xml')

In [9]:
import csv
import itertools

# Function to insert data into the database
def insert_data_with_ID(id_dict, table_name, key_dict, csv_writer):
    key_tuple = tuple(key_dict.values())
    if key_tuple not in id_dict:
        id_dict[key_tuple] = next(iter(key_dict.values()))
        csv_writer.writerow(key_dict.values())

def split_and_integrate(csv_file):
    # List of CSV file names
    csv_file_names = ['Custody.csv', 'Geography.csv', 'Gun.csv', 'Date.csv', 'Incident.csv', 'Participant.csv']

    # Create CSV writers for each file
    csv_writers = {name: csv.writer(open(name, 'w', newline='')) for name in csv_file_names}

    # Read and process Police.csv
    with open(csv_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        next(reader)  # Skip the header row

        # Counter to track the number of processed rows
        row_count = 0
        try:
            for _ in range(0):
                next(reader)

            for row in reader:
                try:
                    custody_id, participant_age_group, participant_gender, participant_status, participant_type, latitude, longitude, gun_stolen, gun_type, incident_id, date_fk = row

                    gun_stolen_bool = 1 if gun_stolen == 'Stolen' else 0
                    gun_key_dict = {"is_stolen": gun_stolen_bool, "gun_type": gun_type}
                    insert_data_with_ID(gun_id_dict, 'Gun.csv', gun_key_dict, csv_writers['Gun'])

                    participant_key = {
                        "age_group": participant_age_group,
                        "gender": participant_gender,
                        "type": participant_type,
                        "status": participant_status
                    }
                    insert_data_with_ID(participant_id_dict, 'Participant.csv', participant_key, csv_writers['Participant'])

                    # Get location information from latitude and longitude
                    latitude, longitude = float(latitude), float(longitude)
                    location_info = get_location_info(latitude, longitude)
                    city = location_info["city"]
                    state = location_info["state"]
                    country = "United States"
                    continent = "North America"
                    geo_key = {
                        "latitude": str(latitude),
                        "longitude": str(longitude),
                        "city": city,
                        "state": state,
                        "country": country,
                        "continent": continent
                    }
                    insert_data_with_ID(geo_id_dict, 'Geography.csv', geo_key, csv_writers['Geography'])

                    # Normal ID, No Incremental Tables:
                    date_id = int(date_fk)
                    date_value = date_mapping[date_id]
                    date, day, month, year, quarter, day_of_week = compute_date_data(date_value)
                    date_key = {
                        "date_id": date_id,
                        "the_date": date,
                        "the_day": day,
                        "the_month": month,
                        "the_year": year,
                        "quarter": quarter,
                        "day_of_week": day_of_week
                    }
                    insert_data_with_ID(date_id_dict, 'Date.csv', date_key, csv_writers['Date'])

                    incident_id = int(incident_id)
                    incident_key = {"incident_id": incident_id}
                    insert_data_with_ID(incident_id_dict, 'Incident.csv', incident_key, csv_writers['Incident'])

                    custody_key = {
                        "custody_id": custody_id,
                        "partecipant_id": participant_id_dict[participant_key],
                        "gun_id": gun_id_dict[gun_key_dict],
                        "geo_id": geo_id_dict[geo_key],
                        "date_id": date_id_dict[date_key],
                        "crime_gravity": compute_crime_gravity(row),
                        "incident_id ": incident_id
                    }
                    insert_data_with_ID(custody_id_dict, 'Custody.csv', custody_key, csv_writers['Custody'])

                    # Increment the row count
                    row_count += 1

                    # Commit in batches of 1000
                    if row_count % 10 == 0:
                        print(row_count)

                except Exception as e:
                    print(f"Error processing row {row_count}: {e}")

        except Exception as outer_exception:
            print(f"Error in outer loop: {outer_exception}")

    # Close all CSV files
    for writer in csv_writers.values():
        writer.close()




# Define dictionaries for tracking IDs
custody_id_dict = {}
geo_id_dict = {}
gun_id_dict = {}
date_id_dict = {}
incident_id_dict = {}
participant_id_dict = {}

# Specify the CSV file to read
csv_file_path = 'police.csv'

# Run the split_and_integrate function
split_and_integrate(csv_file_path)


Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error processing row 0: 'Gun'
Error proc

AttributeError: '_csv.writer' object has no attribute 'close'

In [18]:
import requests
from geopy.geocoders import Nominatim
from concurrent.futures import ThreadPoolExecutor

# Use Geonames for country information
GEONAMES_API_URL = "http://api.geonames.org/countryInfoJSON"
GEONAMES_USERNAME = "your_geonames_username"

In [26]:
def get_location_info(latitude, longitude):
    geolocator = Nominatim(user_agent="Lab_DSS_Group_ID_200")
    location = geolocator.reverse((latitude, longitude), language='en')

    if location is not None:
        address = location.address
        city = location.raw.get('address', {}).get('city') or location.raw.get('address', {}).get('town') or location.raw.get('address', {}).get('village') or location.raw.get('address', {}).get('county', None)
        state = location.raw.get('address', {}).get('state', None)

        # Use a ThreadPoolExecutor for parallelizing requests
        #with ThreadPoolExecutor() as executor:
            #continent = executor.submit(get_continent_by_country_code, location.raw.get('address', {}).get('country_code', None))

        return {
            'city': city,
            'state': state,
            #'continent': continent.result()  # Wait for the result
        }
    else:
        return None

In [23]:
# Example usage
latitude = 59.206600
longitude = -160.502000
api_key = "AIzaSyBU-5iM3eGnShHFm0V1NFnkGmInJRysaOI"

result = get_location_info(latitude, longitude, api_key)
print(result)

200
b'{\n   "plus_code" : \n   {\n      "global_code" : "92FX6F4X+J6J"\n   },\n   "results" : \n   [\n      {\n         "address_components" : \n         [\n            {\n               "long_name" : "92FX6F4X+J6",\n               "short_name" : "92FX6F4X+J6",\n               "types" : \n               [\n                  "plus_code"\n               ]\n            }\n         ],\n         "formatted_address" : "92FX6F4X+J6",\n         "geometry" : \n         {\n            "bounds" : \n            {\n               "northeast" : \n               {\n                  "lat" : 59.206625,\n                  "lng" : -160.501875\n               },\n               "southwest" : \n               {\n                  "lat" : 59.2065,\n                  "lng" : -160.502\n               }\n            },\n            "location" : \n            {\n               "lat" : 59.20659999999999,\n               "lng" : -160.502\n            },\n            "location_type" : "GEOMETRIC_CENTER",\n       

In [22]:
import pyodbc
from concurrent.futures import ThreadPoolExecutor
import requests

# Function to get location info using Google Maps Geocoding API
def get_location_info(latitude, longitude, api_key):
    url = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={latitude},{longitude}&key={api_key}"
    
    try:
        response = requests.get(url)
        print(response.status_code)
        print(response.content)

        data = response.json()

        # Check if the response contains results
        if data.get("results"):
            # Extract address components
            address_components = data["results"][0]["address_components"]

            # Try to extract city, town, village, county
            city = next((component["long_name"] for component in address_components if "locality" in component["types"]), None)
            town = next((component["long_name"] for component in address_components if "sublocality_level_1" in component["types"]), None)
            village = next((component["long_name"] for component in address_components if "sublocality_level_2" in component["types"]), None)
            county = next((component["long_name"] for component in address_components if "administrative_area_level_2" in component["types"]), None)

            # Return the first non-None value found, or None if all are None
            return {"city": city or town or village or county or None}

    except requests.exceptions.RequestException as e:
        print(f"Error in geocoding request: {e}")

    # Return None if no valid location information is found
    return {"city": None}


In [16]:
import random

def generate_random_coordinates(num_points=10000):
    coordinates = []
    for _ in range(num_points):
        latitude = round(random.uniform(-90, 90), 6)
        longitude = round(random.uniform(-180, 180), 6)
        coordinates.append((latitude, longitude))
    return coordinates

# Example usage:
random_coordinates = generate_random_coordinates()
api_key ="AIzaSyBU-5iM3eGnShHFm0V1NFnkGmInJRysaOI"

for index, (latitude, longitude) in enumerate(random_coordinates, start=1):
    print(f"Point {index}: Latitude = {latitude}, Longitude = {longitude}")
    result = get_location_info(latitude, longitude, api_key)
    print(result)


Point 1: Latitude = 75.549565, Longitude = -161.756928
{'city': None}
Point 2: Latitude = 85.519678, Longitude = -39.551348
{'city': None}
Point 3: Latitude = 32.603652, Longitude = -99.765516
{'city': None}
Point 4: Latitude = -50.118199, Longitude = 178.32943
{'city': None}
Point 5: Latitude = 60.675149, Longitude = 98.284061
{'city': None}
Point 6: Latitude = 15.057072, Longitude = 6.803407
{'city': None}
Point 7: Latitude = 76.701787, Longitude = 71.642039
{'city': None}
Point 8: Latitude = -84.506087, Longitude = -102.12829
{'city': None}
Point 9: Latitude = -77.274674, Longitude = 167.608661
{'city': None}
Point 10: Latitude = -84.202845, Longitude = -37.535922
{'city': None}
Point 11: Latitude = -1.612879, Longitude = 79.370979
{'city': None}
Point 12: Latitude = 72.565925, Longitude = 87.542383
{'city': None}
Point 13: Latitude = 41.415604, Longitude = 62.965127
{'city': None}
Point 14: Latitude = -8.168304, Longitude = -110.198168
{'city': None}
Point 15: Latitude = 16.762201,

KeyboardInterrupt: 

In [1]:
#Write to my db
# Connection string
server = 'tcp:lds.di.unipi.it'
username = 'Group_ID_200'
password = '89VIG10K'
database = 'Group_ID_200_DB'
connectionString = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password

In [20]:
import pyodbc
from concurrent.futures import ThreadPoolExecutor
import requests
import time
from decimal import Decimal  # Import the Decimal class

# Function to get location info using Google Maps Geocoding API with retry logic
def get_location_info_with_retry(latitude, longitude, api_key, max_retries=3):
    # Convert Decimal values to float
    latitude = float(latitude)
    longitude = float(longitude)

    for attempt in range(max_retries):
        try:
            url = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={latitude},{longitude}&key={api_key}"
            response = requests.get(url)
            data = response.json()

            if data.get("results"):
                address_components = data["results"][0]["address_components"]
                city = next((component["long_name"] for component in address_components if "locality" in component["types"]), None)
                town = next((component["long_name"] for component in address_components if "sublocality_level_1" in component["types"]), None)
                village = next((component["long_name"] for component in address_components if "sublocality_level_2" in component["types"]), None)
                county = next((component["long_name"] for component in address_components if "administrative_area_level_2" in component["types"]), None)
                state = next((component["long_name"] for component in address_components if "administrative_area_level_1" in component["types"]), None)

                return {"city": city or town or village or county or None, "state": state}
        except requests.exceptions.RequestException as e:
            print(f"Error in geocoding request (attempt {attempt + 1}/{max_retries}): {e}")
            time.sleep(1)  # Add a delay before retrying

    return {"city": None, "state": None}

# Assuming you have a SQL Server connection
# Connect to the SQL Server database
conn = pyodbc.connect(connectionString)
cursor = conn.cursor()

# Assuming 'Geography' is your table name
query = "SELECT latitude, longitude FROM Geography WHERE city IS NULL OR state IS NULL"
cursor.execute(query)

# Fetch all records at once
records = cursor.fetchall()
# records = records[20:100]
print(len(records))

# Replace "your_google_maps_api_key" with your actual Google Maps API key
google_maps_api_key = "AIzaSyBU-5iM3eGnShHFm0V1NFnkGmInJRysaOI"

# Batch size for committing changes
batch_size = 1
row = 0

# Use ThreadPoolExecutor for parallel processing (adjust the number of threads as needed)
with ThreadPoolExecutor(max_workers=8) as executor:
    for start_index in range(0, len(records), batch_size):
        end_index = start_index + batch_size
        batch_records = records[start_index:end_index]

        # Get location info for the current batch
        location_info_list = list(executor.map(lambda record: get_location_info_with_retry(*record, google_maps_api_key), batch_records))

        print("Processing batch:", start_index // batch_size + 1)
        for record, info in zip(batch_records, location_info_list):
            update_query = "UPDATE Geography SET city = ?, state = ? WHERE latitude = ? AND longitude = ?"
            try:
                cursor.execute(update_query, (info["city"], info["state"], *record))

                # Fetch the updated record
                select_query = "SELECT * FROM Geography WHERE latitude = ? AND longitude = ?"
                cursor.execute(select_query, record)
                updated_record = cursor.fetchone()

                # Convert Decimal values to float for latitude and longitude
                updated_record = tuple(float(val) if isinstance(val, Decimal) else val for val in updated_record)

                # Print the updated record
                #print("Updated record:", updated_record)
            except Exception as e:
                print(f"Error processing row {row}: {e}")
                break  # Exit the loop on error

        conn.commit()
        # Print progress
        row += batch_size
        print("Processed rows:", row)
    conn.commit()
    

# Close the cursor and connection
cursor.close()
conn.close()


53
Processing batch: 1
Processed rows: 1
Processing batch: 2
Processed rows: 2
Processing batch: 3
Processed rows: 3
Processing batch: 4
Processed rows: 4
Processing batch: 5
Processed rows: 5
Processing batch: 6
Processed rows: 6
Processing batch: 7
Processed rows: 7
Processing batch: 8
Processed rows: 8
Processing batch: 9
Processed rows: 9
Processing batch: 10
Processed rows: 10
Processing batch: 11
Processed rows: 11
Processing batch: 12
Processed rows: 12
Processing batch: 13
Processed rows: 13
Processing batch: 14
Processed rows: 14
Processing batch: 15
Processed rows: 15
Processing batch: 16
Processed rows: 16
Processing batch: 17
Processed rows: 17
Processing batch: 18
Processed rows: 18
Processing batch: 19
Processed rows: 19
Processing batch: 20
Processed rows: 20
Processing batch: 21
Processed rows: 21
Processing batch: 22
Processed rows: 22
Processing batch: 23
Processed rows: 23
Processing batch: 24
Processed rows: 24
Processing batch: 25
Processed rows: 25
Processing batc

In [26]:
import pyodbc
from concurrent.futures import ThreadPoolExecutor
import requests
import time
from decimal import Decimal  # Import the Decimal class

#Write to my db
# Connection string
server = 'tcp:lds.di.unipi.it'
username = 'Group_ID_200'
password = '89VIG10K'
database = 'Group_ID_200_DB'
connectionString = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password


# Assuming you have a SQL Server connection
# Connect to the SQL Server database
conn = pyodbc.connect(connectionString)
cursor = conn.cursor()

# Assuming 'Geography' is your table name
query = "SELECT latitude, longitude FROM Geography WHERE city IS NULL OR state IS NULL"
cursor.execute(query)

# Fetch all records at once
records = cursor.fetchall()
# records = records[20:100]
print(len(records))


0


In [24]:
def get_location_info(latitude, longitude):
    geolocator = Nominatim(user_agent="Lab_DSS_Group_ID_200")
    location = geolocator.reverse((latitude, longitude), language='en')

    if location is not None:
        address = location.address
        city = location.raw.get('address', {}).get('city') or location.raw.get('address', {}).get('town') or location.raw.get('address', {}).get('village') or location.raw.get('address', {}).get('county', None)

        state = location.raw.get('address', {}).get('state', None)
        #country = location.raw.get('address', {}).get('country', None) (Not calculated as the country is USA for all records, and the Continent is North America)

        #continent = get_continent_by_country_code(location.raw.get('address', {}).get('country_code', None)) 

        return {
            'city': city,
            'state': state,
            #'country': country,
            #'continent': continent
        }
    else:
        return None


In [None]:
batch_size = 1
row = 0

# Use ThreadPoolExecutor for parallel processing (adjust the number of threads as needed)
with ThreadPoolExecutor(max_workers=8) as executor:
    for start_index in range(0, len(records), batch_size):
        end_index = start_index + batch_size
        batch_records = records[start_index:end_index]

        # Get location info for the current batch
        location_info_list = list(executor.map(lambda record: get_location_info_with_retry(*record, google_maps_api_key), batch_records))

        print("Processing batch:", start_index // batch_size + 1)
        for record, info in zip(batch_records, location_info_list):
            update_query = "UPDATE Geography SET city = ?, state = ? WHERE latitude = ? AND longitude = ?"
            try:
                cursor.execute(update_query, (info["city"], info["state"], *record))

                # Fetch the updated record
                select_query = "SELECT * FROM Geography WHERE latitude = ? AND longitude = ?"
                cursor.execute(select_query, record)
                updated_record = cursor.fetchone()

                # Convert Decimal values to float for latitude and longitude
                updated_record = tuple(float(val) if isinstance(val, Decimal) else val for val in updated_record)

                # Print the updated record
                #print("Updated record:", updated_record)
            except Exception as e:
                print(f"Error processing row {row}: {e}")
                break  # Exit the loop on error

        conn.commit()
        # Print progress
        row += batch_size
        print("Processed rows:", row)
    conn.commit()

# Close the cursor and connection
cursor.close()
conn.close()

In [27]:
# Assuming you have a SQL Server connection
# Connect to the SQL Server database
conn = pyodbc.connect(connectionString)
cursor = conn.cursor()

# Assuming 'Geography' is your table name
query = "SELECT latitude, longitude FROM Geography WHERE city IS NULL OR state IS NULL"
cursor.execute(query)

# Fetch all records at once
records = cursor.fetchall()
# records = records[20:100]
print(len(records))

1


In [28]:
import pyodbc
from concurrent.futures import ThreadPoolExecutor
import geopy
from geopy.geocoders import Nominatim
import time

# Function to get location info using Geopy Nominatim with retry logic
def get_location_info_with_retry(latitude, longitude, max_retries=3):
    location = None

    # Convert Decimal values to float
    latitude = float(latitude)
    longitude = float(longitude)

    for attempt in range(max_retries):
        try:
            geolocator = Nominatim(user_agent="your_app_name")
            location = geolocator.reverse((latitude, longitude), language='en')
            break  # Break the loop if successful
        except geopy.exc.GeocoderTimedOut as e:
            print(f"Error in geocoding request (attempt {attempt + 1}/{max_retries}): {e}")
            time.sleep(1)  # Add a delay before retrying

    if location:
        city = location.raw.get('address', {}).get('city') or location.raw.get('address', {}).get('town') or location.raw.get('address', {}).get('village') or location.raw.get('address', {}).get('county', None)
        state = location.raw.get('address', {}).get('state', None)

        return {"city": city, "state": state}

    return {"city": None, "state": None}

# Assuming you have a SQL Server connection
# Connect to the SQL Server database
conn = pyodbc.connect(connectionString)
cursor = conn.cursor()

# Assuming 'Geography' is your table name
query = "SELECT latitude, longitude FROM Geography WHERE city IS NULL OR state IS NULL"
cursor.execute(query)

# Fetch all records at once
records = cursor.fetchall()
# records = records[20:100]
print(len(records))

# Replace "your_google_maps_api_key" with your actual Google Maps API key
# (Note: Geopy Nominatim doesn't require an API key)

# Batch size for committing changes
batch_size = 1
row = 0

# Use ThreadPoolExecutor for parallel processing (adjust the number of threads as needed)
with ThreadPoolExecutor(max_workers=8) as executor:
    for start_index in range(0, len(records), batch_size):
        end_index = start_index + batch_size
        batch_records = records[start_index:end_index]

        # Get location info for the current batch
        location_info_list = list(executor.map(lambda record: get_location_info_with_retry(*record), batch_records))

        print("Processing batch:", start_index // batch_size + 1)
        for record, info in zip(batch_records, location_info_list):
            update_query = "UPDATE Geography SET city = ?, state = ? WHERE latitude = ? AND longitude = ?"
            try:
                cursor.execute(update_query, (info["city"], info["state"], *record))
                # Fetch the updated record, print the updated record (if needed)
            except Exception as e:
                print(f"Error processing row {row}: {e}")
                break  # Exit the loop on error

        conn.commit()
        # Print progress
        row += batch_size
        print("Processed rows:", row)
    conn.commit()

# Close the cursor and connection
cursor.close()
conn.close()


1


Processing batch: 1
Processed rows: 1
