In [5]:
import csv
import xml.etree.ElementTree as ET
import json
import pyodbc
from geopy.geocoders import Nominatim
import requests
from datetime import datetime
import os

# Load partecipant age, status, and type dictionaries
with open('dict_partecipant_age.json') as f1:
    dict_partecipant_age = json.load(f1)

with open('dict_partecipant_status.json') as f2:
    dict_partecipant_status = json.load(f2)

with open('dict_partecipant_type.json') as f3:
    dict_partecipant_type = json.load(f3)

# Function to compute additional date-related data
def compute_date_data(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    date = date_obj.date()
    day = date_obj.day
    month = date_obj.month
    year = date_obj.year
    quarter = (date_obj.month - 1) // 3 + 1
    day_of_week = date_obj.strftime('%A')
    return date, day, month, year, quarter, day_of_week

# Function to parse dates.xml and create a mapping of date_fk to real date
def parse_dates_xml(xml_file):
    date_mapping = {}
    tree = ET.parse(xml_file)
    root = tree.getroot()

    for row in root.findall('.//row'):
        date = row.find('date').text
        date_pk = int(row.find('date_pk').text)
        date_mapping[date_pk] = date

    return date_mapping

# Function to compute crime gravity using provided dictionaries
def compute_crime_gravity(x):
    gravity = dict_partecipant_age.get(x['partecipant_age_group'], 1) * \
              dict_partecipant_type.get(x['partecipant_type'], 1) * \
              dict_partecipant_status.get(x['partecipant_status'], 1)
    return gravity

# Function to get continent by country code
def get_continent_by_country_code(country_code):
    if country_code:
        try:
            response = requests.get(f'https://restcountries.com/v3/alpha/{country_code.lower()}')
            data = response.json()
            continent = data[0]['region']
            return continent
        except Exception as e:
            print(f"Error fetching continent information: {e}")

    return None

# Function to get location information
def get_location_info(latitude, longitude):
    geolocator = Nominatim(user_agent="Lab_DSS_Group_ID_200")
    location = geolocator.reverse((latitude, longitude), language='en')

    if location is not None:
        address = location.address
        city = location.raw.get('address', {}).get('city') or location.raw.get('address', {}).get('town') or location.raw.get('address', {}).get('village') or location.raw.get('address', {}).get('county', None)

        state = location.raw.get('address', {}).get('state', None)

        return {
            'city': city,
            'state': state,
        }
    else:
        return None

# Connection string
server = 'tcp:lds.di.unipi.it'
username = 'Group_ID_200'
password = '89VIG10K'
database = 'Group_ID_200_DB'
connectionString = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password

# Connect to the SQL Server database
conn = pyodbc.connect(connectionString)
cursor = conn.cursor()

geo_id_dict = {}
gun_id_dict = {}
partecipant_id_dict = {}
date_id_dict = {}
incident_id_dict = {}
custody_id_dict = {}

# Function to insert data into the database
def insert_data_with_ID(conn, cursor, id_dict, table_name, key_dict):
    key_tuple = tuple(key_dict.values())
    if key_tuple not in id_dict:
        id_dict[key_tuple] = next(iter(key_dict.values()))
        columns = ', '.join(key_dict.keys())
        placeholders = ', '.join(['?'] * len(key_dict))
        values = ', '.join(str(value) for value in key_dict.values())
        insert_query = f'INSERT INTO {table_name} ({columns}) VALUES ({values});'

        cursor.execute(insert_query)
        conn.commit()

def insert_data_without_ID(conn, cursor, table_name, key_dict):
    columns = ', '.join(key_dict.keys())
    placeholders = ', '.join(['?'] * len(key_dict))
    values = ', '.join(str(value) for value in key_dict.values())
    insert_query = f'INSERT INTO {table_name} ({columns}) VALUES ({values});'
    select_query = f'SELECT TOP 1 * FROM {table_name};'

    cursor.execute(insert_query)
    conn.commit()

    cursor.execute(select_query)
    id_generated = cursor.fetchone()[0]

    return id_generated

def get_or_insert_id(conn, cursor, id_dict, table_name, key_dict):
    key_tuple = tuple(key_dict.values())
    if key_tuple in id_dict:
        return id_dict[key_tuple]
    else:
        last_inserted_id = insert_data_without_ID(conn, cursor, table_name, key_dict)
        id_dict[key_tuple] = last_inserted_id
        return last_inserted_id

date_mapping = parse_dates_xml('dates.xml')

def split_and_integrate(csv_file):
    # List of table names in your database
    table_names = ['Custody', 'Geography', 'Gun', 'Date', 'Incident', 'partecipant']

    # Clean the tables by deleting all records
    for table_name in table_names:
        cursor.execute(f'DELETE FROM {table_name}')
        conn.commit()

    # Read and process Police.csv
    with open(csv_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        next(reader)  # Skip the header row

        # Set the batch size
        batch_size = 10

        # Counter to track the number of processed rows
        row_count = 0

        for row in reader:
            custody_id = row['custody_id']
            participant_age_group = row['participant_age_group']
            participant_gender = row['participant_gender']
            participant_status = row['participant_status']
            participant_type = row['participant_type']
            latitude = float(row['latitude'])
            longitude = float(row['longitude'])
            gun_stolen = row['gun_stolen']
            gun_type = row['gun_type']
            incident_id = row['incident_id']
            date_fk = row['date_fk']

            gun_stolen_bool = 1 if gun_stolen == 'Stolen' else 0
            gun_key_dict = {"is_stolen": gun_stolen_bool, "gun_type": gun_type}
            gun_id = get_or_insert_id(conn, cursor, gun_id_dict, 'Gun', gun_key_dict)

            partecipant_key = {
                "age_group": participant_age_group,
                "gender": participant_gender,
                "type": participant_type,
                "status": participant_status
            }
            partecipant_id = get_or_insert_id(conn, cursor, partecipant_id_dict, 'partecipant', partecipant_key)

            # Get location information from latitude and longitude
            location_info = get_location_info(latitude, longitude)
            city = location_info["city"]
            state = location_info["state"]

            geo_key = {
                "latitude": str(latitude), "longitude": str(longitude),
                "city": city, "state": state
            }
            geo_id = get_or_insert_id(conn, cursor, geo_id_dict, 'Geography', geo_key)

            # Normal ID, No Incremental Tables:
            date_id = int(date_fk)
            date_value = date_mapping[date_id]
            date, day, month, year, quarter, day_of_week = compute_date_data(date_value)
            date_key = {
                "date_id": date_id, "the_date": date, "the_day": day,
                "the_month": month, "the_year": year, "quarter": quarter, "day_of_week": day_of_week
            }
            insert_data_with_ID(conn, cursor, date_id_dict, "Date", date_key)

            incident_id = int(incident_id)
            incident_key = {"incident_id": incident_id}
            insert_data_with_ID(conn, cursor, incident_id_dict, "Incident", incident_key)

            custody_key = {
                "custody_id": custody_id, "partecipant_id": partecipant_id, "gun_id": gun_id,
                "geo_id": geo_id, "date_id": date_id, "crime_gravity": compute_crime_gravity(row),
                "incident_id": incident_id
            }
            insert_data_with_ID(conn, cursor, custody_id_dict, 'Custody', custody_key)

            row_count += 1

            if row_count % batch_size == 0:
                print(f'Processed {row_count} rows.')
                conn.commit()

    # Commit any remaining records
    conn.commit()
    print('Data insertion completed successfully.')

# Call the function with the appropriate arguments
split_and_integrate('Police.csv')

# Close the database connection when done
cursor.close()
conn.close()


ProgrammingError: ('42S22', "[42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Shotgun'. (207) (SQLExecDirectW)")