In [None]:
!pip install Levenshtein
!pip install py_stringmatching

In [None]:
import numpy as np
import pandas as pd
import Levenshtein as lev
from py_stringmatching import similarity_measure as sm


# Task 1

In [None]:
df = pd.read_csv("dft-road-casualty-statistics-collision-provisional-mid-year-unvalidated-2024.csv",
                 low_memory=False)

In [None]:
# Statistical methods for columns:
# 1. Total number of collisions:
total_collisions = len(df)

# 2. Average number of causalities per month:
df['date'] = pd.to_datetime(df['date'], errors = 'coerce', dayfirst=True)
df['month'] = df['date'].dt.month
avg_casualties_per_month = df.groupby('month')['number_of_casualties'].mean()

# 3. Max number of causalities:
max_casualties = df['number_of_casualties'].max()

# 4. Most common collision time:
common_time = df['time'].mode()[0]

# 5. Average speed limit:
avg_speed_limit = df['speed_limit'].mean()

# 6. Distribution of weather conditions:
weather_distribution = df['weather_conditions'].value_counts()

# 7. Frequency of speed limit:
speed_frequency = df['speed_limit'].value_counts()

# 8. Most common speed range:
def speed_category(x):
    if x <= 30:
        return "Low"
    elif x >= 50:
        return "High"
    else:
        return "Medium"
df['speed_range'] = df['speed_limit'].apply(speed_category)
most_common_range = df['speed_range'].value_counts().idxmax()
most_common_range

# 9. Median of the police attendance:
median_police = df['did_police_officer_attend_scene_of_collision'].median()

# 10. Standard deviation of special condition at site:
std_special_condition = df['did_police_officer_attend_scene_of_collision'].std()

summary = {
    "Total Collisions": total_collisions,
    "Average Causalities per Month": avg_casualties_per_month.to_dict(),
    "Max Number of Causalities": max_casualties,
    "Most Common Collision Time": common_time,
    "Average Speed Limit": avg_speed_limit,
    "Distribution of weather conditions": weather_distribution,
    "Frequency of speed limit": speed_frequency.to_dict(),
    "Most Common Speed Range": most_common_range,
    "Median of the police attendance": median_police,
    "Standard deviation of special condition at site": std_special_condition
}
summary

{'Total Collisions': 46707,
 'Average Causalities per Month': {1: 1.2442567987085558,
  2: 1.2467266775777415,
  3: 1.2659081889138775,
  4: 1.2750883872722327,
  5: 1.2735447806537854,
  6: 1.2975550122249389},
 'Max Number of Causalities': 21,
 'Most Common Collision Time': '17:00',
 'Average Speed Limit': 35.509559594921534,
 'Distribution of weather conditions': weather_conditions
 1    36830
 2     5796
 9     1554
 8     1377
 4      488
 5      442
 3      116
 7       94
 6       10
 Name: count, dtype: int64,
 'Frequency of speed limit': {30: 23467,
  20: 9003,
  60: 5521,
  40: 4050,
  70: 2485,
  50: 2146,
  -1: 35},
 'Most Common Speed Range': 'Low',
 'Median of the police attendance': 1.0,
 'Standard deviation of special condition at site': 0.8309113579162716}

# Task 2

## Part 1

In [None]:
acm = pd.read_csv("ACM.csv")
dblp = pd.read_csv("DBLP2.csv", encoding="latin1")

In [None]:
# b. Change all alphabetical characters into lowercase.
for col in acm.columns:
  if acm[col].dtype == 'object':
    acm[col] = acm[col].str.lower()

for col in dblp.columns:
  if dblp[col].dtype == 'object':
    dblp[col] = dblp[col].str.lower()

print(acm.head(10)) #to double-check correctness of action

       id                                              title  \
0  304586  the wasa2 object-oriented workflow management ...   
1  304587  a user-centered interface for querying distrib...   
2  304589  world wide database-integrating the web, corba...   
3  304590           xml-based information mediation with mix   
4  304582  the ccube constraint object-oriented database ...   
5  304583  the cornell jaguar project: adding mobility to...   
6  304584  the active multisync controller of the cubetre...   
7  304585                  the jungle database search engine   
8  306112  adept: an agent-based approach to business pro...   
9  306115  a componentized architecture for dynamic elect...   

                                             authors  \
0                    gottfried vossen, mathias weske   
1                  isabel f. cruz, kimberly m. james   
2  athman bouguettaya, boualem benatallah, lily h...   
3  chaitan baru, amarnath gupta, bertram lud&#228...   
4  alexander br

In [None]:
# c. Convert multiple spaces to one.
for col in acm.columns:
  if acm[col].dtype == 'object':
    acm[col] = acm[col].str.replace(r'\s+', ' ', regex=True)

for col in dblp.columns:
  if dblp[col].dtype == 'object':
    dblp[col] = dblp[col].str.replace(r'\s+', ' ', regex=True)

In [None]:
# d. Use Levenshtein similarity
def levenshtein_similarity(s1, s2):
    return 1 - lev.distance(s1, s2) / max(len(s1), len(s2))

In [None]:
# e. Use Jaro similarity
def jaro_similarity(s1, s2):
    return lev.jaro(s1, s2)

In [None]:
#. f Use a modified version of the affine similarity that is scaled to the interval [0, 1]
def scaled_affine_similarity(s1, s2, open_gap = 1, gap_ext = 0.1):
    affine = sm.affine.Affine(gap_start = 1, gap_continuation = 0.1, \
                       sim_func = lambda s1, s2: (int(1 if s1 == s2 else 0)))
    raw_score = affine.get_raw_score(s1,s2)
    max_len = max(len(s1), len(s2))

    # the affine similarity can return a negative value so we take the absolute value
    return abs(raw_score) / max_len

In [None]:
#. g match/mismatch
def year_match(year1, year2):
    if year1 == year2:
        return 1
    else:
        return 0

In [None]:
# h. formula to combine scores to get final score
def compare_records(rec1, rec2):
  # sum of the parameters should be 1
  w1 = 0.5
  w2 = 0.2
  w3 = 0.2
  w4 = 0.1

  st = levenshtein_similarity(rec1.title, rec2.title)
  sa = jaro_similarity(rec1.authors, rec2.authors)
  sc = scaled_affine_similarity(rec1.venue, rec2.venue)
  sy = year_match(rec1.year, rec2.year)

  rec_sim = w1 * sa + w2 * st + w3 * sc + w4 * sy
  return rec_sim

In [None]:
# i. Report the records with rec_sim > 0.7 as duplicate records by storing the ids of both records in a list.
def find_duplicate_ids(df1, df2, threshold = 0.7):
  duplicate_ids = pd.DataFrame(columns = ['idDBLP', 'idACM'])
  for i, rec1 in df1.iterrows():
    # tracking progression
    if(i % 100 == 0):
      print(i)
    for j, rec2 in df2.iterrows():
      if compare_records(rec1, rec2) > threshold:
        # if the similarity is above the threshold, we save the ids in the dataframe
        duplicate_ids.loc[len(duplicate_ids)] = [rec1.id, rec2.id]
  return duplicate_ids


In [31]:
from itertools import combinations
import time

matches = pd.read_csv("DBLP-ACM_perfectMapping.csv")
matches.idDBLP = matches.idDBLP.str.lower()

In [None]:
# j. Compute the precision of this method by counting the number of duplicate records that you discovered correctly.
# k. Record the running time of the method
start_time = time.time()

duplicate_ids = find_duplicate_ids(dblp, acm)
end_time = time.time()

elapsed_time = end_time - start_time
print('elapsed time', round(elapsed_time), "s")

# we find the elements that are both in the perfect match dataframe and our results
intersection = pd.merge(duplicate_ids, matches, on=['idDBLP', 'idACM'])

# calculating the proportion of ids that are in the perfect matches to all the ones we found
precision = len(intersection) / len(duplicate_ids)

duplicate_ids
precision

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
elapsed time 1951 s


0.8764822134387352

## Part 2

In [83]:
df_ACM = pd.read_csv("ACM.csv")
df_DB = pd.read_csv("DBLP2.csv", encoding="latin1")

In [84]:
# 1. Concatenate all columns per row into one string
df_DB["merged"] = df_DB.astype(str).agg(" ".join, axis=1)
df_ACM["merged"] = df_ACM.astype(str).agg(" ".join, axis =1)

In [85]:
# 2. Change all alphabetical characters into lowercase.
df_DB["merged"] = df_DB["merged"].str.lower()
df_ACM["merged"] = df_ACM["merged"].str.lower()

In [86]:
# 3. Convert multiple spaces to one.
df_DB["merged"] = df_DB["merged"].str.replace(r"\s+", " ", regex=True).str.strip()
df_ACM["merged"] = df_ACM["merged"].str.replace(r"\s+", " ", regex=True).str.strip()

In [87]:
# 4. Combine the records from both tables into one big list as we did during the lab.
list_ACM = df_ACM["merged"].tolist()
list_DB  = df_DB["merged"].tolist()

big_list = list_ACM + list_DB
print(f"Total length of big_list: {len(big_list)}")  # Should be 4910

Total length of big_list: 4910


In [92]:
# 5. Use the functions in the tutorials from lab 5 to compute the shingles, the minhash signature and the similarity.
def shingle(text, k: int) -> set:
    """Return set of k-shingles, robust to None/NaN/non-strings/short strings."""
    # Treat None/NaN as empty
    if text is None:
        return set()
    if isinstance(text, float):
        if text != text:  # NaN
            return set()
        text = str(text)
    elif not isinstance(text, str):
        text = str(text)

    if len(text) < k:
        return set()
    return { text[i:i+k] for i in range(len(text) - k + 1) }

def build_vocab(shingle_sets: list) -> dict:
    full_set = {sh for s in shingle_sets for sh in s}
    return {sh: i for i, sh in enumerate(full_set)}

def one_hot(shingles: set, vocab: dict):
    vec = np.zeros(len(vocab), dtype=int)
    for sh in shingles:
        vec[vocab[sh]] = 1
    return vec

def get_minhash_arr(num_hashes:int, vocab:dict):
    length = len(vocab)
    arr = np.zeros((num_hashes, length), dtype=int)
    for i in range(num_hashes):
        arr[i, :] = np.random.permutation(length) + 1
    return arr

def get_signature(minhash: np.ndarray, vector: np.ndarray):
    idx = np.nonzero(vector)[0]
    if idx.size == 0:
        # No shingles; return a signature that won't match others
        return np.full(minhash.shape[0], np.iinfo(np.int32).max, dtype=int)
    return np.min(minhash[:, idx], axis=1)

def jaccard_similarity(set1: set, set2: set) -> float:
    inter = len(set1 & set2)
    union = len(set1 | set2)
    return inter / union if union else 0.0

def compute_signature_similarity(sig1: np.ndarray, sig2: np.ndarray) -> float:
    if sig1.shape != sig2.shape:
        raise ValueError("Signature shapes must match.")
    return float(np.mean(sig1 == sig2))

# Shingling
k = 3
shingle_sets = [shingle(doc, k) for doc in big_list]

# Vocab & one-hot
vocab = build_vocab(shingle_sets)
if len(vocab) == 0:
    raise ValueError("Vocabulary is empty. Check that big_list has strings of length >= k.")

onehot = np.stack([one_hot(sset, vocab) for sset in shingle_sets])

# MinHash signatures
num_hashes = 100
minhash_arr = get_minhash_arr(num_hashes, vocab)
signatures = np.stack([get_signature(minhash_arr, vec) for vec in onehot])

In [None]:
#  similarities
N = len(big_list)
jac_mat = np.eye(N, dtype=float) # exact Jaccard similarity matrix
mh_mat  = np.eye(N, dtype=float)  # MinHash similarity matrix

for i in range(N):
    for j in range(i + 1, N):
        # Exact Jaccard on shingles
        s_jac = jaccard_similarity(shingle_sets[i], shingle_sets[j])
        jac_mat[i, j] = jac_mat[j, i] = s_jac

        # MinHash-based similarity (fraction of equal signature components)
        s_mh = compute_signature_similarity(signatures[i], signatures[j])

        mh_mat[i, j] = mh_mat[j, i] = s_mh

In [94]:
""" 6. Extract the top 2224 candidates from the LSH algorithm, compare them to the actual
    mappings in the file DBLP-ACM_perfectMapping.csv
    and compute the precision of the method.
    7. Record the running time of the method.
    8. Compare the precision and the running time in Parts 1 and 2."""

start_time = time.time()
# shingling again for time measurement
k = 3
shingle_sets = [shingle(doc, k) for doc in big_list]

# Vocab & one-hot
vocab = build_vocab(shingle_sets)
if len(vocab) == 0:
    raise ValueError("Vocabulary is empty. Check that big_list has strings of length >= k.")

onehot = np.stack([one_hot(sset, vocab) for sset in shingle_sets])

# MinHash signatures
num_hashes = 100
minhash_arr = get_minhash_arr(num_hashes, vocab)
signatures = np.stack([get_signature(minhash_arr, vec) for vec in onehot])

perfect_mapping_df = pd.read_csv("DBLP-ACM_perfectMapping.csv")
# the following code is taken from lab 5
class LSH:
    """
    Implements the Locality Sensitive Hashing (LSH) technique for approximate
    nearest neighbor search.
    """
    buckets = []
    counter = 0

    def __init__(self, b: int):
        """
        Initializes the LSH instance with a specified number of bands.

        Parameters:
        - b (int): The number of bands to divide the signature into.
        """
        self.b = b
        for i in range(b):
            self.buckets.append({})

    def make_subvecs(self, signature: np.ndarray) -> np.ndarray:
        """
        Divides a given signature into subvectors based on the number of bands.

        Parameters:
        - signature (np.ndarray): The MinHash signature to be divided.

        Returns:
        - np.ndarray: A stacked array where each row is a subvector of the signature.
        """
        l = len(signature)
        assert l % self.b == 0
        r = int(l / self.b)
        subvecs = []
        for i in range(0, l, r):
            subvecs.append(signature[i:i+r])
        return np.stack(subvecs)

    def add_hash(self, signature: np.ndarray):
        """
        Adds a signature to the appropriate LSH buckets based on its subvectors.

        Parameters:
        - signature (np.ndarray): The MinHash signature to be hashed and added.
        """
        subvecs = self.make_subvecs(signature).astype(str)
        for i, subvec in enumerate(subvecs):
            subvec = ','.join(subvec)
            if subvec not in self.buckets[i].keys():
                self.buckets[i][subvec] = []
            self.buckets[i][subvec].append(self.counter)
        self.counter += 1

    def check_candidates(self) -> set:
        """
        Identifies candidate pairs from the LSH buckets that could be potential near duplicates.

        Returns:
        - set: A set of tuple pairs representing the indices of candidate signatures.
        """
        candidates = []
        for bucket_band in self.buckets:
            keys = bucket_band.keys()
            for bucket in keys:
                hits = bucket_band[bucket]
                if len(hits) > 1:
                    candidates.extend(combinations(hits, 2))
        return set(candidates)

# 25 bands gives threshold around 0.7 similarity
n_buckets = 25
lsh = LSH(n_buckets)

for signature in signatures:
    lsh.add_hash(signature)

candidate_pairs = lsh.check_candidates()

# built a list that gives each dataset an index position
len_acm = len(df_ACM)
index_to_id = []
for i in range(len_acm):
    record_id = df_ACM["id"].iloc[i]
    index_to_id.append(("ACM", record_id))
for i in range(len(df_DB)):
    record_id = df_DB["id"].iloc[i]
    index_to_id.append(("DBLP", record_id))

# keep only the pairs where the two documents come from different sources
cross_source_pairs = [(i, j) for (i, j) in candidate_pairs
                      if index_to_id[i][0] != index_to_id[j][0]]

#  Score them using MinHash signature similarity
scored = []
for i, j in cross_source_pairs:
    sim = compute_signature_similarity(signatures[i], signatures[j])
    scored.append((sim, i, j))

#  Sort by similarity and take top 2224
scored.sort(reverse=True, key=lambda x: x[0])
top_n = scored[:2224]

#  create set of predicted pairs
pred_pairs = set()
for sim, i, j in top_n:
    id_acm = str(index_to_id[i][1])
    id_dblp = str(index_to_id[j][1])
    pred_pairs.add((id_dblp, id_acm))

true_pairs = set(zip(perfect_mapping_df["idDBLP"].astype(str),
                     perfect_mapping_df["idACM"].astype(str)))

#  Compute precision
tp = len(pred_pairs.intersection(true_pairs))
fp = len(pred_pairs) - tp
precision = tp / len(pred_pairs) if pred_pairs else 0.0


print("precision =", round(precision, 4))
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\n---")
print("running time:", round(elapsed_time, 2), "seconds")

precision = 0.9047

---
running time: 3.3 seconds


# Task 3

In [None]:
import kagglehub
import os
import shutil

In [None]:
# Downloading the Dataset
path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")

# get the current working directory, should be the same as the directory where the notebook is in
working_dir = os.getcwd()

# Copy the CSV directly into the working directory
shutil.copy(os.path.join(path, "diabetes.csv"),
            os.path.join(working_dir, "diabetes.csv"))

Using Colab cache for faster access to the 'pima-indians-diabetes-database' dataset.


'/content/diabetes.csv'

In [None]:
# Reading the dataframe
dia = pd.read_csv("diabetes.csv")
dia

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# 1. Compute the correlation between the different columns after removing the outcome column.
dia_wo_outcome = dia.drop(columns = ['Outcome'])
corr_before = dia_wo_outcome.corr()

In [None]:
# 2. Remove the disguised values from the table. We need to remove the values that equal to 0 from columns BloodPressure, SkinThickness and BMI as these are missing values but they have been replaced by the value 0. Remove the value but keep the record (i.e.) change the value to null.
dia.BloodPressure = dia.BloodPressure.replace(0, np.nan)
dia.SkinThickness = dia.SkinThickness.replace(0, np.nan)
dia.BMI = dia.BMI.replace(0, np.nan)

In [None]:
# 3. Fill the cells with null using the mean values of the records that have the same class label.
for c in dia.columns:
  dia[c] = dia.groupby('Outcome')[c].transform(lambda s: s.fillna(s.mean()))

# 4. Compute the correlation between the different columns.
corr_after = dia.drop(columns=['Outcome']).corr()

In [None]:
# show corr_before
corr_before

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0


In [None]:
# show corr_after
corr_after

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Pregnancies,1.0,0.129459,0.208935,0.094172,-0.073535,0.024127,-0.033523,0.544341
Glucose,0.129459,1.0,0.222417,0.220943,0.331357,0.219879,0.137337,0.263514
BloodPressure,0.208935,0.222417,1.0,0.203453,-0.048106,0.286518,-0.002264,0.324439
SkinThickness,0.094172,0.220943,0.203453,1.0,0.104017,0.565443,0.102426,0.135916
Insulin,-0.073535,0.331357,-0.048106,0.104017,1.0,0.185545,0.185071,-0.042163
BMI,0.024127,0.219879,0.286518,0.565443,0.185545,1.0,0.15253,0.027578
DiabetesPedigreeFunction,-0.033523,0.137337,-0.002264,0.102426,0.185071,0.15253,1.0,0.033561
Age,0.544341,0.263514,0.324439,0.135916,-0.042163,0.027578,0.033561,1.0


In [None]:
# Print & Compare
diff = (corr_after - corr_before).round(3)
print("Max abs change in correlation:", diff.abs().to_numpy().max())
print("\nTop differences (absolute):")
pairs = []
cols = diff.columns
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        delta = float(diff.iloc[i, j])
        pairs.append((cols[i], cols[j], delta, abs(delta)))
pairs.sort(key=lambda x: x[3], reverse=True)
for a,b,delta,ad in pairs[:10]:
    print(f"{a} ↔ {b}: Δ={delta:.6f} |Δ|={ad:.6f}")

Max abs change in correlation: 0.333

Top differences (absolute):
SkinThickness ↔ Insulin: Δ=-0.333000 |Δ|=0.333000
SkinThickness ↔ Age: Δ=0.250000 |Δ|=0.250000
Pregnancies ↔ SkinThickness: Δ=0.176000 |Δ|=0.176000
SkinThickness ↔ BMI: Δ=0.173000 |Δ|=0.173000
Glucose ↔ SkinThickness: Δ=0.164000 |Δ|=0.164000
BloodPressure ↔ Insulin: Δ=-0.137000 |Δ|=0.137000
BloodPressure ↔ Age: Δ=0.085000 |Δ|=0.085000
SkinThickness ↔ DiabetesPedigreeFunction: Δ=-0.082000 |Δ|=0.082000
Glucose ↔ BloodPressure: Δ=0.070000 |Δ|=0.070000
Pregnancies ↔ BloodPressure: Δ=0.068000 |Δ|=0.068000
