In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the previously recorded entries from CSV. The same file is rewritten at
# the end of this script with the new entry appended.
old_data = pd.read_csv('datamatch.csv')

# Define a function to calculate matching rate using cosine similarity
def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
    return '' if pd.isna(value) else str(value)


def calculate_matching_rate(new_entry, old_data, top_n=5):
    """Score how closely ``new_entry`` resembles the rows of ``old_data``.

    The 'Area', 'Acts', 'Crime' and 'Victim_ID' fields of every row (and of
    the new entry) are joined into one space-separated string, turned into
    TF-IDF vectors, and compared with cosine similarity.

    Args:
        new_entry: Mapping with at least the keys 'Area', 'Acts', 'Crime'
            and 'Victim_ID'.
        old_data: DataFrame with at least those same four columns.
        top_n: Number of best-scoring rows to keep (defaults to 5).

    Returns:
        A tuple ``(matching_rate, top_matches)``. ``matching_rate`` is the
        mean similarity of the kept rows as a percentage (0-100).
        ``top_matches`` is a list of ``(row_position, similarity)`` pairs,
        best first, where ``row_position`` is the positional index into
        ``old_data`` and ``similarity`` is in the range 0-1. When there is
        nothing to compare against, ``(0.0, [])`` is returned.
    """
    fields = ['Area', 'Acts', 'Crime', 'Victim_ID']

    # Nothing to compare against: return a neutral result instead of
    # failing inside the vectorizer or dividing by zero below.
    if len(old_data) == 0 or top_n <= 0:
        return 0.0, []

    # Build one text document per row. Every cell goes through _as_text so a
    # missing or non-string value cannot turn the whole document into NaN
    # (which TfidfVectorizer cannot process) or raise on concatenation.
    old_text = [
        ' '.join(_as_text(cell) for cell in row)
        for row in old_data[fields].itertuples(index=False, name=None)
    ]
    new_text = ' '.join(_as_text(new_entry[name]) for name in fields)

    # Fit on old rows and the new entry together so they share a vocabulary;
    # the new entry is the last row of the resulting matrix.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(old_text + [new_text])

    # Cosine similarity of the new entry against every old row
    similarity = cosine_similarity(vectors[-1], vectors[:-1])

    # Keep the best-scoring rows; sorted() is stable, so ties keep row order
    ranked = sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)
    top_matches = ranked[:top_n]

    # Overall rate is the mean score of the kept rows, as a percentage
    matching_rate = sum(score for _, score in top_matches) / len(top_matches)

    return matching_rate * 100, top_matches

# Manually provide new data
# NOTE(review): this timestamp is ISO-formatted, while the stored rows shown in
# the captured output look like 'DD-MM-YYYY HH:MM' - confirm the intended format.
new_entry = {
    'Timestamp': '2024-04-04',
    'Area': 'Mambalam.',
    'Acts': 'act379',
    'Crime': 'Robbery',
    'Victim_ID': 'Victim66'
}

# Score the new entry against the existing records BEFORE appending it, so it
# is never compared with itself. This call was missing, which left
# matching_rate and top_matches undefined for the prints below.
matching_rate, top_matches = calculate_matching_rate(new_entry, old_data)

# Convert the new entry to DataFrame
new_entry_df = pd.DataFrame([new_entry])

# Append the new entry after the old rows. Existing rows keep their positions,
# so the positional indices in top_matches remain valid for iloc below.
old_data = pd.concat([old_data, new_entry_df], ignore_index=True)

# Display the results
print("New Entry:")
print(new_entry)
print("Matching Rate: {:.2f}%".format(matching_rate))
print("Top 5 Matches:")
print()
for index, score in top_matches:
    print("Matching Rate: {:.2f}%".format(score * 100))
    print(old_data.iloc[index])
    print()

# Persist the data set, now including the new entry, back to the same file
old_data.to_csv('datamatch.csv', index=False)


New Entry:
{'Timestamp': '2024-04-04', 'Area': 'Mambalam.', 'Acts': 'act379', 'Crime': 'Robbery', 'Victim_ID': 'Victim66'}
Matching Rate: 67.69%
Top 5 Matches:

Matching Rate: 100.00%
Timestamp    28-02-2018 21:00
Area                Mambalam.
Acts                   act379
Crime                 Robbery
Victim_ID            Victim66
Name: 0, dtype: object

Matching Rate: 69.64%
Timestamp    28-02-2018 19:00
Area                 Perambur
Acts                   act379
Crime                 Robbery
Victim_ID            Victim66
Name: 19, dtype: object

Matching Rate: 65.13%
Timestamp    03-03-2018 15:30
Area                   Vepery
Acts                   act379
Crime                 Robbery
Victim_ID            Victim66
Name: 843, dtype: object

Matching Rate: 53.20%
Timestamp    07-03-2018 14:00
Area            Ethiraj Salai
Acts                   act379
Crime                 Robbery
Victim_ID            Victim66
Name: 929, dtype: object

Matching Rate: 50.50%
Timestamp    28-02-2018 01: