In [13]:
import pandas as pd
import csv

# Load dataset
file_path = r"C:\Users\Deshan\Documents\Github\Bird-Range-Prediction\Migration model\data\migration_data.csv"
df = pd.read_csv(file_path)

# Get unique LOCALITY values
unique_localities = df['LOCALITY'].dropna().unique()
unique_localities_sorted = sorted(unique_localities)

# Format and print with quotes and commas
formatted = ', '.join(f'"{loc}"' for loc in unique_localities_sorted)

# Print wrapped across lines (optional for readability)
print("\nFormatted Unique Localities:\n")
print(formatted)

with open("cleaned_unique_localities.csv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(unique_localities_sorted)


print("\n📝 Saved to 'cleaned_unique_localities.csv'")


Formatted Unique Localities:

"Agunukolapelessa Kepu Ela", "Airport Entrance Road Mattala Southern Province", "Airport Entrance Road Southern Province", "Alikatupalassa Yala National Park Block I Katagamuwa Entrance Southern Province", "Amaduwa Beach  Yala", "Amanwella Beach", "Amanwella New Land", "Ambilakala Lagoon Southern Province", "Ambilikala Kalapuwa Southern Province", "Ampitiya Lake Beliatta Southern Province", "Anantara Peace Haven Tangalle Resort", "Anantara Peace Haven Tangalle Resort Goyambokka", "Anantara Peace Heaven", "Anantara Peace Heaven Tangalle Resort Tangalle Southern Province", "Andarawewa Highway Exit Road", "Andarawewa Interchange Southern Expressway Southern Province", "Atulla Kalapuwa", "Atulla Lagoon", "Ayurvedic Hospital Gardens Badagiriya Hambantota  Southern Province", "B Kirinda Road Southern Province", "B Kirinda Southern Province", "B Kirinda Southern Province  Bundala National Park", "B New Road Hambantota", "B Tissamaharama Southern Province  yala N

In [4]:
# Step 1: Install required librar
# Step 2: Import libraries
import pandas as pd
import re
from rapidfuzz import process, fuzz

# Step 3: Load the dataset
file_path = r"C:\Users\Deshan\Documents\Github\Bird-Range-Prediction\Migration model\data\migration_data.csv"
df = pd.read_csv(file_path)

# Step 4: Normalize LOCALITY strings
def normalize(text):
    text = str(text).lower().strip()
    text = re.sub(r'\s+', ' ', text)                      # remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)                   # remove punctuation
    text = re.sub(r'\bsouthern province\b', '', text)     # remove common suffixes
    text = re.sub(r'\bsri lanka\b', '', text)
    text = re.sub(r'\broad\b', '', text)
    text = re.sub(r'\bentrance\b', '', text)
    return text.strip()

df['LOCALITY_CLEAN'] = df['LOCALITY'].apply(normalize)

# Step 5: Fuzzy matching to group similar localities
unique_values = df['LOCALITY_CLEAN'].dropna().unique()
mapped = {}

for val in unique_values:
    if val in mapped:
        continue
    matches = process.extract(val, unique_values, scorer=fuzz.token_sort_ratio, limit=None)
    for match_val, score, _ in matches:
        if score >= 90:
            mapped[match_val] = val  # Map all close matches to the base value

# Step 6: Map to grouped values
df['LOCALITY_GROUPED'] = df['LOCALITY_CLEAN'].map(mapped)

# Step 7: Get the final unique locality list
final_localities = sorted(df['LOCALITY_GROUPED'].dropna().unique())

# Step 8: Print unique localities
print("✅ Unique Cleaned Localities:")
for loc in final_localities:
    print(loc)

# Step 9: Optional – Save to CSV
pd.Series(final_localities).to_csv("cleaned_unique_localities.csv", index=False)
print("\n📝 Saved to 'cleaned_unique_localities.csv'")


✅ Unique Cleaned Localities:

agunukolapelessa kepu ela
airport
airport   mattala
alikatupalassa yala national park block i katagamuwa
amaduwa beach yala
amanwella beach
amanwella new land
ambilakala lagoon
ambilikala kalapuwa
ampitiya lake beliatta
anantara peace haven tangalle resort
anantara peace haven tangalle resort goyambokka
anantara peace heaven
anantara peace heaven tangalle resort tangalle
andarawewa highway exit
andarawewa interchange southern expressway
atulla kalapuwa
atulla lagoon
ayurvedic hospital gardens badagiriya hambantota
b kirinda
b new  hambantota
b tissamaharama  yala national park
b_bundala_np_unnamed
b_unnamed
back of beyond
back of beyond kahandamodera ranna
back of beyond kahandaramodara
badagiriya keliyawalana tanks
badagiriya tank
bandagiriya
bandagiriya keliyawalana wewa connecting canal
bandagiriya lake
bandu wewa bundala village
beauty of nature guesthouse kirinda
behind debarawara lake sandungama
beliatta railway station
beliatta walasmulla
beragama l

In [14]:
import pandas as pd
import re
from rapidfuzz import fuzz, process

# Load your CSV
file_path = r"C:\Users\Deshan\Documents\Github\Bird-Range-Prediction\Migration model\data\migration_data.csv"
df = pd.read_csv(file_path)

# Step 1: Normalize locality names
def normalize(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text.strip()

df['LOCALITY_CLEAN'] = df['LOCALITY'].apply(normalize)

# Step 2: Fuzzy cluster similar names
unique_names = df['LOCALITY_CLEAN'].dropna().unique().tolist()
representatives = []
manual_mappings = {}

for name in unique_names:
    # Skip if already mapped
    if name in manual_mappings:
        continue

    # Use the current name as the group representative
    representative = name
    representatives.append(representative)

    # Find similar names
    matches = process.extract(representative, unique_names, scorer=fuzz.token_sort_ratio, limit=None)
    for match_name, score, _ in matches:
        if score >= 90:  # high similarity threshold
            manual_mappings[match_name] = representative

# Step 3: Reverse mapping to get: {variant: "Proper Name"}
reverse_mapping = {}
for variant, rep in manual_mappings.items():
    original_variants = df[df['LOCALITY_CLEAN'] == variant]['LOCALITY'].values
    if len(original_variants) > 0:
        reverse_mapping[original_variants[0]] = df[df['LOCALITY_CLEAN'] == rep]['LOCALITY'].value_counts().idxmax()

# Step 4: Print the final manual_mappings
print("manual_mappings = {")
for k, v in sorted(reverse_mapping.items()):
    print(f'    "{k}": "{v}",')
print("}")


manual_mappings = {
    "Agunukolapelessa Kepu Ela": "Agunukolapelessa Kepu Ela",
    "Airport Entrance Road Mattala Southern Province": "Airport Entrance Road Southern Province",
    "Airport Entrance Road Southern Province": "Airport Entrance Road Southern Province",
    "Alikatupalassa Yala National Park Block I Katagamuwa Entrance Southern Province": "Alikatupalassa Yala National Park Block I Katagamuwa Entrance Southern Province",
    "Amaduwa Beach  Yala": "Amaduwa Beach  Yala",
    "Amanwella Beach": "Amanwella Beach",
    "Amanwella New Land": "Amanwella New Land",
    "Ambilakala Lagoon Southern Province": "Ambilakala Lagoon Southern Province",
    "Ambilikala Kalapuwa Southern Province": "Ambilikala Kalapuwa Southern Province",
    "Ampitiya Lake Beliatta Southern Province": "Ampitiya Lake Beliatta Southern Province",
    "Anantara Peace Haven Tangalle Resort": "Anantara Peace Haven Tangalle Resort",
    "Anantara Peace Haven Tangalle Resort Goyambokka": "Anantara Peace Haven