In [1]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.9.3


In [2]:
import numpy as np
import pandas as pd
import emoji
import time
import os
import string
import re
from rapidfuzz import process, fuzz
from collections import defaultdict
from joblib import Parallel, delayed
from functools import lru_cache

from IPython.display import display, HTML
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/satria-data-semifinal/encrypted_merge.csv
/kaggle/input/indonesia-city-list-by-province-453-cities/city_list_indonesia.csv


# Helper Functions

In [3]:
# Function to create a scrollable table
def create_scrollable_table(df, title):
    """Render ``df`` as an HTML table inside a fixed-height, scrollable ``<div>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render. It is not modified; a formatted copy is rendered.
    title : str
        Heading placed above the table. It is inserted into the markup as-is.

    Returns
    -------
    str
        HTML snippet (heading + table) that can be passed to
        ``IPython.display.HTML``.
    """
    def format_element(x):
        # Missing values are shown as empty cells instead of "NaN"/"None".
        if pd.isna(x):
            return ""
        # bool is a subclass of int; without the extra check True/False would
        # be rendered as "1.0000"/"0.0000".
        if isinstance(x, (int, float)) and not isinstance(x, bool):
            return f"{x:.4f}"  # Keep the precision for numerical values
        return x

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
    # fall back to applymap on older pandas where DataFrame.map does not exist.
    if hasattr(pd.DataFrame, "map"):
        formatted_df = df.map(format_element)
    else:
        formatted_df = df.applymap(format_element)

    # Cells may hold free-form user text, so let pandas HTML-escape them rather
    # than injecting raw markup into the notebook output.
    df_html = formatted_df.to_html(classes='table table-striped', escape=True, index=False)
    html = f"""
    <div style='height:400px; overflow:auto;'>
        <h2>{title}</h2>
        {df_html}
    </div>
    """
    return html

# Data Loading, Simple Summary

In [4]:
# Source files: the tweet dump to clean and the reference list of Indonesian cities.
tweets_csv_path = "/kaggle/input/satria-data-semifinal/encrypted_merge.csv"
cities_csv_path = '/kaggle/input/indonesia-city-list-by-province-453-cities/city_list_indonesia.csv'

df = pd.read_csv(tweets_csv_path)
cities_df = pd.read_csv(cities_csv_path)

In [5]:
# Replacement table for location text: informal abbreviations, nicknames and
# English direction words mapped to the spelling used for matching.
# An empty-string value means "drop this token".
abbreviations = {
    # Country-level aliases
    '62': 'indonesia',
    "ina": 'indonesia',
    'id': "indonesia",
    "konoha": "indonesia",

    # English -> Indonesian direction / island words
    "west": "barat",
    "east": "timur",
    "north": "utara",
    "south": "selatan",
    'central': "tengah",
    "java": "jawa",

    # Province abbreviations
    "jatim": "jawa timur",
    "jateng": "jawa tengah",
    "sumsel": "sumatera selatan",
    "sumbar": "sumatera barat",
    "kalsel": "kalimantan selatan",
    "ntt": "nusa tenggara timur",
    'diy': "yogyakarta",

    # City abbreviations and nicknames
    'sby': "surabaya",
    # Fixed spelling: the city is "Tangerang Selatan" (was "tanggerang selatan").
    'tangsel': "tangerang selatan",
    "solo": "surakarta",
    "bansel": "bandung selatan",
    'jogja': "yogyakarta",

    # Jakarta variants
    'dki jakarta': 'jakarta',
    'dki': '',
    "jkt": "jakarta",
    'jktina': "jakarta",
    "batavia": "jakarta",
    'jakarta': "jakarta",
    'jaksel': "kota administrasi jakarta selatan",
    'jakbar': "kota administrasi jakarta barat",
    'jakarta timur': "kota administrasi jakarta timur",
    'jakarta barat': "kota administrasi jakarta barat",
    'jakarta utara': "kota administrasi jakarta utara",
    'jakarta selatan': "kota administrasi jakarta selatan",

    # Joke locations, replaced by a marker word
    "sunda empire": "noisy",
}

# Preprocessing

In [6]:
# Precompile the geocode regex.
# It accepts text containing "<number>,<number>" where the first number is
# latitude-like (0-90) and the second longitude-like (0-180), each optionally
# signed. Because of the leading `.*` (used with `match`) anything may precede
# the pair.
# NOTE(review): this also classifies text such as "blok 5, 10 jakarta" as a
# geocode - confirm that this is acceptable for the data.
geocode_pattern = re.compile(r'.*[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?),\s*[-+]?(180(\.0+)?|(1[0-7]\d)|([1-9]?\d))(\.\d+)?')

# Deletes every ASCII punctuation character except the comma (commas become
# separators later). Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation.replace(',', ''))


# Preprocessing function
def clean_location(location, max_phrase_words=3):
    """Normalise one raw location value into a lower-case, space-separated string.

    Steps: missing check -> geocode check -> lower-case, strip punctuation,
    treat commas as separators -> expand entries of ``abbreviations``.

    Parameters
    ----------
    location : object
        Raw value. ``None``/NaN yields ``"Missing"``; other non-string values
        are converted with ``str`` instead of raising ``TypeError``.
    max_phrase_words : int, default 3
        Longest run of consecutive words that is looked up as one key in
        ``abbreviations``. This makes multi-word keys such as
        ``'jakarta timur'`` reachable; with a plain word-by-word lookup they
        can never match.

    Returns
    -------
    str
        ``"Missing"``, ``"Geocode"``, or the cleaned text (possibly empty).
    """
    # pd.isna already covers None, so no separate `is None` test is needed.
    if pd.isna(location):
        return "Missing"
    if not isinstance(location, str):
        location = str(location)

    # Geocode detection (done on the raw text, before punctuation is removed)
    if geocode_pattern.match(location):
        return "Geocode"

    # split() without arguments already collapses whitespace runs and trims.
    words = location.lower().translate(_PUNCTUATION_TABLE).replace(',', ' ').split()

    cleaned_words = []
    position = 0
    total = len(words)
    while position < total:
        # Try the longest phrase starting here first, then shorter ones.
        for size in range(min(max_phrase_words, total - position), 0, -1):
            phrase = ' '.join(words[position:position + size])
            if phrase in abbreviations:
                replacement = abbreviations[phrase]
                break
        else:
            size = 1
            replacement = words[position]
        # An empty replacement (e.g. 'dki' -> '') means "drop the token";
        # skipping it avoids stray leading/double spaces in the result.
        if replacement:
            cleaned_words.append(replacement)
        position += size
    return ' '.join(cleaned_words)

# Lookup tables keyed by lower-cased name; the values are the display strings
# ("City, Province, Nation" and "Province, Nation") built from cities_df.
city_dict = {
    city.lower(): f"{city}, {province}, {nation}"
    for city, province, nation in zip(cities_df['City'], cities_df['Province'], cities_df['Nation'])
}
province_dict = {
    province.lower(): f"{province}, {nation}"
    for province, nation in zip(cities_df['Province'], cities_df['Nation'])
}

# Union of both tables; when a key exists in both, the province entry wins.
combined_mappings = dict(city_dict)
combined_mappings.update(province_dict)

In [7]:
# Normalization function with hierarchical inference and fuzzy matching
@lru_cache(maxsize=10000)
def normalize_location(location, city_dict=city_dict, province_dict=province_dict):
    """Map a cleaned location string to a canonical display string.

    Resolution order (first hit wins):

    1. ``"Missing"`` / ``"Geocode"`` sentinels are returned unchanged.
    2. The whole string is an exact key of ``city_dict``, then of
       ``province_dict``. This keeps multi-word names from being resolved by
       only one of their words, or left to the fuzzy fallback.
    3. Any single word is a key of ``city_dict``; otherwise any single word is
       a key of ``province_dict``.
    4. Fuzzy match (``token_set_ratio``) against city keys, score > 80.
    5. Fuzzy match against province keys, score > 60.
    6. ``"Indonesia"`` if the word ``indonesia`` is present, else
       ``'Unknown Location'``.

    Parameters
    ----------
    location : str
        Cleaned, lower-case location text.
    city_dict, province_dict : dict
        Lower-cased name -> display string. Bound at definition time.
        Because results are memoised with ``lru_cache``, every argument must
        be hashable, so these can in practice only be left at their defaults.

    Returns
    -------
    str
        The sentinel, a display string from one of the dictionaries,
        ``"Indonesia"`` or ``'Unknown Location'``.
    """
    if location == "Missing" or location == "Geocode":
        return location

    # Whole-string exact match takes precedence over word-level inference.
    if location in city_dict:
        return city_dict[location]
    if location in province_dict:
        return province_dict[location]

    words = location.split()

    # Word inference: the first word that names a city wins ...
    for word in words:
        if word in city_dict:
            return city_dict[word]

    # ... and only if no word names a city, the first word naming a province.
    for word in words:
        if word in province_dict:
            return province_dict[word]

    # Fuzzy Inference in case manual Inference failed
    best_match = process.extractOne(location, list(city_dict), scorer=fuzz.token_set_ratio)
    if best_match and best_match[1] > 80:  # Adjust the threshold as needed
        return city_dict[best_match[0]]

    # In case city inference failed
    best_match = process.extractOne(location, list(province_dict), scorer=fuzz.token_set_ratio)
    if best_match and best_match[1] > 60:  # Adjust the threshold as needed
        return province_dict[best_match[0]]

    if 'indonesia' in words:
        return "Indonesia"
    return 'Unknown Location'


In [8]:
# DEBUG purpose: run both pipeline stages on a fixed random sample and report
# how long each one takes.
sample_size = 6000
sample_df = df.sample(n=sample_size, random_state=1).copy()

# Stage 1 - preprocessing speed
start_preprocessing = time.time()
sample_df['cleaned_loc'] = sample_df['loc'].apply(clean_location)
end_preprocessing = time.time()
preprocessing_time = end_preprocessing - start_preprocessing

# Stage 2 - inference speed
start_normalization = time.time()
sample_df['standardized_loc'] = sample_df['cleaned_loc'].apply(normalize_location)
end_normalization = time.time()
normalization_time = end_normalization - start_normalization

# Combined cost of both stages
total_time = preprocessing_time + normalization_time

print(
    f"Time taken for preprocessing {sample_size} samples: {preprocessing_time:.2f} seconds",
    f"Preprocessing speed: {sample_size / preprocessing_time:.2f} rows/second",
    f"Time taken for inference {sample_size} samples: {normalization_time:.2f} seconds",
    f"Inference speed: {sample_size / normalization_time:.2f} rows/second",
    f"Total time taken for cleaning and normalizing {sample_size} samples: {total_time:.2f} seconds",
    f"Total cleaning speed: {sample_size / total_time:.2f} rows/second",
    sep="\n",
)

Time taken for preprocessing 6000 samples: 0.03 seconds
Preprocessing speed: 204014.69 rows/second
Time taken for inference 6000 samples: 0.29 seconds
Inference speed: 21014.55 rows/second
Total time taken for cleaning and normalizing 6000 samples: 0.31 seconds
Total cleaning speed: 19052.09 rows/second


In [9]:
# Show the raw, cleaned and standardized sample values side by side.
debug_columns = ['loc', 'cleaned_loc', 'standardized_loc']
debug_html = create_scrollable_table(sample_df[debug_columns], 'Debug')
display(HTML(debug_html))

  formatted_df = df.applymap(format_element)


loc,cleaned_loc,standardized_loc
,Missing,Missing
,Missing,Missing
Singapore,singapore,Unknown Location
,Missing,Missing
,Missing,Missing
,Missing,Missing
,Missing,Missing
,Missing,Missing
unknown,unknown,Unknown Location
Riyadh,riyadh,Unknown Location


In [10]:
# Main Data Cleaning for 'loc'
# The column is overwritten in place, so running this cell a second time feeds
# the already-cleaned values through clean_location again.
start_preprocessing = time.time()
df['loc'] = df['loc'].apply(clean_location)
end_preprocessing = time.time()

# Speed Metric Calculation
preprocessing_time = end_preprocessing - start_preprocessing

print(
    f"Time taken for preprocessing {df.shape[0]} samples: {preprocessing_time:.2f} seconds",
    f"Preprocessing speed: {df.shape[0] / preprocessing_time:.2f} rows/second",
    sep="\n",
)

Time taken for preprocessing 9817355 samples: 44.05 seconds
Preprocessing speed: 222872.60 rows/second


In [11]:
# Main inference step for 'loc'.
# The column is overwritten in place, so running this cell a second time feeds
# the already-normalized values through normalize_location again.
start_normalization = time.time()
df['loc'] = df['loc'].apply(normalize_location)
end_normalization = time.time()

# Speed Metric Calculation (preprocessing_time comes from the cleaning cell)
normalization_time = end_normalization - start_normalization
total_time = preprocessing_time + normalization_time

print(
    f"Time taken for inference {df.shape[0]} samples: {normalization_time:.2f} seconds",
    f"Inference speed: {df.shape[0] / normalization_time:.2f} rows/second",
    f"Total time taken for cleaning and normalizing {df.shape[0]} samples: {total_time:.2f} seconds",
    f"Total cleaning speed: {df.shape[0] / total_time:.2f} rows/second",
    sep="\n",
)

Time taken for inference 9817355 samples: 85.59 seconds
Inference speed: 114696.24 rows/second
Total time taken for cleaning and normalizing 9817355 samples: 129.64 seconds
Total cleaning speed: 75725.74 rows/second


In [12]:
# Preview the first rows of the frame after 'loc' has been normalized.
df.head(n=5)

Unnamed: 0,created_at,username,tcode,num_retweets,type,frn_cnt,flw_cnt,sts_cnt,loc,lst_cnt,content,lang
0,2024-01-04T09:57:09Z,@QOS7XYPBfXZWFeLSmdLEt8njUMwwr2Fpel3Cqvh2gW4=,rt,1248,twit,266.0,107.0,9687.0,Missing,0.0,RT K-Popers Berencana Kirim Food Truck untuk A...,id
1,2024-01-04T09:57:09Z,@lSDenDKpcZVnv9txjBcg5qaqxYgVAq/3gTvA8yxPuL4=,rt,195,twit,564.0,303.0,12461.0,Indonesia,2.0,RT Bapak Pendeta Yusak ini Dari Magetan ke Pon...,id
2,2024-01-04T09:57:10Z,@Ykjdr3xs5+WfH9zBQMoAx5fdTeAwmRRm28PFVw5JeJE=,rt,116,twit,376.0,156.0,7488.0,Indonesia,1.0,"RT Viral , Gimana Mak Mak di jakarta tidak ter...",id
3,2024-01-04T09:57:10Z,@KQ/OmqgBG/U/OVkvpqAQYYiAThFGxQBtg3J5Vjp4Glk=,rt,2264,twit,163.0,203.0,2065.0,Missing,0.0,"RT Mendengar Pak Anies disini, rasanya saya su...",id
4,2024-01-04T09:57:11Z,@yqECLRUCZgqx8VzEUl430Wj6mfh2SgDYzKwala0bT5o=,rt,1157,twit,1.0,1.0,798.0,Missing,0.0,RT Media Asing Soroti Cara Anies Gaet Pemilih ...,id


In [13]:
# Persist the cleaned frame. index=False keeps the RangeIndex out of the file;
# the frame already carries an "Unnamed: 0" column from an earlier round trip,
# and writing the index again would add yet another unnamed column.
df.to_csv("cleaned_loc_df.csv", index=False)