Part 1 "This code was made in google colab"

In [2]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [3]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.cluster import DBSCAN
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the embeddings model and the NLP model
model = SentenceTransformer('paraphrase-mpnet-base-v2')  # Pre-trained model of embeddings
nlp = spacy.load('en_core_web_sm')  # Model NLP

# Customized list of irrelevant words in company names
EXTRA_STOP_WORDS = {"inc", "corp", "co", "ltd", "group", "corporation", "limited", "company", "mall", "road", "street"}

# Function to clean up company names using spaCy and eliminate stop words
def clean_firm_name(name):
    doc = nlp(name.lower())
    cleaned_name = ' '.join(
        [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and token.lemma_ not in EXTRA_STOP_WORDS]
    )
    return cleaned_name

# Function to calculate embeddings of company names in batches
def get_embeddings(names):
    batch_size = 90  # Sets this value according to the available memory
    embeddings = []
    for i in range(0, len(names), batch_size):
        batch = names[i:i + batch_size]
        embeddings.extend(model.encode(batch, convert_to_tensor=True))
    return np.array(embeddings)


# Function for grouping similar names using DBSCAN
def group_similar_names_with_dbscan(embeddings, names, eps=0.3, min_samples=3):
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine').fit(embeddings)
    grouped_names = defaultdict(list)

    for idx, label in enumerate(clustering.labels_):
        grouped_names[label].append(names[idx])

    # Filtrar outliers (label -1 indica ruido)
    grouped_names = {k: v for k, v in grouped_names.items() if k != -1}
    return grouped_names

# Function for filtering names using cosine similarity after DBSCAN
def filter_similar_names(grouped_names, embeddings, names, threshold=0.8):
    refined_groups = defaultdict(list)

    for label, group_names in grouped_names.items():
        group_indices = [names.index(name) for name in group_names]
        name_vectors = [embeddings[idx] for idx in group_indices]
        similarity_matrix = cosine_similarity(name_vectors)

        for i, name1 in enumerate(group_names):
            for j, name2 in enumerate(group_names):
                if i != j and similarity_matrix[i][j] >= threshold:
                    refined_groups[label].append(name1)
                    refined_groups[label].append(name2)

        # Eliminate duplicates within each group and keep only names with sufficient similarity
        refined_groups[label] = list(set(refined_groups[label]))

    # Filter out groups that were left empty
    refined_groups = {k: v for k, v in refined_groups.items() if len(v) > 1}
    return refined_groups

# Function to process names in parallel and calculate embeddings
def parallel_process_firm_names(group, text_column, max_workers=None):
    if max_workers is None:
        max_workers = os.cpu_count() * 2

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tqdm.pandas(desc="Cleaning Firm Names")
        cleaned_names = list(executor.map(clean_firm_name, group[text_column].tolist()))

    # Get embeddings of the cleaned names in batches
    embeddings = get_embeddings(cleaned_names)

    # Grouping similar names using DBSCAN
    grouped_names = group_similar_names_with_dbscan(embeddings, cleaned_names, eps=0.7, min_samples=2)

    # Filtering names using cosine similarity to improve the quality of the groups
    refined_grouped_names = filter_similar_names(grouped_names, embeddings, cleaned_names, threshold=0.7)

    return refined_grouped_names, pd.DataFrame({text_column: cleaned_names, 'cleaned_name': cleaned_names})

# Function for assigning unique IDs and choosing the best name
def assign_unique_ids_and_best_names(grouped_names, original_group, text_column, country_name):
    cleaned_data = []

    for group, names in grouped_names.items():
        best_name = max(names, key=len)  # Choosing the longest name as the best name
        original_names = original_group[original_group['cleaned_name'].isin(names)][text_column].tolist()

        cleaned_data.append({
            'cleaned_ID': id(best_name),
            'cleaned_name': best_name,
            'original_firm_names': original_names,
            'country': country_name
        })

    return cleaned_data



# Main function to clean, group and assign IDs by group with parallelization by country using ThreadPoolExecutor
def clean_and_group_by_country(df, text_column, country_column, max_workers=None):
    all_cleaned_data = []

    # Group by country
    grouped_df = df.groupby(country_column)

    # Create a progress bar with the total number of countries to process
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                parallel_process_firm_names, group, text_column, max_workers
            ): (country, group) for country, group in grouped_df
        }

        # Configure tqdm manually
        with tqdm(total=len(futures), desc="Processing Countries") as pbar:
            for future in as_completed(futures):
                country, group = futures[future]
                try:
                    grouped_names, original_group = future.result()
                    cleaned_data = assign_unique_ids_and_best_names(grouped_names, original_group, text_column, country)
                    all_cleaned_data.extend(cleaned_data)
                except Exception as e:
                    print(f"Error processing country {country}: {e}")

                # Update the progress bar
                pbar.update(1)

    # Create a new DataFrame with the results.
    cleaned_df = pd.DataFrame(all_cleaned_data)

    return cleaned_df


df = pd.read_csv('ForeignNames_2019_2020.csv')
df['cleaned_ID'] = df['foreign']
df['cleaned_ID'] = df['cleaned_ID'].str.lower()
df['foreign'] = df['foreign'].str.lower()
df = df.dropna()
df= df.sample(n=60000)
df= df.groupby(['cleaned_ID','foreign', 'foreigncountry_cleaned']).count().reset_index()
processed_df = clean_and_group_by_country(df,'foreign', 'foreigncountry_cleaned', max_workers=40)

# Explode
df_explotado = processed_df.explode('original_firm_names')

# Upload Country_Name_ISO3.csv
iso_df = pd.read_csv('Country_Name_ISO3.csv')

# Ensure that the columns have the same data type.
df_explotado['country'] = df_explotado['country'].astype(str)
iso_df['country_name'] = iso_df['country_name'].astype(str)

# Perform the merge between the two databases using the corresponding columns
df_1 = df_explotado.merge(iso_df[['country_name', 'country_iso3']],
                     how='left',
                     left_on='country',
                     right_on='country_name')

# Verify whether the country “Iran” is in the base
#country = "Iran"

# Check if the country is present in the column 'country_name'.
#if country in iso_df['country_name'].values:
#    print(f"{country} in the data iso_df.")
#else:
#   print(f"{country} no no in the data iso_df.")

# Suppose you already have the DataFrame 'df_1' with the column 'cleaned_name'.
# Divide the data into 70% training and 30% testing
train_df, test_df = train_test_split(df_1, test_size=0.3, random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

# Make sure the line is complete and the quotation marks are properly closed.
#train_df.to_excel('train_df_output.xlsx', index=False, engine='openpyxl')

# Vectorizing the original names to create a numerical representation
vectorizer = TfidfVectorizer()

# Ensure that all values in 'original_firm_names' are text strings
train_df['original_firm_names'] = train_df['original_firm_names'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Adjust the vectorizer and transform the company names in the training set
X_train = vectorizer.fit_transform(train_df['original_firm_names'])
y_train = train_df['cleaned_name']

# Training a RandomForest model to predict clean names
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Ensure that all values in 'original_firm_names' are text strings
test_df['original_firm_names'] = test_df['original_firm_names'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Transform the company names in the test set
X_test = vectorizer.transform(test_df['original_firm_names'])
y_test = test_df['cleaned_name']

# Predict clean names in the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the model after applying the algorithm
accuracy_after = accuracy_score(y_test, y_pred)

# Evaluate the accuracy before (assuming the original names were not good)
accuracy_before = accuracy_score(y_test, test_df['original_firm_names'])

print(f"Accuracy before applying the model: {accuracy_before:.2f}")
print(f"Accuracy after applying the model: {accuracy_after:.2f}")

# Compare and comment on whether accuracy has improved.
if accuracy_after > accuracy_before:
    print("Precision improved after applying the machine learning model.")
else:
    print("Precision improved after applying the machine learning model.")

# Add predictions to the test DataFrame
test_df['predicted_cleaned_name'] = y_pred

# Export the combined DataFrame to a CSV file
df_1.to_csv('outputfile_Maria_1.csv', index=False)

#Filter the rows where the original name is different from the clean name
df_changed = df_1[df_1['original_firm_names'] != df_1['cleaned_name']]

# Select the required columns
# This includes the original company name and the clean name it was changed to
df_changed = df_changed[['original_firm_names', 'cleaned_name']]

# Export the resulting DataFrame to a CSV file with the specified name
df_changed.to_csv('outputfile_Maria_1_changed.csv', index=False)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing Countries: 100%|██████████| 211/211 [56:34<00:00, 16.09s/it] 


Training set size: 18037
Test set size: 7731
Accuracy before applying the model: 0.00
Accuracy after applying the model: 0.76
Precision improved after applying the machine learning model.
