This script extracts all satellite names from each country database (obtained from Space Explorer) and compares them with the satellite names obtained from the administration letters.
It produces two lists: found and not found (in the SE database). It uses a soft-match (fuzzy) string search and reports the best match and its score for each name.

In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
pd.set_option('display.max_rows', None)

In [2]:
countries_folder = './countriestables'

# Load every per-country table in the folder and stack them into one DataFrame.
# Column names are kept exactly as they appear in the CSV headers.
dataframes = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined table (and anything derived from "first occurrence" in it)
# reproducible between runs and machines.
for filename in sorted(os.listdir(countries_folder)):
    if filename.endswith('.csv'):  # Assuming the tables are in CSV format
        filepath = os.path.join(countries_folder, filename)
        df = pd.read_csv(filepath)
        dataframes.append(df)

# pd.concat() on an empty list fails with "No objects to concatenate"; report
# the actual cause (wrong or empty folder) instead.
if not dataframes:
    raise ValueError(f"No .csv files found in {countries_folder!r}")

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

# Show the first rows and the column names as a quick sanity check
display(combined_df.head())
print(combined_df.columns)

  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)


Unnamed: 0,com_el.ntc_id,com_el.tgt_ntc_id,com_el.adm,com_el.ntwk_org,com_el.sat_name,com_el.long_nom,com_el.prov,com_el.d_rcv,com_el.st_cur,orbit.orb_id,...,grp.freq_max,grp.bdwdth,grp.d_inuse,grp.d_reg_limit,grp.d_prot_eff,grp.f_biu,emiss.seq_no,emiss.pwr_ds_max,emiss.design_emi,carrier_fr.freq_carr
0,117545401,,AFS,,ZACUBE-2,,9.1/IA,15.12.2017,50,1.0,...,402.0,,,15.12.2024,,,1.0,-37.0,9K50F1DBN,401.5
1,120545101,,AFS,,MDASAT-1,,9.1/IA,13.05.2020,50,1.0,...,402.0,,,13.05.2027,,,1.0,-37.0,9K50F1DBN,401.0
2,121545063,,AFS,,EOS AGRISAT-1,,9.1/IA,28.03.2021,50,1.0,...,2110.0,,,28.03.2028,,,1.0,-46.8,1M20G1DBN,2030.0
3,117545401,,AFS,,ZACUBE-2,,9.1/IA,15.12.2017,50,1.0,...,2290.0,,,15.12.2024,,,1.0,-65.0,6M40G2DDN,2225.0
4,121545063,,AFS,,EOS AGRISAT-1,,9.1/IA,28.03.2021,50,1.0,...,2290.0,,,28.03.2028,,,1.0,-59.1,680KG1DDN,2235.0


Index(['com_el.ntc_id', ' com_el.tgt_ntc_id', ' com_el.adm',
       ' com_el.ntwk_org', ' com_el.sat_name', ' com_el.long_nom',
       ' com_el.prov', ' com_el.d_rcv', ' com_el.st_cur', ' orbit.orb_id',
       ' orbit.nbr_sat_pl', ' orbit.apog_km', ' orbit.perig_km',
       ' orbit.op_ht_km', ' s_beam.emi_rcp', ' s_beam.beam_name',
       ' grp.grp_id', ' grp.freq_min', ' grp.freq_max', ' grp.bdwdth',
       ' grp.d_inuse', ' grp.d_reg_limit', ' grp.d_prot_eff', ' grp.f_biu',
       ' emiss.seq_no', ' emiss.pwr_ds_max', ' emiss.design_emi',
       ' carrier_fr.freq_carr'],
      dtype='object')


In [3]:

# Keep one row per distinct satellite name (the first occurrence in
# combined_df) together with the administration recorded on that row.
# The column labels carry a leading space, exactly as in combined_df.
sat_name_col = ' com_el.sat_name'
adm_col = ' com_el.adm'

first_occurrences = combined_df.drop_duplicates(subset=[sat_name_col])
unique_names_with_adm = first_occurrences[[sat_name_col, adm_col]]

# Parallel plain-Python lists: database satellite names and their administrations
adm_names_db = unique_names_with_adm[adm_col].tolist()
sat_names_db = unique_names_with_adm[sat_name_col].tolist()

# Show the full table without row truncation
pd.set_option('display.max_rows', None)
display(unique_names_with_adm)


Unnamed: 0,com_el.sat_name,com_el.adm
0,ZACUBE-2,AFS
1,MDASAT-1,AFS
2,EOS AGRISAT-1,AFS
11,SHAHEEN SAT,ARS
12,SAUDISAT-1C,ARS
13,SAUDISAT-6,ARS
21,SAUDISAT-5,ARS
33,SPACETOWER-1,ARS
11578,ADF 95E IOR,AUS
11579,ADF 88E IOR,AUS


Get the reference satellite names from the `satellitenames` folder.

In [4]:
names_folder = './satellitenames'

# Build two parallel lists from the .txt files in names_folder:
#   names     - every satellite name listed in the files
#   countries - for each name, the file it came from, without the ".txt" extension
names = []
countries = []

# Sorted so the output order does not depend on the arbitrary order in which
# os.listdir() returns directory entries.
for filename in sorted(os.listdir(names_folder)):
    if not filename.endswith('.txt'):
        continue

    filepath = os.path.join(names_folder, filename)
    with open(filepath, 'r') as file:
        content = file.read()

    # Split on the bare comma and strip each piece, so "A, B", "A,B" and
    # "A ,  B" all give the same names. Empty pieces (blank file, trailing
    # comma) are dropped rather than recorded as satellite names.
    file_names = [piece.strip() for piece in content.split(',')]
    file_names = [piece for piece in file_names if piece]

    # splitext() removes only the final extension, so a file stem that itself
    # contains a dot is kept whole.
    country = os.path.splitext(filename)[0]

    names.extend(file_names)
    countries.extend([country] * len(file_names))


In [5]:
# Fuzzy-match every reference satellite name against the database names and
# export the outcome as a CSV file.
#
# Expects from earlier cells:
#   names, countries - parallel lists of reference names and their administrations
#   sat_names_db     - list of satellite names taken from the database tables

# Minimum fuzzywuzzy score for the best candidate to count as a match.
MATCH_THRESHOLD = 95  # You can adjust the threshold as needed
# Label written to the 'best_match' column when no candidate is good enough.
NOT_FOUND = 'not found'

best_match_names = []
scores = []

for name in names:
    # extractOne() yields None when it has no candidate to offer, which would
    # break the tuple unpacking; an empty database list is short-circuited too.
    match = process.extractOne(name, sat_names_db) if sat_names_db else None
    if match is None:
        best_match, score = NOT_FOUND, 0
    else:
        best_match, score = match

    # A candidate below the threshold is reported as not found, but its score
    # is still recorded so near-misses can be reviewed in the CSV.
    if score < MATCH_THRESHOLD:
        best_match = NOT_FOUND

    best_match_names.append(best_match)
    scores.append(score)

# One row per reference name: administration, sat_name, best_match, score
result_df = pd.DataFrame({
    'administration': countries,
    'sat_name': names,
    'best_match': best_match_names,
    'score': scores
})

# Export the DataFrame as a CSV file
result_df.to_csv('satellite_matches.csv', index=False)

print("The DataFrame has been exported as 'satellite_matches.csv'.")

The DataFrame has been exported as 'satellite_matches.csv'.
