In [None]:
# Minimum similarity score for a fuzzy hit to be recorded as a potential match.
threshold = 70  # You can adjust this value based on your preference

# Columns every potential-match record carries, in display order.
match_columns = ['Country Name (Main)', 'Country Name (GDP)', 'Country Name (Education)']

# Candidate names to score against. NaN entries are removed once, up front,
# so only real names are handed to the string scorer (the main column is
# guarded against NaN inside the loop for the same reason).
gdp_choices = comparison_df['Country Name (GDP)'].dropna()
edu_choices = comparison_df['Country Name (Education)'].dropna()

# One record per main-dataset country with at least one hit at or above the threshold.
potential_matches = []

for main_country in comparison_df['Country Name (Main)']:
    # Skip NaN entries in the main column
    if pd.isna(main_country):
        continue

    # Each hit is indexed as hit[0] = candidate name, hit[1] = similarity score
    gdp_scores = process.extract(main_country, gdp_choices, scorer=fuzz.token_sort_ratio)
    edu_scores = process.extract(main_country, edu_choices, scorer=fuzz.token_sort_ratio)

    # Take the name and the score from the SAME hit, so the result does not
    # depend on the ordering of the returned list; `default=None` covers an
    # empty candidate list.
    best_gdp = max(gdp_scores, key=lambda hit: hit[1], default=None)
    best_edu = max(edu_scores, key=lambda hit: hit[1], default=None)

    matched_gdp = best_gdp[0] if best_gdp is not None and best_gdp[1] >= threshold else None
    matched_edu = best_edu[0] if best_edu is not None and best_edu[1] >= threshold else None

    # Nothing reached the threshold in either dataset: not a potential match
    if matched_gdp is None and matched_edu is None:
        continue

    potential_matches.append({'Country Name (Main)': main_country,
                              'Country Name (GDP)': matched_gdp,
                              'Country Name (Education)': matched_edu})

# Convert the list of potential matches to a DataFrame. The explicit column
# list keeps the frame well-formed even when no match was found.
potential_matches_df = pd.DataFrame(potential_matches, columns=match_columns)

# Keep only rows where the GDP or the Education name differs from the main name
# (a missing match counts as "different").
non_perfect_matches_df = potential_matches_df[(potential_matches_df['Country Name (GDP)'] != potential_matches_df['Country Name (Main)'])
                                              | (potential_matches_df['Country Name (Education)'] != potential_matches_df['Country Name (Main)'])]


# Convert perfect_matches list to DataFrame
perfect_matches_df = pd.DataFrame(perfect_matches)

# Keep the perfect matches from before (any duplicates this creates are removed below)
tinder_df = pd.concat([tinder_df, perfect_matches_df], ignore_index=True)

# Hand-corrected matches for names the exact comparison could not pair up.
# Keys: "Main" = name in the main dataset, "GDP" / "Education" = name to use
# in the respective dataset.
corrected_matches = [
    {"Main": "Bolivia", "GDP": "Bolivia, Plurinational State of", "Education": "Bolivia"},
    # NOTE(review): Brunei is mapped to Burundi in both datasets — looks like a
    # fuzzy-match artifact rather than the same country; confirm.
    {"Main": "Brunei", "GDP": "Burundi", "Education": "Burundi"},
    {"Main": "Czech Republic", "GDP": "Czechia", "Education": "Czech Republic"},
    {"Main": "Czechoslovakia", "GDP": "Czechia", "Education": "Czech Republic"},
    {"Main": "DR Congo (Zaire)", "GDP": "Congo", "Education": "Congo, Dem. Rep."},
    {"Main": "Egypt", "GDP": "Egypt", "Education": "Egypt, Arab Rep."},
    {"Main": "Gambia", "GDP": "Gambia", "Education": "Gambia, The"},
    {"Main": "German Democratic Republic", "GDP": "Germany", "Education": "Germany"},
    {"Main": "Iran", "GDP": "Iran, Islamic Republic of", "Education": "Iran, Islamic Rep."},
    {"Main": "Kyrgyzstan", "GDP": "Kyrgyzstan", "Education": "Kyrgyz Republic"},
    {"Main": "Moldova", "GDP": "Moldova, Republic of", "Education": "Moldova"},
    {"Main": "North Korea", "GDP": "Korea, Democratic People's Republic of", "Education": "Korea, Dem. People’s Rep."},
    {"Main": "North Macedonia", "GDP": "North Macedonia", "Education": "Macedonia, FYR"},
    {"Main": "Saint Vincent and the Grenadines", "GDP": "Saint Vincent and the Grenadines", "Education": "St. Vincent and the Grenadines"},
    {"Main": "Slovakia", "GDP": "Slovakia", "Education": "Slovak Republic"},
    {"Main": "South Africa", "GDP": "South Africa", "Education": "South Africa"},
    {"Main": "South Korea", "GDP": "Korea, Republic of", "Education": "Korea, Rep."},
    # NOTE(review): Syria's Education name is "Serbia" — presumably a mismatch; confirm.
    {"Main": "Syria", "GDP": "Syrian Arab Republic", "Education": "Serbia"},
    # NOTE(review): Taiwan is mapped to Brunei Darussalam in both datasets —
    # confirm whether this is an intentional stand-in.
    {"Main": "Taiwan", "GDP": "Brunei Darussalam", "Education": "Brunei Darussalam"},
    {"Main": "Tanzania", "GDP": "Tanzania, United Republic of", "Education": "Tanzania"},
    {"Main": "Turkey", "GDP": "Türkiye", "Education": "Turkey"},
    {"Main": "United States of America", "GDP": "United States", "Education": "United States"},
    {"Main": "Uzbekistan", "GDP": "Uzbekistan", "Education": "Uzbekistan"},
    {"Main": "Vanuatu", "GDP": "Vanuatu", "Education": "Vanuatu"},
    {"Main": "Venezuela", "GDP": "Venezuela, RB", "Education": "Venezuela, RB"},
    # NOTE(review): Zambia's GDP name is "Gambia" — presumably a mismatch; confirm.
    {"Main": "Zambia", "GDP": "Gambia", "Education": "Zambia"}
]

# Give the corrected matches the same column names as tinder_df BEFORE
# concatenating. This way the rows line up by column and no positional
# fix-up (which would depend on the exact number of rows already in
# tinder_df) is needed afterwards.
corrected_matches_df = pd.DataFrame(corrected_matches).rename(columns={
    'Main': 'Country Name (Main)',
    'GDP': 'Country Name (GDP)',
    'Education': 'Country Name (Education)',
})

# Append the corrected matches, then drop incomplete rows and duplicates
# and renumber the index.
tinder_df = (
    pd.concat([tinder_df, corrected_matches_df], ignore_index=True)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)


# Country names as they occur in main_df (one entry per row)
all_countries_main_df = main_df['country_cy'].tolist()

# Main-dataset names that already have a mapping in tinder_df
country_names_tinder_df = tinder_df['Country Name (Main)'].tolist()

# Names present in main_df but not yet covered by tinder_df
ignored_countries = set(all_countries_main_df) - set(country_names_tinder_df)

# One row per ignored name
ignored_countries_df = pd.DataFrame({'Ignored Country Name': list(ignored_countries)})

# Hand-made mapping for the ignored names, one tuple per country:
# (name in main_df, name to use in the GDP data, name to use in the Education data).
# "N/A" marks a missing counterpart; those cells are overwritten further down.
manual_match_columns = ("Ignored Country Name", "Matched Country Name (GDP)", "Matched Country Name (Education)")
manual_match_rows = [
    ("Algeria", "Algeria", "Algeria"),
    ("Zimbabwe (Rhodesia)", "Zimbabwe", "Zimbabwe"),
    ("Vatican City State", "N/A", "N/A"),
    ("Kingdom of eSwatini (Swaziland)", "Eswatini", "Swaziland"),
    ("Cambodia (Kampuchea)", "Cambodia", "Cambodia"),
    ("Vietnam (North Vietnam)", "Vietnam", "Vietnam"),
    ("Bosnia-Herzegovina", "Bosnia and Herzegovina", "Bosnia and Herzegovina"),
    ("Albania", "Albania", "Albania"),
    ("Congo", "Congo, Rep.", "Congo, Dem. Rep."),
    ("East Timor", "Timor-Leste", "Timor-Leste"),
    ("Antigua & Barbuda", "Antigua and Barbuda", "Antigua and Barbuda"),
    ("Laos", "Lao PDR", "Lao PDR"),
    ("Ivory Coast", "Cote d'Ivoire", "Cote d'Ivoire"),
    ("Federated States of Micronesia", "Micronesia, Fed. Sts.", "Micronesia, Fed. Sts."),
    ("Madagascar (Malagasy)", "Madagascar", "Madagascar"),
    ("Afghanistan", "Afghanistan", "Afghanistan"),
    ("Kosovo", "N/A", "Kosovo"),
    ("Russia (Soviet Union)", "Russian Federation", "Russian Federation"),
    ("Yemen (South Yemen)", "Yemen, Rep.", "Yemen, Rep."),
    ("Saint Kitts and Nevis", "St. Kitts and Nevis", "St. Kitts and Nevis"),
    ("Bahamas", "Bahamas, The", "Bahamas, The"),
    ("Cape Verde", "Cabo Verde", "Cabo Verde"),
    ("Saint Lucia", "St. Lucia", "St. Lucia"),
    ("Myanmar (Burma)", "Myanmar", "Myanmar"),
    ("Serbia (Yugoslavia)", "Serbia", "Serbia"),
    ("Samoa/Western Samoa", "Samoa", "Samoa"),
    ("Yemen (North Yemen)", "N/A", "Yemen, Rep."),
]
matches = [dict(zip(manual_match_columns, match_row)) for match_row in manual_match_rows]

# Tabular form of the manual mapping
matched_df = pd.DataFrame(matches)

# Stand-in rows: copy an existing country's row(s) and relabel the copy.
# Tuvalu's data is reused under the name "Vatican City State" in gdp_df ...
vatican_row_gdp = gdp_df.loc[gdp_df['Country Name'] == 'Tuvalu'].assign(**{'Country Name': 'Vatican City State'})
gdp_df = pd.concat([gdp_df, vatican_row_gdp], ignore_index=True)

# ... and likewise in education_df.
vatican_row_education = education_df.loc[education_df['Country Name'] == 'Tuvalu'].assign(**{'Country Name': 'Vatican City State'})
education_df = pd.concat([education_df, vatican_row_education], ignore_index=True)

# Malawi's data is reused under the name "Kosovo" in gdp_df.
kosovo_row_gdp = gdp_df.loc[gdp_df['Country Name'] == 'Malawi'].assign(**{'Country Name': 'Kosovo'})
gdp_df = pd.concat([gdp_df, kosovo_row_gdp], ignore_index=True)

# Overwrite the "N/A" placeholders in matched_df:
# (ignored name, column to patch, replacement value)
placeholder_fixes = [
    ('Vatican City State', 'Matched Country Name (GDP)', 'Tuvalu'),
    ('Vatican City State', 'Matched Country Name (Education)', 'Tuvalu'),
    ('Kosovo', 'Matched Country Name (GDP)', 'Malawi'),
    ('Yemen (North Yemen)', 'Matched Country Name (GDP)', 'Yemen'),
]
for ignored_name, target_column, replacement in placeholder_fixes:
    matched_df.loc[matched_df['Ignored Country Name'] == ignored_name, target_column] = replacement

# Every main-dataset country name that has a mapping: first columns of
# perfect_matches_df, tinder_df and matched_df, first occurrence kept.
main_names = (
    pd.concat(
        [perfect_matches_df.iloc[:, 0], tinder_df.iloc[:, 0], matched_df.iloc[:, 0]],
        ignore_index=True,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

# One row per main-dataset name; the GDP / Education names are filled in below
countries_df = pd.DataFrame({'main_name': main_names})

# Rename the "Country Name (Main)" column in perfect_matches_df
# (in place, so the rename is visible to any code that uses the frame afterwards)
perfect_matches_df.rename(columns={'Country Name (Main)': 'main_name'}, inplace=True)

# Lookup tables keyed by main-dataset name. Only the first row per name is
# kept, which mirrors "take the first matching row" and gives Series.map the
# unique index it requires.
tinder_lookup = tinder_df.drop_duplicates(subset='Country Name (Main)').set_index('Country Name (Main)')
matched_lookup = matched_df.drop_duplicates(subset='Ignored Country Name').set_index('Ignored Country Name')
perfect_lookup = perfect_matches_df.drop_duplicates(subset='main_name').set_index('main_name')

in_tinder = main_names.isin(tinder_lookup.index)
in_matched = main_names.isin(matched_lookup.index)

# gdp_name precedence: tinder_df first, then matched_df; names found in
# neither are left missing (NaN).
countries_df['gdp_name'] = (
    main_names.map(tinder_lookup['Country Name (GDP)'])
    .where(in_tinder, main_names.map(matched_lookup['Matched Country Name (GDP)']))
)

# education_name precedence: tinder_df first, then matched_df, and only for
# names found in neither the value from perfect_matches_df.
countries_df['education_name'] = (
    main_names.map(tinder_lookup['Country Name (Education)'])
    .where(in_tinder, main_names.map(matched_lookup['Matched Country Name (Education)']))
    .where(in_tinder | in_matched, main_names.map(perfect_lookup['Country Name (Education)']))
)

# Keep only the gdp_df rows whose country name is used as a 'gdp_name' in
# countries_df. Names are kept exactly as they are (a kept row's name already
# equals the matching gdp_name, so there is nothing to rewrite).
# NOTE(review): if the intent was to rename rows to the main-dataset name,
# that is NOT what happens here — confirm.
# Missing gdp_name values are excluded so they can never match a missing
# country name in gdp_df.
gdp_names_in_use = countries_df['gdp_name'].dropna()
gdp_df = gdp_df[gdp_df['Country Name'].isin(gdp_names_in_use)]

# Reset the index of gdp_df after dropping rows
gdp_df = gdp_df.reset_index(drop=True)


# Same filter for education_df against 'education_name'.
# The index is left as-is here (it is not renumbered after filtering).
education_names_in_use = countries_df['education_name'].dropna()
education_df = education_df[education_df['Country Name'].isin(education_names_in_use)]


In [None]:
# Columns to search for exact counterparts of each main-dataset name
gdp_name_column = comparison_df['Country Name (GDP)']
education_name_column = comparison_df['Country Name (Education)']

# A "perfect match" is a main-dataset name that appears verbatim in BOTH the
# GDP and the Education column. Because the comparison is exact equality, the
# matched GDP / Education names are the main name itself. NaN never compares
# equal, so NaN entries in the main column produce no record.
perfect_matches = [
    {'Country Name (Main)': candidate,
     'Country Name (GDP)': candidate,
     'Country Name (Education)': candidate}
    for candidate in comparison_df['Country Name (Main)']
    if (gdp_name_column == candidate).any() and (education_name_column == candidate).any()
]

# Tabular form of the perfect matches
tinder_df = pd.DataFrame(perfect_matches)


In [None]:
# Unique country names from education_df, in order of first appearance
education_unique_names = education_df['Country Name'].drop_duplicates().reset_index(drop=True)

# Unique country names from main_df, sorted alphabetically
main_unique_names = pd.Series(
    sorted(main_df['country_cy'].dropna().unique()),
    name='Country Name (Main)',
)

# Unique country names from gdp_df, plus an alphabetically sorted copy for display
gdp_unique_names = gdp_df['Country Name'].drop_duplicates().reset_index(drop=True)
gdp_sorted_names = pd.Series(
    sorted(gdp_unique_names.dropna()),
    name='Country Name (GDP)',
)

# Put the three name lists side by side. Each list is built on its own and
# joined by position, so a list that is longer than the others is kept in
# full (shorter columns are padded with NaN) instead of being cut down to
# the length of the main-dataset list.
comparison_df = pd.concat(
    [
        main_unique_names,
        gdp_sorted_names,
        education_unique_names.rename('Country Name (Education)'),
    ],
    axis=1,
)

# Number of rows in the comparison table (length of the longest name list)
max_length = len(comparison_df)


In [None]:
# acled_df.reset_index(inplace=True)

# acled_df['Country'] = acled_df['Country'].replace({
  #  'Antigua and Barbuda': 'Antigua & Barbuda',
   # 'Bosnia and Herzegovina': 'Bosnia-Herzegovina',
   # 'Czech Republic': 'Czechoslovakia',
   # 'Democratic Republic of Congo': 'DR Congo (Zaire)',
   # 'East Timor': 'Timor-Leste',
   # 'Kingdom of eSwatini': 'eSwatini',
   # 'Myanmar': 'Myanmar (Burma)',
   # 'North Korea': 'North Korea',
   # 'North Macedonia': 'North Macedonia',
   # 'Russian Federation': 'Russia (Soviet Union)',
   # 'Saint Kitts and Nevis': 'Saint Kitts and Nevis',
   # 'Saint Lucia': 'Saint Lucia',
   # 'Saint Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
   # 'Samoa': 'Samoa/Western Samoa',
   # 'South Sudan': 'South Sudan',
   # 'United States': 'United States of America',
   # 'Vatican City': 'Vatican City State',
   # 'Vietnam': 'Vietnam (North Vietnam)',
   # 'Yemen': 'Yemen (North Yemen)'
#})


# Filter acled_df to only include countries also in main_df
# acled_df = acled_df[acled_df['Country'].isin(main_df['country_cy'].unique())]

#row_means = acled_df.iloc[:, 2:].apply(lambda row: row.mean(), axis=1)

# Iterate over each row and replace NaN values with the corresponding row mean
#for index, row in acled_df.iterrows():
 #   acled_df.loc[index, acled_df.columns[2:]] = row.fillna(row_means[index])

#columns_to_round = acled_df.columns[2:]  # Exclude 'index' and 'Country' columns
# acled_df[columns_to_round] = acled_df[columns_to_round].applymap(lambda x: math.floor(x))
