# Confucius Institute Locations Globally 

Confucius Institutes are educational centers located around the world that represent one of China's main sources of soft power. We aim to gather information about where these centers are and whether they are currently in operation.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import seaborn as sns

import os
import json
import re

# scratch directory for the geocoder's input/output files: the user's Desktop
home_directory = os.path.expanduser('~')
temp_directory = os.path.join(home_directory, 'Desktop')

In [2]:
def processing_results(results, full_df):
    """Attach geocoding results to ``full_df`` and report what is still unmatched.

    Parameters
    ----------
    results : DataFrame
        Geocoder output; must contain 'id' and 'cities_temp_id'.
    full_df : DataFrame
        Entities to geocode; must contain 'id', 'country', 'City' and
        'location_name'.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``full_df`` left-joined with ``results`` on 'id', and the
        'id' / 'country' / 'City' / 'location_name' columns of the rows whose
        'cities_temp_id' is missing after the join.
    """
    merged = full_df.merge(results, how="left", on='id')

    # rows without a 'cities_temp_id' have not been geocoded yet
    is_missing = merged['cities_temp_id'].isna()
    geocoded = merged[~is_missing]
    remaining = merged[is_missing]

    # sanity check: the two groups together account for every merged row
    assert len(merged) == (len(remaining) + len(geocoded))
    print("There are still " + str(len(remaining)) + " more entities to geocode")

    return merged, remaining[['id', 'country', 'City', 'location_name']]

In [3]:
def load_dict(path):
    """Read the text file at ``path`` and return its contents parsed as JSON.

    Parameters
    ----------
    path : str
        Location of a file containing a JSON document.

    Returns
    -------
    The decoded JSON value (a dict when the file holds a JSON object).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the contents are not valid JSON.
    """
    # the context manager closes the file even when reading or parsing fails
    with open(path, "r") as file:
        return json.load(file)

### Grab Data

In [4]:
# Load the institute table from ci.csv (path is relative to the current working directory).
df = pd.read_csv("ci.csv")

In [5]:
# Map the country names to a common value for searching for the geometry.
# The mapping is read from a text file (parsed as JSON by load_dict) for
# consistency of country naming conventions; values that have no entry in the
# mapping are left unchanged by Series.replace.

recipient_mapping = load_dict("../country_config.txt")
df['country'] = df['country'].replace(recipient_mapping)

In [6]:
# Load the original dataframe of CI locations to pull cities and links from.
# Left join: every row of df is kept; rows whose institute name has no match
# in org get NaN in the columns that come from org.
# NOTE(review): if a 'Confucius Institute' name occurs more than once in org,
# the matching df rows are duplicated by the merge — confirm the key is unique.

org = pd.read_csv("confucius_institutes_original.csv")
df = df.merge(org, left_on='confucius_institute', right_on = 'Confucius Institute', how='left')

In [7]:
# keep only the columns that the rest of the notebook uses

df = df[[
    'id', 'country', 'confucius_institute', 'partner_uni', 'status',
    'location_alt', 'City', 'Link', 'Other Links', 'date est.\n(mm/dd/yyyy)',
]]

# canonical status labels; map() turns any value that is not a key here into NaN
d = {'Open': 'Open', 'open': 'Open', 'Closed': 'Closed'}

df['status'] = df['status'].map(d)

In [8]:
# Build a geocodable `location_name` for every institute from its name, then
# export the result as the geocoder's input file.

# 1. Take the text after the last " in " / " at " (a directly following "the"
#    is dropped). Names without such a separator get the placeholder 'None'.
temp_list = [re.findall(r"(.*) (in|at) (the)?(.*)", x) for x in df['confucius_institute']]
df['location_name'] = [x[0][3].strip() if len(x) > 0 else 'None' for x in temp_list]

# 2. No separator, but the name contains "Confucius Institute": use the name
#    with that phrase removed.
df['location_name'] = [
    re.sub(r"(Confucius Institute)", "", x) if (y == 'None' and "Confucius Institute" in x) else y
    for x, y in zip(df['confucius_institute'], df['location_name'])
]

# 3. Still the placeholder: fall back to the full institute name.
df['location_name'] = [x if (x != 'None') else y for x, y in zip(df['location_name'], df['confucius_institute'])]

# 4. Strip a remaining "Confucius Institute [for|of] " fragment. This must be a
#    plain re.sub call: `re.sub` is a function and has no `.replace` attribute.
df['location_name'] = [
    re.sub(r"((Confucius Institute) (for|of)?) ", "", x).strip() if "Confucius Institute" in x else x.strip()
    for x in df['location_name']
]

# 5. Keep only the part before the first ", ".
df['location_name'] = [x.split(", ")[0] for x in df['location_name']]

# 6. A non-missing `location_alt` overrides the parsed name.
df['location_name'] = [x if pd.notna(x) else y for x, y in zip(df['location_alt'], df['location_name'])]

# 7. Remove these fragments from the names (each entry is applied as a regex
#    pattern, in order), trimming whitespace after every removal.
listing = ['Linguistic', 'Federal', 'Oriental', 'Sapienza', 'Ca\' Foscari', 'Autonomous ', ' of Education', 'Pedagogical', "Free "]
for fragment in listing:
    df['location_name'] = [re.sub(fragment, "", x).strip() for x in df['location_name']]

# 8. Write id / country / location_name for every row that has a usable name.
loc_data = temp_directory + "/ci_temp.csv"
df_t = df.loc[df['location_name'] != 'None'][['id', 'country', 'location_name']]
df_t.to_csv(loc_data, index=False)

### Attempt to Geocode with Confucius Institute Names

In [9]:
%cd ..
%run autogeocode.py /Users/natalie_kraft/Desktop/ci_temp.csv gl3 'location_name' 'country' 'id' force

/Users/natalie_kraft/Documents/LAS/LAS-BRI/data_processing
Preparing system configuration.
Loading file to geocode
You are geocoding cities. Begin geocoding.
Loading geocoded location entities.
Loading geocoded location entities.




Not found: Bobo-Dioulasso Polytechnic University
Not found: University of Finance and Administration
Not found: National University of Equatorial Guinea
Not found: National Board for Higher Education
a service error has occured when retrieving this file from OSM.
Not found: Tbilisi Open Teaching University
Not found: Gamal Abdel Nasser University of Conakry
Not found: Lovely Professional University
Not found: O.P. Global University
Not found: University of Al Azhar Indonesia
Not found: Catholic University of the Sacred Heart
Not found: University of Enna Kore
Not found: University of Felix Houphouette Boigny
Not found: Karaganda State Technical University
Not found: L. N. Gumilyov Eurasian National University
Not found: University College of Technology Sarawak
Not found: University of Letters and Human Science of Bamako
Not found: ZUYD University of Applied Sciences
Not found: Catholic University of Peru
Not found: Catholic University of Santa Maria
Not found: Transilvania University o

In [10]:
# Checking geocoding accuracy: read the geocoder's output back in. The ID
# columns are loaded as strings so codes such as '096' keep their leading zeros.
id_dtypes = {'cities_temp_id': str, 'country_id': str}
results = pd.read_csv(temp_directory + "/ci_temp_results.csv", dtype=id_dtypes)

# attach the matches to df and collect the rows that are still unmatched
df, next_df = processing_results(results, df)

There are still 40 more entities to geocode


In [11]:
# Now just run the CI with the city label: overwrite the geocoder input file
# with only the entities that are still unmatched.

next_df.to_csv(loc_data, index=False)

In [12]:
%run autogeocode.py /Users/natalie_kraft/Desktop/ci_temp.csv gl3 'City' 'country' 'id' none

Preparing system configuration.
Loading file to geocode
You are geocoding cities. Begin geocoding.
Loading geocoded location entities.
Loading geocoded location entities.
Exporting new cities to database
Exporting modified data to geocoded location entities.
Exporting mapping results.
Geocoding complete.


In [13]:
# Merge the second-pass results and combine them with the first pass.
def _first_present(primary, fallback):
    """Element-wise: the primary value unless it is missing, else the fallback."""
    return [a if pd.notna(a) else b for a, b in zip(primary, fallback)]


results = pd.read_csv(temp_directory + "/ci_temp_results.csv", dtype={'cities_temp_id': str, 'country_id': str})

# df already has cities_temp_id / country_id from the first pass, so the merge
# suffixes the clashing columns: _x comes from df, _y from the new results
df = df.merge(results, how="left", on='id')
df['gl3_id'] = _first_present(df['cities_temp_id_x'], df['cities_temp_id_y'])
df['country_id'] = _first_present(df['country_id_x'], df['country_id_y'])

# every row must have a geocoded ID at this point
assert df['gl3_id'].notna().all()

In [14]:
# keep the final columns, in export order, and give them tidy names
final_columns = [
    'id', 'confucius_institute', 'partner_uni', 'date est.\n(mm/dd/yyyy)',
    'location_name', 'City', 'country', 'status', 'Link', 'Other Links',
    'gl3_id', 'country_id',
]
df = df[final_columns].rename(columns={
    'date est.\n(mm/dd/yyyy)': 'date_est',
    'City': 'nearest_city',
    'Link': 'ci_webpage',
    'Other Links': 'sources',
})

### Data set processed. Confirm Clean & Export to Data Final 

In [16]:
# display the processed table for a visual check before exporting
df

Unnamed: 0,id,confucius_institute,partner_uni,date_est,location_name,nearest_city,country,status,ci_webpage,sources,gl3_id,country_id
0,1,Confucius Institute (Chinese Department) at Ka...,,,Kabul University,Kabul,Afghanistan,Closed,http://www.ku.edu.af/,https://english.tyut.edu.cn/info/1008/1778.htm,67,192
1,2,Confucius Institute at University of Tirana,,,University of Tirana,Tirana,Albania,Closed,http://www.ciut.edu.al/,,61,164
2,3,Confucius Institute at Agostinho Neto University,,2/6/15,Agostinho Neto University,Sapu,Angola,Closed,http://english.hanban.org/node_42885.htm,https://books.google.com/books?id=JmgREAAAQBAJ...,177,096
3,4,Confucius Institute at Antigua and Barbuda,,11/15/19,Antigua and Barbuda,Piggotts,Antigua and Barbuda,Closed,http://zhuanti.hanban.org/videolist/?cat=685&t...,https://foundationhalo.org/confucius-institute...,25,029
4,5,Confucius Institute at National University of ...,,10/13/20,National University of Cordoba,Cordoba,Argentina,Closed,http://zhuanti.hanban.org/videolist/?cat=682&t...,https://www.unc.edu.ar/english/confucius-insti...,351,015
...,...,...,...,...,...,...,...,...,...,...,...,...
660,661,Confucius Institute at Universidad Bolivariana...,,,Universidad Bolivariana de Venezuela,,Venezuela,Closed,http://english.hanban.org/node_45771.htm,,125,069
661,662,Confucius Institute at the Bolivarian Universi...,,12/16/16,Caracas,,Venezuela,Closed,,https://www.facebook.com/page/338109312883186/...,14,069
662,663,Confucius Institute at Hanoi University,,,Hanoi University,,Vietnam,Closed,http://hanu.vn/,,149,232
663,664,Confucius Institute at University of Zambia,,,University of Zambia,,Zambia,Closed,https://www.unza.zm/confucius/,,79,114


In [17]:
# Export the final table without the index column. The path is relative to the
# working directory, which the earlier `%cd ..` cell moved up one level.
df.to_csv("../data_final/confucius_institutes.csv", index=False)