In [2]:
import pandas as pd
import numpy as np

# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import charset_normalizer

# load the dataset this notebook works on
professors = pd.read_csv(
    "../data/ks-projects/pakistan_intellectual_capital.csv"
)

# pin NumPy's global random seed so repeated runs give the same results
np.random.seed(seed=0)



In [3]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Overwrite close fuzzy matches of a string in one column, in place.

    The distinct non-missing values of ``df[column]`` are scored against
    ``string_to_match`` with fuzzywuzzy's ``token_sort_ratio``. Among the ten
    best-scoring values, every one whose score is at least ``min_ratio`` is
    treated as a variant spelling, and all rows holding such a value are
    set to ``string_to_match``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column : str
        Name of the column whose values are compared and rewritten.
    string_to_match : str
        Canonical spelling, used both as the fuzzy query and as the
        replacement value.
    min_ratio : int, default 47
        Inclusive score threshold. NOTE(review): 47 is permissive for a
        similarity score and will merge fairly different strings; pass a
        higher value when only near-identical spellings should be merged.

    Returns
    -------
    None
        The result is the mutation of ``df``; a confirmation line is printed.
    """
    # distinct candidate values; missing values are dropped so they are
    # never handed to the scorer and never overwritten
    candidates = df[column].dropna().unique()

    # score the candidates and keep the 10 best; with no candidates there is
    # nothing to score, so the scorer is not called at all
    if len(candidates) > 0:
        scored = fuzzywuzzy.process.extract(string_to_match, candidates,
                                            limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
    else:
        scored = []

    # each result starts with (value, score); keep values scoring >= min_ratio
    close_matches = [match[0] for match in scored if match[1] >= min_ratio]

    # boolean mask of the rows whose value is one of the close matches
    rows_with_matches = df[column].isin(close_matches)

    # overwrite those rows with the canonical string
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")

# replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")


All done!


In [13]:
# distinct 'Graduated from' values in sorted order, so near-duplicate
# spellings end up next to each other when the array is displayed
graduated = np.sort(professors["Graduated from"].unique())
graduated

array(['abasyn university', 'abdul wali khan university, mardan',
       'abdus salam school of mathematical sciences,gc university',
       'agricultural university peshawar', 'allama iqbal open university',
       'asian institute of technology', 'aston university, birmingham',
       'australian national university, caneberra',
       'bahauddin zakariya university', 'bahria university',
       'bahria university,islamabad',
       'balochistan university of information technology, engineering and management sciences',
       'barani institute of information technology',
       'beaconhouse national university', 'beihang university',
       'beijing institute of technology',
       'beijing institute of technology beijing',
       'beijing university of posts & telecommunications',
       'biztek institute of business & technology,karachi',
       'blekinge institute of technology', 'boston university',
       'brock university canada', 'brunel university', 'bukc',
       'californi

In [12]:
# normalise 'Graduated from': lower-case every value, then strip leading
# and trailing whitespace
professors['Graduated from'] = (
    professors['Graduated from']
    .str.lower()
    .str.strip()
)

# show the distinct values left after normalisation
professors['Graduated from'].unique()

array(['asian institute of technology',
       'balochistan university of information technology, engineering and management sciences',
       'university of balochistan',
       "sardar bahadur khan women's university",
       'srh hochschule heidelberg',
       'institute of business administration,karachi', 'duet,karachi',
       'university of turbat', 'university of vienna',
       'monash university', 'university of stirling',
       'chinese academy of sciences', 'university of innsbruck',
       'vienna university of technology', 'university of paris-est',
       'the university of cambridge', 'harbin institute of technology',
       'university of nice, sophia antipolis', 'the university of york',
       'galilée - université paris 13', 'university of bedfordshire',
       'north dakota state university', 'kyungpook national university',
       'the university of manchester',
       'national university of sciences and technology',
       'fast– national university of computer

In [None]:
# "south korea" is a country name, so the merge targets the 'Country' column.
# Running it against 'Graduated from' with the default min_ratio would
# overwrite any university name scoring at or above the threshold with the
# string "south korea".
# NOTE(review): assumes the frame has a 'Country' column - confirm.
replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")