In [58]:
# modules we'll use
import pandas as pd
import numpy as np

# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

In [21]:
# seed NumPy's global random generator for reproducibility
np.random.seed(0)

# load the professors dataset, decoding the CSV as ISO-8859-1
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this exact file exists
professors = pd.read_csv(
    "/Users/yunus/Desktop/Machine_Learning/data_cleaning/materials/Pakistan Intellectual Capital - Computer Science - Ver 1.csv",
    encoding="ISO-8859-1",
)

In [22]:
# preview the first five rows of the loaded frame
professors.head(n=5)

Unnamed: 0,S#,Teacher Name,University Currently Teaching,Department,Province University Located,Designation,Terminal Degree,Graduated from,Country,Year,Area of Specialization/Research Interests,Other Information
0,1,Dr. Ihsan Ullah,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,,France,,"P2P Networks, Telecommunication Systems",
1,2,Dr. Atiq Ahmed,University of Balochistan,Computer Science & IT,Balochistan,Associate Professor,PhD,,France,,"Wireless Networks, Telecommunication Systems, ...",
2,3,Dr. Abdul Basit,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,Software Engineering & DBMS,
3,4,Mr. Muhammad Khalid Badini,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,MS,,,,Operating System & DBMS,On study leave
4,5,Dr. Waheed Noor,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,DBMS,


In [48]:
# distinct entries of the 'Country' column, missing values included
country_values = professors['Country'].unique()

# keep only the non-missing entries (boolean mask over the ndarray)
countries = country_values[pd.notnull(country_values)]

# order them alphabetically in place, then display them for inspection
# NOTE(review): this cell runs before the lower-case/strip step, so on a fresh
# kernel it lists the raw, un-normalised values
countries.sort()
countries

# Debugging aid, left commented out: print each value together with its type

# for country in countries:
#     print(country, type(country))

array(['australia', 'austria', 'canada', 'china', 'denmark', 'finland',
       'france', 'germany', 'greece', 'hongkong', 'ireland', 'italy',
       'japan', 'macau', 'malaysia', 'mauritius', 'netherland',
       'new zealand', 'norway', 'pakistan', 'portugal',
       'russian federation', 'saudi arabia', 'scotland', 'singapore',
       'south korea', 'spain', 'sweden', 'thailand', 'turkey', 'uk',
       'urbana', 'usa'], dtype=object)

In [49]:
# normalise the 'Country' text: lower-case it, then trim surrounding whitespace
# (str.strip removes leading as well as trailing whitespace)
professors['Country'] = professors['Country'].str.lower().str.strip()

In [50]:
# score every entry of `countries` against "south korea" with the
# token-sort ratio and keep the ten best-scoring candidates
matches = fuzzywuzzy.process.extract(
    "south korea",
    countries,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    limit=10,
)

# display the (candidate, score) pairs
matches

[('south korea', 100),
 ('saudi arabia', 43),
 ('norway', 35),
 ('ireland', 33),
 ('portugal', 32),
 ('singapore', 30),
 ('netherland', 29),
 ('macau', 25),
 ('france', 24),
 ('greece', 24)]

In [51]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Overwrite close fuzzy matches of ``string_to_match`` in ``df[column]``.

    Each distinct non-missing value of the column is scored against
    ``string_to_match`` with fuzzywuzzy's ``token_sort_ratio``. Rows whose
    value is among the ten best-scoring candidates and scores at least
    ``min_ratio`` are set to ``string_to_match``. The frame is modified in
    place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is cleaned in place.
    column : str
        Label of the column to clean.
    string_to_match : str
        Canonical spelling that close matches are replaced with.
    min_ratio : int, default 47
        Inclusive score threshold; candidates scoring below it are left alone.

    Returns
    -------
    None
        The result is the mutation of ``df``; "All done!" is printed on
        completion.
    """
    # Distinct candidate strings, with missing values removed. Without the
    # dropna(), NaN is handed to the scorer as a candidate; if it ever clears
    # the threshold, isin() below selects every missing cell and they are all
    # overwritten with string_to_match.
    candidates = df[column].dropna().unique()

    # the ten candidates closest to the input string, as (candidate, score) pairs
    top_matches = fuzzywuzzy.process.extract(string_to_match, candidates,
                                             limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # keep only the candidates whose score is at least min_ratio
    close_matches = [match[0] for match in top_matches if match[1] >= min_ratio]

    # boolean mask of the rows holding one of the close matches
    rows_with_matches = df[column].isin(close_matches)

    # overwrite those rows with the canonical string
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")

In [53]:
# collapse every close variant of "south korea" in the 'Country' column
# into the single spelling "south korea" (default min_ratio applies)
replace_matches_in_column(professors, 'Country', "south korea")

All done!


In [55]:
# re-list the distinct 'Country' values after the replacement step
unique_countries = professors['Country'].unique()

# discard missing entries and return the remainder in alphabetical order
countries = np.sort(unique_countries[~pd.isnull(unique_countries)])
countries

array(['australia', 'austria', 'canada', 'china', 'denmark', 'finland',
       'france', 'germany', 'greece', 'hongkong', 'ireland', 'italy',
       'japan', 'macau', 'malaysia', 'mauritius', 'netherland',
       'new zealand', 'norway', 'pakistan', 'portugal',
       'russian federation', 'saudi arabia', 'scotland', 'singapore',
       'south korea', 'spain', 'sweden', 'thailand', 'turkey', 'uk',
       'urbana', 'usa'], dtype=object)