# Inconsistent Data Entry
Let's learn how to fix typos in our data. 

In [15]:
import pandas as pd
import numpy as np

# Util modules
import fuzzywuzzy
from fuzzywuzzy import process
import charset_normalizer

# Seed NumPy's random number generator so results are reproducible.
np.random.seed(0)

# Location of the CSV file, relative to this notebook.
dataPath = "../data/pakistan_intellectual_capital.csv"

# Load the whole dataset into a DataFrame.
professors = pd.read_csv(dataPath)

# Show the first three rows as a quick preview.
professors.head(n=3)



Unnamed: 0.1,Unnamed: 0,S#,Teacher Name,University Currently Teaching,Department,Province University Located,Designation,Terminal Degree,Graduated from,Country,Year,Area of Specialization/Research Interests,Other Information
0,2,3,Dr. Abdul Basit,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,Software Engineering & DBMS,
1,4,5,Dr. Waheed Noor,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,DBMS,
2,5,6,Dr. Junaid Baber,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,"Information processing, Multimedia mining",


In [None]:
'''
+ Clean Country Column.
'''

# Step 1: list every distinct value currently present in the country column,
# in sorted order, so spelling variants end up next to each other.
all_countries = professors['Country'].unique()
sorted_countries = list(all_countries)
sorted_countries.sort()
sorted_countries

# Step 2: normalize the country names by trimming surrounding whitespace
# and converting everything to lower case.
professors['Country'] = professors['Country'].str.strip().str.lower()

'''
3. Remove 'inconsistent' country names, e.g. 'South Korea' and 'SouthKorea' should be made the same.
Rather than do this manually, let's do it programmatically. We'll use 'fuzzy matching', which is the process
of automatically finding text strings that are very similar to the target string. The closeness of one string to
another is determined by the number of changes needed to turn one into the other. For example, 'apple' and
'snapple' are two changes away from each other: add 's', and add 'n'.

fuzzywuzzy returns a similarity ratio; the closer it is to 100, the closer the two strings are to
each other. Get the top 10 closest matches to 'south korea' and check them. Then, at our discretion, we can see
which ones are essentially the same. The only other one that's an actual duplicate is 'southkorea', so we'll
update those values.

'SouthKorea' is at 95, so we'll replace anything that scores at or above a cutoff just below that
(min_ratio=90 in the call below) with 'south korea'. You could do this manually, but with this threshold-based
logic you'll be prepared for cases where multiple duplicates show up in the ratios.
'''

# Ten best fuzzy matches for "south korea" among the sorted country names,
# returned as (name, similarity_ratio) pairs.
matches = fuzzywuzzy.process.extract(
    query="south korea",
    choices=sorted_countries,
    limit=10,
)
matches

def replace_amtches_in_column(df, column, string_to_match, min_ratio, limit=10):
    """Replace close fuzzy matches of ``string_to_match`` in ``df[column]``.

    The distinct non-null values of the column are scored against
    ``string_to_match`` with ``fuzzywuzzy.process.extract``. Every row whose
    value is one of the top ``limit`` matches with a score of at least
    ``min_ratio`` is overwritten with ``string_to_match``.

    Args:
        df: DataFrame to clean; it is modified in place.
        column: Name of the column holding the strings to clean.
        string_to_match: Canonical spelling written into matching rows.
        min_ratio: Minimum similarity score a value needs to be replaced.
        limit: Maximum number of best-scoring candidates to consider.

    Returns:
        None. Prints "All done!" when finished.
    """
    # Missing values come back from unique() as NaN floats, which are not
    # strings and cannot be fuzzy-matched, so leave them out of the candidates.
    candidates = df[column].dropna().unique()

    close_matches = []
    # Nothing to compare against when the column has no non-null values.
    if len(candidates) > 0:
        # Each entry is a (candidate_string, similarity_ratio) pair.
        scored = fuzzywuzzy.process.extract(
            string_to_match, candidates, limit=limit
        )
        close_matches = [name for name, ratio in scored if ratio >= min_ratio]

    rows_with_matches = df[column].isin(close_matches)

    # Replace all those close matched rows with the input string
    df.loc[rows_with_matches, column] = string_to_match

    print("All done!")
# Collapse every close variant of "south korea" (similarity >= 90) in the
# Country column into the single canonical spelling.
replace_amtches_in_column(professors, "Country", "south korea", 90)

# Verify: list the distinct country values again; no misspelled variants of
# south korea should remain.
all_countries = professors['Country'].unique()
all_countries





All done!


array(['thailand', 'pakistan', 'germany', 'austria', 'australia', 'uk',
       'china', 'france', 'usofa', 'south korea', 'malaysia', 'sweden',
       'italy', 'canada', 'norway', 'ireland', 'new zealand', 'urbana',
       'portugal', 'russian federation', 'usa', 'finland', 'netherland',
       'greece', 'turkey', 'macau', 'singapore', 'spain', 'japan',
       'hongkong', 'saudi arabia', 'mauritius', 'scotland'], dtype=object)