# Cleaning countries

In [4]:
#imports
import re
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import timedelta

In [5]:
import findspark
findspark.init()
import pyspark

from functools import reduce
from pyspark.sql import *
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import to_date, last_day,date_add
from datetime import timedelta

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [82]:
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

In [97]:
openfood_file = "data/en.openfoodfacts.org.products.csv"
cities_file = "data/countries_cleaning/cities.csv"
countries_file = "data/countries_cleaning/countries.csv"

output_mapping_just_countries = "data/countries_cleaning/output/mapping_just_countries.csv"

# Loading data

In [9]:
dataset_main = spark.read.csv(openfood_file, header=True, mode="DROPMALFORMED", sep = '\t')

dataset_main.createOrReplaceTempView("data_main")

# Filter required columns
p_id_col = " code, "
geo_tags_cols = " manufacturing_places_tags, countries_tags "

off_df = spark.sql("SELECT" + p_id_col + geo_tags_cols + " FROM data_main")
off_df.printSchema()

root
 |-- code: string (nullable = true)
 |-- manufacturing_places_tags: string (nullable = true)
 |-- countries_tags: string (nullable = true)



In [10]:
off_all_size = off_df.count()
off_cols_size = len(off_df.columns)
print("All data Size:\n" + str(off_cols_size) + "(columns) * " + str(off_all_size) + "(rows)")

All data Size:
3(columns) * 709945(rows)


### Data Cleaning and Preprocessing

In [11]:
# Find number of missing data

off_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in off_df.columns]).show()

+----+-------------------------+--------------+
|code|manufacturing_places_tags|countries_tags|
+----+-------------------------+--------------+
|   0|                   641206|           554|
+----+-------------------------+--------------+



In [12]:
off_df.createOrReplaceTempView("off_df")

sql_filter = "SELECT * FROM off_df WHERE \
            countries_tags is not NULL AND manufacturing_places_tags is not NULL "

off_p_df = spark.sql(sql_filter)
off_p_all_size = off_p_df.count()
off_p_cols_size = len(off_p_df.columns)
print("Full GEO information data Size:\n" + str(off_p_cols_size) + "(columns) * " + str(off_p_all_size) + "(rows)")

Full GEO information data Size:
3(columns) * 68672(rows)


In [14]:
off_p_df.show(5)

+-------------+-------------------------+-----------------+
|         code|manufacturing_places_tags|   countries_tags|
+-------------+-------------------------+-----------------+
|0000000020114|                   france|        en:france|
|0000000274722|                   france|        en:france|
|0000000290616|          brossard-quebec|        en:canada|
|0000000394710|          brossard-quebec|        en:canada|
|0000001071894|           united-kingdom|en:united-kingdom|
+-------------+-------------------------+-----------------+
only showing top 5 rows



Since columns with _tag label have more consistent data, we will use these columns from now.

In [27]:
# Remove "en:" occurances before name of each country in coutries_tags
off_p_df = off_p_df.withColumn('countries_tags', F.regexp_replace('countries_tags', "en:", ""))
off_p_df.show(5)

+-------------+-------------------------+--------------+
|         code|manufacturing_places_tags|countries_tags|
+-------------+-------------------------+--------------+
|0000000020114|                   france|        france|
|0000000274722|                   france|        france|
|0000000290616|          brossard-quebec|        canada|
|0000000394710|          brossard-quebec|        canada|
|0000001071894|           united-kingdom|united-kingdom|
+-------------+-------------------------+--------------+
only showing top 5 rows



In [28]:
countries_mapping = off_p_df.toPandas()
countries_mapping.head()

Unnamed: 0,code,manufacturing_places_tags,countries_tags
0,20114,france,france
1,274722,france,france
2,290616,brossard-quebec,canada
3,394710,brossard-quebec,canada
4,1071894,united-kingdom,united-kingdom


In [33]:
# create a new database mapping each country to some labels

countries_mapping['all_countries'] = countries_mapping.manufacturing_places_tags +"," + countries_mapping.countries_tags

In [34]:
countries_mapping.head()

Unnamed: 0,code,manufacturing_places_tags,countries_tags,all_countries
0,20114,france,france,"france,france"
1,274722,france,france,"france,france"
2,290616,brossard-quebec,canada,"brossard-quebec,canada"
3,394710,brossard-quebec,canada,"brossard-quebec,canada"
4,1071894,united-kingdom,united-kingdom,"united-kingdom,united-kingdom"


In [56]:
countries = pd.concat([pd.Series(row['all_countries'].split(','))              
                    for _, row in countries_mapping.iterrows()]).reset_index(drop=True)
countries.head()

0             france
1             france
2             france
3             france
4    brossard-quebec
dtype: object

In [57]:
len(countries)

192203

In [58]:
countries = countries.drop_duplicates().reset_index(drop=True)
countries = countries.str.replace("-", " ") 

# Remove numbers from name of countries
countries = countries.str.replace('\d+', '')

print(len(countries))

12357


In [59]:
countries.head()

0             france
1    brossard quebec
2             canada
3     united kingdom
4           brossard
dtype: object

We have extracted all unique values in country_tags and manufacturing_places_tags columns. 
We have 12357 unique entry.
Now we should map each of these entries to a country code. 

In [68]:
# mapn_countries will keep the mappings
map_countries = pd.DataFrame(columns=['input', 'country_code'])
remained_countries = countries.copy()

In [69]:
def print_cleaning_status():
    print("{0} name of countries have been detected in uncleaned dataset".format(len(map_countries)))
    print("{0} name of countries have remained".format(len(remained_countries)))    

In [70]:
print_cleaning_status()

0 name of countries have been detected in uncleaned dataset
12357 name of countries have remained


Two external databases were used for mapping of country names and city names.
https://www.geodatasource.com

## Using country name

In [44]:
dataset_countries = pd.read_csv(countries_file, error_bad_lines=False)
dataset_countries.head()

Unnamed: 0,CC_FIPS,COUNTRY_NAME
0,AA,Aruba
1,AC,Antigua and Barbuda
2,AE,United Arab Emirates
3,AF,Afghanistan
4,AG,Algeria


In [61]:
def map_country(data, country_code):
    global map_countries
    map_countries = map_countries.append({'input': data, 'country_code': country_code}, ignore_index=True)

def find_country(data):
    # map data with country code
    output = dataset_countries[dataset_countries.CC_FIPS.str.match(data, case=False)]
    
    if len(data) <=2:
        # If length of data is less than 3 and has not been matched with a country code, data is not valid
        return 0
    
    if not len(output):
        # map data with name of a country 
        output = dataset_countries[dataset_countries.COUNTRY_NAME.str.match(data, case=False)]
        
    if not len(output):
        # map data with name of a country 
        output = dataset_countries[dataset_countries.COUNTRY_NAME.str.contains(data, case=False)]
        
    if len(output):
        return output.iloc[0].CC_FIPS
    return 0

def assign_country_code(row):
    output = find_country(row)
    if output:
        map_country(row, output)
        return True
    return False

In [71]:
for i in range(len(remained_countries)):
    if assign_country_code(remained_countries[i]):
        remained_countries = remained_countries.drop([i])
        i -=1

In [72]:
print_cleaning_status()

204 name of countries have been detected in uncleaned dataset
12153 name of countries have remained


### Find entries which contain name of a country

In [86]:
for index, row in dataset_countries.iterrows():
    # If data contains name of a country before other words
    output = remained_countries[remained_countries.str.contains(row.COUNTRY_NAME + " ", case=False, na=False)]
    
    if not len(output):
        # If data contains name of a country after other words
         output = remained_countries[remained_countries.str.contains(" " + row.COUNTRY_NAME, case=False, na=False)]   
    
    for i in range(len(output)):
        map_country(output.iloc[i], row.CC_FIPS)
        remained_countries = remained_countries.drop(remained_countries[remained_countries == output.iloc[i]].index[0])

In [87]:
print_cleaning_status()

776 name of countries have been detected in uncleaned dataset
11581 name of countries have remained


In [89]:
map_countries.tail(10)

Unnamed: 0,input,country_code
766,en south africa,SF
767,es gador almeira spain,SP
768,nutriops s l r g s a m u avda c blancos p ...,SP
769,alhama de murcia spain,SP
770,valencia spain,SP
771,micarna sa divison volaille rte de l industrie...,SZ
772,i̇stanbul turkey,TU
773,istanbul turkey,TU
774,en united states,US
775,virgin islands of the united states,US


In [99]:
# Save 776 detected inputs 
map_countries.to_csv(output_mapping_just_countries, index=False)

## Using City names

In [94]:
# Find name of cities and replace with country code (Here some bias may happen, some cities have similar name)

dataset_cities = pd.read_csv(cities_file, sep=',', error_bad_lines=False, encoding = "utf-8")

b'Skipping line 3404: expected 2 fields, saw 3\nSkipping line 26344: expected 2 fields, saw 3\nSkipping line 26424: expected 2 fields, saw 3\nSkipping line 27358: expected 2 fields, saw 3\nSkipping line 28220: expected 2 fields, saw 3\nSkipping line 28221: expected 2 fields, saw 3\nSkipping line 28382: expected 2 fields, saw 3\nSkipping line 28734: expected 2 fields, saw 3\nSkipping line 29051: expected 2 fields, saw 3\nSkipping line 29056: expected 2 fields, saw 3\nSkipping line 29128: expected 2 fields, saw 3\nSkipping line 29183: expected 2 fields, saw 3\nSkipping line 44241: expected 2 fields, saw 3\nSkipping line 65686: expected 2 fields, saw 3\nSkipping line 67481: expected 2 fields, saw 3\nSkipping line 69168: expected 2 fields, saw 3\nSkipping line 70683: expected 2 fields, saw 3\nSkipping line 74874: expected 2 fields, saw 3\nSkipping line 76715: expected 2 fields, saw 3\nSkipping line 79939: expected 2 fields, saw 3\nSkipping line 79940: expected 2 fields, saw 3\nSkipping lin

b'Skipping line 2120457: expected 2 fields, saw 3\nSkipping line 2143498: expected 2 fields, saw 3\nSkipping line 2152938: expected 2 fields, saw 3\nSkipping line 2152939: expected 2 fields, saw 3\nSkipping line 2152940: expected 2 fields, saw 3\nSkipping line 2152942: expected 2 fields, saw 3\nSkipping line 2152943: expected 2 fields, saw 3\nSkipping line 2152944: expected 2 fields, saw 3\nSkipping line 2152945: expected 2 fields, saw 3\nSkipping line 2156181: expected 2 fields, saw 3\nSkipping line 2156939: expected 2 fields, saw 3\nSkipping line 2157870: expected 2 fields, saw 3\nSkipping line 2159388: expected 2 fields, saw 5\nSkipping line 2159627: expected 2 fields, saw 3\nSkipping line 2160286: expected 2 fields, saw 3\nSkipping line 2164938: expected 2 fields, saw 3\nSkipping line 2165243: expected 2 fields, saw 3\nSkipping line 2167702: expected 2 fields, saw 3\nSkipping line 2171276: expected 2 fields, saw 3\nSkipping line 2172138: expected 2 fields, saw 3\nSkipping line 2176

In [95]:
dataset_cities.head()

Unnamed: 0,CC_FIPS,FULL_NAME_ND
0,AN,Aixas
1,AN,Aixirivall
2,AN,Aixovall
3,AN,Andorra la Vella
4,AN,Ansalonga


In [96]:
print("Number of cities in the external dataset: {0}".format(len(dataset_cities)))

Number of cities in the external dataset: 2915558


In [124]:
remained_countries_backup = remained_countries.copy()
map_countries_backup = map_countries.copy()
len(map_countries_backup)

776

In [132]:
remained_countries = remained_countries_backup.copy()
map_countries = map_countries_backup.copy()
print_cleaning_status()

776 name of countries have been detected in uncleaned dataset
11581 name of countries have remained


In [125]:
def find_city(data):
    
    if len(data) <=2:
        # If length of data is less than 3 data is not valid
        return 0
    
    # map data with name of a city 
    output = dataset_cities[dataset_cities.FULL_NAME_ND.str.match(data, case=False, na=False)]
        
    if not len(output):
        # map data if it contains name of a city 
        output = dataset_cities[dataset_cities.FULL_NAME_ND.str.contains(data, case=False, na=False)]

    if len(output):
            return output.iloc[0].CC_FIPS
    return 0

def assign_country_code_using_city(row):
    output = find_city(row)
    if output:
        map_country(row, output)
        return True
    return False

In [126]:
seen = 0
interval = 2000

In [None]:
import statistics

remained_countries = remained_countries.reset_index(drop=True)

if len(remained_countries) < interval:
    interval = len(remained_countries)
for i in range(seen, seen + interval):
    if i%200 == 0:
        print(i)
    
    if assign_country_code_using_city(remained_countries[i]):
        remained_countries = remained_countries.drop([i])
        i -=1
        seen = i
print(seen)   

In [135]:
seen

352

2319 locations were matched with city names

629 locations were matched with city names

## Using City name (contain)

In [263]:
for j in range(len(dataset_cities)):
    output = countries[countries.str.contains(str(dataset_cities.iloc[j].City) + " ", case=False, na=False)]
    for i in range(len(output)):
        map_country(output.iloc[i], dataset_cities.iloc[j].Country)
        countries = countries.drop(countries[countries == output.iloc[i]].index[0])
        
    output = countries[countries.str.contains(" " + str(dataset_cities.iloc[j].City), case=False, na=False)]
    for i in range(len(output)):
        map_country(output.iloc[i], dataset_cities.iloc[j].Country)
        countries = countries.drop(countries[countries == output.iloc[i]].index[0])

5,326 name of countries contain name of one city.

In [273]:
print("{0} strings remained\n{1} strings mapped to a country".format(len(countries), len(map_countries)))

1318 strings remained
7151 strings mapped to a country


### In continue:

- Evaluate the country detection algorithm by manual checking of a sample of 100 entries
- Compute Edit distance for ~1k remained countries
    
    