In [40]:
import requests
import unittest
import re
import numpy as np
import pandas as pd

pd.get_option("display.max_columns")
print(pd.get_option("display.max_columns"))


20


In [42]:
def is_type_str(text):
    return (type(text) == str)
 

def contains_a_number(text):
    '''return true or false if a string has a number'''
    if is_type_str(text):
        pattern = re.compile(r'(\d)')
        return bool(re.search(pattern, text))

def is_a_postcode(text):
    '''return true or false if a string is a postcode'''
    if is_type_str(text):
        upper_text = text.upper()
        if contains_a_number(upper_text):
            full_post_code = re.compile(r'^[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}$')
            partial_post_code = re.compile(r'^[A-Z]{1,2}[A-Z0-9]{1,2}$')
            return (bool(re.search(full_post_code, upper_text)) or bool(re.search(partial_post_code, upper_text)))
        return False
    return False
    


def add_location_to_a_postcode(post_code):
    '''add a location when a postcode is given'''
    url = f'https://api.postcodes.io/postcodes?q={post_code}'
    response = requests.get(url)
    if (response.status_code == 200):
        location = response.json()['result'][0]
        if location['region']:
            return location['region']
        else:
            return location['country']
        
    return response.status_code


def remove_special_characters(text):
    '''return a string without special characters except for ' and - '''
    return re.sub(r"[^a-z-A-Z-0-9\-|']", " ", text)


def remove_uk_in_string(text):
    '''remove the word UK from a string and add a white space instead'''
    return (re.sub(r"^(UK) | UK$", " ", remove_special_characters(text).upper())).lower().strip()


def format_string(text):
    '''return a string with the format required'''
    return text.lower().title().strip()


def replace_uk_with_unknown(text):
    '''replace UK with unknown'''
    if bool(re.search(r"^UK$", text.upper())):
        return 'unknown'
    return text


def csv_to_data_frame(file_name):  
    """ function to covert a csv into a pandas DataFrame"""
    return pd.read_csv(file_name, engine='python')  


def has_uk(text):
    """ return to True or False if UK in a text""" 
    if type(text) == str:
        return ('UK' in text.upper())
    


In [43]:
import unittest

class TestIfStringContainsANumber(unittest.TestCase):

    def test_return_true_if_string_contains_a_number(self):
        text = 'HS2 0ST'
        self.assertEqual(contains_a_number(text), True)
    def test_return_false_if_string_does_not_contains_a_number(self):
        text = 'AST'
        self.assertEqual(contains_a_number(text), False)

class TestRemoveSpecialCharacteresFromAString(unittest.TestCase):

    def test_remove_special_characters(self):
        text = 'surrey, UK'
        self.assertEqual(remove_special_characters(text), 'surrey  UK')

    def test_does_not_remove_quotation_marks(self):
        text = "King's Ash"
        self.assertEqual(remove_special_characters(text), "King's Ash")

    def test_does_not_remove_hyphen(self):
        text = "Caister-on-Sea"
        self.assertEqual(remove_special_characters(text), "Caister-on-Sea")
    
class TestFormatString(unittest.TestCase): 
    def test_format_string(self):   
        text = "CaisTer-on-Sea"
        self.assertEqual(format_string(text), "Caister-On-Sea")

class TestIsAPostCode(unittest.TestCase):

    def test_return_true_when_text_is_a_full_postcode(self):
        text = 'HS2 0ST'
        self.assertEqual(is_a_postcode(text), True)

    def test_return_true_when_text_is_a_partial_postcode(self):
        text = 'HS2'
        self.assertEqual(is_a_postcode(text), True)

    def test_return_false_when_text_has_only_numbers(self):
        text = '11'
        self.assertEqual(is_a_postcode(text), False)

    def test_return_false_when_is_not_a_post(self):
        text = 'MARACAY23'
        self.assertEqual(is_a_postcode(text), False)

class TestUKIsRemoveFromASentence(unittest.TestCase):

    def test_remove_uk_from_a_sentence_with_a_special_character(self):
        text = 'surrey,UK'
        self.assertEqual(remove_uk_in_string(text), 'surrey')

    def test_remove_uk_from_a_sentence_with_white_spaces(self):
        text = 'surrey , UK'
        self.assertEqual(remove_uk_in_string(text), 'surrey')
        
    def test_remove_uk_from_a_sentence_when_it_is_at_the_beginning(self):
        text = 'uk gloucester'
        self.assertEqual(remove_uk_in_string(text), 'gloucester')

    def test_return_uk_when_uk_if_not_in_a_sentence (self):
        text = 'uk'
        self.assertEqual(remove_uk_in_string(text), 'uk')

    def test_return_same_string_when_uk_is_in_a_word__1(self):
        text = 'ukra'
        self.assertEqual(remove_uk_in_string(text), 'ukra')

    def test_return_same_string_when_uk_is_in_a_word_2(self):
        text = 'st luke'
        self.assertEqual(remove_uk_in_string(text), 'st luke')
        
class TestUKIsReplaceWithUnknown(unittest.TestCase):

    def test_replace_uk_with_unknown_when_uk_is_not_in_a_sentence(self):
        text = 'UK'
        self.assertEqual(replace_uk_with_unknown(text), 'unknown')

    def test_return_original_string_if_uk_is_in_a_sentence(self):
        text = 'surrey,UK'
        self.assertEqual(replace_uk_with_unknown(text), 'surrey,UK')


                 
        
        
       
        
        

unittest.main(argv=[''], verbosity=1, exit=False)


..................
----------------------------------------------------------------------
Ran 18 tests in 0.008s

OK


<unittest.main.TestProgram at 0x10c60a2b0>

In [5]:
#Convert csv file into a data frame 

data = csv_to_data_frame('scraped_csv_one.csv')

#Check that if has been created 

type(data)


pandas.core.frame.DataFrame

In [6]:
data.head()

Unnamed: 0,content,location,date
0,Bt are awful - terrible communication. I wante...,TOTNES,2021-11-22
1,Very poor service inspite of being with BT for...,,
2,Absolutely rubbish customer service,wakefield,2021-11-19
3,Shambles of a company. I couldn`t use the Broa...,Stockport,2021-11-14
4,The service they provide is not close to that ...,Kirkcaldy,2021-11-14


In [7]:
#Amount of columns and rows

print(data.shape)


(6031, 3)


In [8]:
data.describe()


Unnamed: 0,content,location,date
count,6028,3171,3228
unique,5909,1458,1824
top,Bt are awful - terrible communication. I wante...,London,2016-02-29
freq,2,275,9


In [46]:
# select the location column to check the unique values and see the kind of data that we have in it 

unique_locations = data['location'].dropna().unique()
print(len(unique_locations))

for i in unique_locations:
    print(i)




TOTNES
wakefield
Stockport
Kirkcaldy
Surrey, UK
Reading
Sutton Coldfield
york
Peterborough
Scotland
UK
ABINGDON
London
St Albans
london
scotland
Liverpool
Somerset
Oxfordshire
Essex
Livingston
Bishop auckland
Great Missenden
Belfast
Edinburgh
Brentwood, UK
Brownhills
Glos
FAREHAM
Sheffield
Folkestone
Bradford
Birmingham
Datchet
Preston
Cleveleys
Tunstall
Aberdeen
Uckfield
NEWCASTLE
Paisley
Solihull
liverpool uk
South Elmsall
East Sussex
Hastings
Wrexham
High Wycombe
Penzance
Ormskirk
Kent
WEST WICKHAM
Colchester
Bristol
Caerphilly
Penkridge
Mordoor
Woking
Telford
Bangor
Chelmsford
Clacton
CM1 2EH
Littleborough
North Tyneside
AL5
Thetford
Cardiff
Northumberland
Northampton
Northamptonshire
Brighton
Warrington
Bathgate
London W2
Wirral
Stroud
Ig6 3uf
Cheshire
Derbyshire
Rhiddlan
Southampton
Southend on Sea
MINCHINHAMPTON
Enniskillen
Camrbridge
KETTERING
Burnley
Bath
north west
London NW9
Bournemouth
Wales
Forest Hill
Hambleton
Kelso
Cirencester
Archway Lodge  Stable Yard  Mentmore
hednes

As you can see we have a mix of locations so we will start the cleaning by first creating a new column called 'formated_location' where we will save all the changes applied.

In [10]:
# Create a new column called formated_location where we will aplly all the functions described above to the locations. 

data['formated_location'] = data['location']


#Print data frame to check that the new column was added


print(data.head())


        


                                             content   location        date  \
0  Bt are awful - terrible communication. I wante...     TOTNES  2021-11-22   
1  Very poor service inspite of being with BT for...       None        None   
2                Absolutely rubbish customer service  wakefield  2021-11-19   
3  Shambles of a company. I couldn`t use the Broa...  Stockport  2021-11-14   
4  The service they provide is not close to that ...  Kirkcaldy  2021-11-14   

  formated_location  
0            TOTNES  
1              None  
2         wakefield  
3         Stockport  
4         Kirkcaldy  


In [23]:
#Let's check all the locations with special characters including spaces:
location_columns = ['location','formated_location']

locations_with_special_characters=data[data['formated_location'].str.contains("\W", na=False, regex=True)]
print(locations_with_special_characters[location_columns])
#data['formated_location'] = data['formated_location'].apply(lambda x : format_string(x) if(type(x) == str) else x ) 
for i in locations_with_special_characters['location']:
    print(i)


                             location                formated_location
9                          Surrey, UK                       Surrey  UK
11                   Sutton Coldfield                 Sutton Coldfield
22                          St Albans                        St Albans
43                    Bishop auckland                  Bishop auckland
44                    Great Missenden                  Great Missenden
...                               ...                              ...
6021                 fakenham norfolk                 fakenham norfolk
6023  Bridgend Mid Glamorgan Wales UK  Bridgend Mid Glamorgan Wales UK
6024        Lynemouth, Northumberland        Lynemouth  Northumberland
6027                Egremont, Cumbria                Egremont  Cumbria
6028                 Barnstaple Devon                 Barnstaple Devon

[652 rows x 2 columns]
Surrey, UK
Sutton Coldfield
St Albans
Bishop auckland
Great Missenden
Brentwood, UK
Surrey, UK
Sutton Coldfield
St Albans
Bi

In [21]:
#Let's change the value of some specific rows that don't represent a real location

data['formated_location']=data['formated_location'].str.replace("Don’t", "unknown",case=False, regex=True)
data['formated_location']=data['formated_location'].str.replace("Ky154gxgordon.kidd@gmail.com", "unknown",case=False, regex=True)

In [25]:
#Let's remove special characteres but avoiding the ones that belong to real locations, let's use the function remove_special_characters
data['formated_location'] = data['formated_location'].apply(lambda x : remove_special_characters(x) if(type(x) == str) else x )

In [10]:
#Let's check if the location column has a location with uk or united kingdom  

uk_values = data.loc[data['location'].isin(['uk','UK','United Kingdom', 'Uk','uK','U K', 'united kingdom'])]

print(uk_values[location_columns])

            location formated_location
15                UK                UK
110               UK                UK
500               UK                UK
539               UK                UK
1118              Uk                Uk
1249              UK                UK
1273              Uk                Uk
1287  United Kingdom    United Kingdom
1299              UK                UK
1401              UK                UK
1424              UK                UK
1448              uk                uk
1723              uk                uk
1830              UK                UK
1888              Uk                Uk
2062  United Kingdom    United Kingdom
2155              Uk                Uk
2182  United Kingdom    United Kingdom
2391              UK                UK
2495              UK                UK
3170  United Kingdom    United Kingdom
3903              Uk                Uk
4246              UK                UK
4317              Uk                Uk
4445              UK     

As we can see there are several locations with United Kingdom and UK on them so let's replace them with unknown as this value (UK or United Kingdom ) is not specific enough. 

In [31]:
#Let's start by changing the all the values that only contains 'UK' to unknown using the function replace_uk_with_unknown

data['formated_location'] = data['formated_location'].apply(lambda x: replace_uk_with_unknown(x) if(type(x) == str) else x ) 

#Check changes for the rows with UK values 

uk_replaced= data.loc[data['location'].isin(['uk','UK', 'Uk','uK','U K'])]

print(uk_replaced[location_columns])




     location formated_location
15         UK           unknown
110        UK           unknown
500        UK           unknown
539        UK           unknown
1118       Uk           unknown
1249       UK           unknown
1273       Uk           unknown
1299       UK           unknown
1401       UK           unknown
1424       UK           unknown
1448       uk           unknown
1723       uk           unknown
1830       UK           unknown
1888       Uk           unknown
2155       Uk           unknown
2391       UK           unknown
2495       UK           unknown
3903       Uk           unknown
4246       UK           unknown
4317       Uk           unknown
4445       UK           unknown
4962       Uk           unknown
4970       Uk           unknown


As we can see now the rows with UK values in the formated_location column have been changed to unknown 

In [33]:
#Let's now replace the rows that have United Kingdom

data['formated_location']=data['formated_location'].str.replace("Birmingham United Kingdom", "Birmingham",case=False, regex=True)

data['formated_location']=data['formated_location'].str.replace("United Kingdom", "unknown",case=False, regex=True)

#Check changes

united_kingdom_replaced= data.loc[data['location'].isin(['United Kingdom'])]

print(united_kingdom_replaced[location_columns])


            location formated_location
1287  United Kingdom           unknown
2062  United Kingdom           unknown
2182  United Kingdom           unknown
3170  United Kingdom           unknown
5112  United Kingdom           unknown
5203  United Kingdom           unknown
5254  United Kingdom           unknown
5471  United Kingdom           unknown
5491  United Kingdom           unknown
5575  United Kingdom           unknown
5623  United Kingdom           unknown
5643  United Kingdom           unknown
5703  United Kingdom           unknown
6003  United Kingdom           unknown


In [34]:
#Let's check the locations with UK in a sentence

uk_in_sentences=data[data['location'].str.contains("UK| UK|United Kingdom|Uk|uK|U K|united kingdom", na=False)]

print(uk_in_sentences['location'])

9                            Surrey, UK
15                                   UK
67                        Brentwood, UK
104                          Surrey, UK
110                                  UK
                     ...               
5982                      Wiltshire, UK
5999                         Swansea,UK
6003                     United Kingdom
6010                  Staffordshire, UK
6023    Bridgend Mid Glamorgan Wales UK
Name: location, Length: 65, dtype: object


In [35]:
#Lets now replace the rows where UK is in a sentence using remove_uk_in_string

data['formated_location'] = data['formated_location'].apply(lambda x: remove_uk_in_string(x) if(type(x) == str) else x ) 

#Check changes

uk_in_sentences=data[data['location'].str.contains("UK|UK |Uk|uK| U K", na=False)]

print(uk_in_sentences[['location','formated_location']])



                             location             formated_location
9                          Surrey, UK                        surrey
15                                 UK                       unknown
67                      Brentwood, UK                     brentwood
104                        Surrey, UK                        surrey
110                                UK                       unknown
162                     Brentwood, UK                     brentwood
500                                UK                       unknown
516                      Faversham UK                     faversham
539                                UK                       unknown
561                        London, UK                        london
777                Gloucestershire UK               gloucestershire
1073                       UK Swindon                       swindon
1118                               Uk                       unknown
1249                               UK           

In [36]:
# Let's Get the location that have numbers:

locations_with_numbers=data[data['location'].str.contains("\d", na=False, regex=True)]

print(locations_with_numbers[location_columns])

                                 location                    formated_location
259                               CM1 2EH                              cm1 2eh
292                                   AL5                                  al5
315                             London W2                            london w2
326                               Ig6 3uf                              ig6 3uf
366                            London NW9                           london nw9
406                            London SW9                           london sw9
411                               B14 6DN                              b14 6dn
449                              Wrexham1                             wrexham1
502                            London EC1                           london ec1
757                                M124nt                               m124nt
837                              cf44 8ll                             cf44 8ll
886                               BT403TR           

In [44]:
#Let's assign a location when we have only postcodes:
data['formated_location'] = data['formated_location'].apply(lambda x: add_location_to_a_postcode(x) if(is_a_postcode(x)) else x ) 

# Let's check the changes

locations_with_numbers=data[data['location'].str.contains("\d", na=False, regex=True)]

print(locations_with_numbers[location_columns])

                                 location                    formated_location
259                               CM1 2EH                      East of England
292                                   AL5                      East of England
315                             London W2                            london w2
326                               Ig6 3uf                               London
366                            London NW9                           london nw9
406                            London SW9                           london sw9
411                               B14 6DN                        West Midlands
449                              Wrexham1                             wrexham1
502                            London EC1                           london ec1
757                                M124nt                           North West
837                              cf44 8ll                                Wales
886                               BT403TR           

               location  formated_location
0                TOTNES             totnes
2             wakefield          wakefield
3             Stockport          stockport
4             Kirkcaldy          kirkcaldy
9            Surrey, UK             surrey
...                 ...                ...
6026               Leek               leek
6027  Egremont, Cumbria  egremont  cumbria
6028   Barnstaple Devon   barnstaple devon
6029             Rosyth             rosyth
6030             London             london

[3171 rows x 2 columns]


In [45]:
#Let's apply lowercase and title for all the locations
data['formated_location'] = data['formated_location'].apply(lambda x : format_string(x) if(type(x) == str) else x ) 

In [48]:
 # select the 'formated_location' column to check the unique values and see the kind of data that we have in it 

unique_locations = data['formated_location'].dropna().unique()
print(len(unique_locations))

for i in unique_locations:
    print(i)


1184
Totnes
Wakefield
Stockport
Kirkcaldy
Surrey
Reading
Sutton Coldfield
York
Peterborough
Scotland
Unknown
Abingdon
London
St Albans
Liverpool
Somerset
Oxfordshire
Essex
Livingston
Bishop Auckland
Great Missenden
Belfast
Edinburgh
Brentwood
Brownhills
Glos
Fareham
Sheffield
Folkestone
Bradford
Birmingham
Datchet
Preston
Cleveleys
Tunstall
Aberdeen
Uckfield
Newcastle
Paisley
Solihull
South Elmsall
East Sussex
Hastings
Wrexham
High Wycombe
Penzance
Ormskirk
Kent
West Wickham
Colchester
Bristol
Caerphilly
Penkridge
Mordoor
Woking
Telford
Bangor
Chelmsford
Clacton
East Of England
Littleborough
North Tyneside
Thetford
Cardiff
Northumberland
Northampton
Northamptonshire
Brighton
Warrington
Bathgate
London W2
Wirral
Stroud
Cheshire
Derbyshire
Rhiddlan
Southampton
Southend On Sea
Minchinhampton
Enniskillen
Camrbridge
Kettering
Burnley
Bath
North West
London Nw9
Bournemouth
Wales
Forest Hill
Hambleton
Kelso
Cirencester
Archway Lodge  Stable Yard  Mentmore
Hednesford
Allington Maidstone
Leeds


Rbwm
Ky154Gxgordon Kidd Gmail Com
Mk
Abo
Ayr
Don T
No
U K
G
Llalala
Were
Sbsh
Ki
Ni
Noke