## Libraries & Constants

In [1]:
import pandas as pd

In [2]:
#pd.options.display.max_rows = 200
pd.set_option('display.max_rows', None)

In [74]:
# Constants
DATA_PATH   = "../data/raw/wikivoyage-listings-en.csv"
EXPORT_DATASET2_PATH = "../data/processed/dataset2.csv"
EXPORT_DESC_PATH = "../data/processed/descriptions2.txt"
EXPORT_DATASET3_MINI_PATH = "../data/processed/dataset3_mini.csv"
EXPORT_DESC_MINI_PATH = "../data/processed/descriptions3_mini.txt"

cols_to_read = ["article", "type", "title", "description"] 

---

# Dataset Exploration <a class="anchor" id="one"></a>

---

## Original Dataset

In [4]:
original_dataset = pd.read_csv(DATA_PATH)

  dataset = pd.read_csv(DATA_PATH)


In [5]:
original_dataset.head(5)

Unnamed: 0,article,type,title,alt,wikidata,wikipedia,address,directions,phone,tollFree,...,checkIn,checkOut,image,price,latitude,longitude,wifi,accessibility,lastEdit,description
0,'s-Hertogenbosch,buy,Taxi TCO,,,,,,+31 412 484 41,,...,,,,,,,,,2015-03-01,
1,'s-Hertogenbosch,buy,Taxi de Hart,,,,,,+31 73 5112733,,...,,,,,,,,,2015-03-01,
2,'s-Hertogenbosch,see,Saint John's Cathedral,Sint Jans Kathedraal,,,,,,,...,,,,,51.68808,5.30814,,,2016-01-25,one of the most prominent landmarks of Den Bos...
3,'s-Hertogenbosch,see,The Moriaan,,,,,on the market square,,,...,,,,,51.68967,5.30261,,,2016-01-25,"the oldest brick building in The Netherlands, ..."
4,'s-Hertogenbosch,see,Town Hall,Stadhuis,,,Markt 1,south side of the market square,,,...,,,,,51.68846,5.30315,,,2016-01-25,The facade was built in the 17th century and r...


In [6]:
original_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280790 entries, 0 to 280789
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   article        280790 non-null  object 
 1   type           280720 non-null  object 
 2   title          280279 non-null  object 
 3   alt            37663 non-null   object 
 4   wikidata       2656 non-null    object 
 5   wikipedia      0 non-null       float64
 6   address        188910 non-null  object 
 7   directions     73242 non-null   object 
 8   phone          158066 non-null  object 
 9   tollFree       7028 non-null    object 
 10  email          34656 non-null   object 
 11  fax            18397 non-null   object 
 12  url            152413 non-null  object 
 13  hours          51017 non-null   object 
 14  checkIn        6796 non-null    object 
 15  checkOut       7343 non-null    object 
 16  image          7434 non-null    object 
 17  price          63123 non-null

**Missing values exist in most of the columns, hence cleaning is required.**

In [7]:
#Get the count of each city in the dataset
cities_count = pd.value_counts(original_dataset.article).to_frame()

In [8]:
#Reduce the list to cities with count > 150
reduced_cities = cities_count[cities_count.article > 150]
#Get the total count of dataframe samples in the reduced cities list
reduced_cities.article.sum() 

25068

In [9]:
reduced_cities

Unnamed: 0,article
Chennai,410
Bangalore,401
Amusement parks and water parks in eastern United States,368
Swansea,328
Buffalo/East Side,327
Delhi,319
Chiang Mai,315
Kochi,305
Sopron,294
Hyderabad,287


---

## Dataset 1 <a class="anchor" id="two"></a>

**In dataset 1, we only use the following columns from the original dataset & rename them:**
* "article": "City" 
* "type": "Category" 
* "title": "POI" 
* "description":"Description"

**Also, we drop any row that has any missing value.**

In [20]:
dataset1 = pd.read_csv(DATA_PATH, usecols = cols_to_read)

In [21]:
dataset1.rename(columns = {"article": "City", "type": "Category", "title": "POI", "description":"Description"}, inplace = True)

In [22]:
dataset1.head(5)

Unnamed: 0,City,Category,POI,Description
0,'s-Hertogenbosch,buy,Taxi TCO,
1,'s-Hertogenbosch,buy,Taxi de Hart,
2,'s-Hertogenbosch,see,Saint John's Cathedral,one of the most prominent landmarks of Den Bos...
3,'s-Hertogenbosch,see,The Moriaan,"the oldest brick building in The Netherlands, ..."
4,'s-Hertogenbosch,see,Town Hall,The facade was built in the 17th century and r...


In [23]:
dataset1 = dataset1.dropna().reset_index(drop=True)

In [24]:
dataset1.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213043 entries, 0 to 213042
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   City         213043 non-null  object
 1   Category     213043 non-null  object
 2   POI          213043 non-null  object
 3   Description  213043 non-null  object
dtypes: object(4)
memory usage: 99.9 MB


In [25]:
#Get the count of each city
cities_count1 = pd.value_counts(dataset1.City).to_frame()
#Reduce the list to cities with count > 150
reduced_cities1 = cities_count1[cities_count1.City > 150]
reduced_cities1

Unnamed: 0,City
Bangalore,312
Chennai,293
Chiang Mai,278
Delhi,275
Ann Arbor,270
Indianapolis,259
Sopron,258
Roman Empire,249
Oakland,242
Turku,241


In [26]:
#Get the total count of dataframe samples in the reduced cities list
reduced_cities1.City.sum()

12431

In [27]:
cities_list1 = reduced_cities1.index.to_list()
cities_list1

['Bangalore',
 'Chennai',
 'Chiang Mai',
 'Delhi',
 'Ann Arbor',
 'Indianapolis',
 'Sopron',
 'Roman Empire',
 'Oakland',
 'Turku',
 'Kochi',
 'Swansea',
 'Dublin',
 'Asheville',
 'Rochester (New York)',
 'Brisbane',
 'Las Vegas',
 'Győr',
 'Buffalo/East Side',
 'Birmingham (England)',
 'Buffalo/West Side',
 'Ho Chi Minh City',
 'Cincinnati',
 'Calgary',
 'Transdanubia',
 'Glasgow',
 'Austin',
 'Cape Town',
 'Hyderabad',
 'Aarhus',
 'Phnom Penh',
 'Brussels',
 'Portland (Oregon)',
 'Tampere',
 'Budapest/Pest',
 'Buffalo/North Buffalo',
 'Kaunas',
 'Madrid',
 'Saint Petersburg/Center',
 'Buffalo/Downtown',
 'Pattaya',
 'Vienna/Innere Stadt',
 'Kiev',
 'Budapest/Central Pest',
 'Venice',
 'Yellowstone National Park',
 'Albuquerque',
 'Valencia',
 'Berlin/Mitte',
 'Frankfurt',
 'Vientiane',
 'Beijing/Chaoyang',
 'Ubud',
 'Winnipeg',
 'Bucharest',
 'Baku',
 'Mombasa',
 'Lviv',
 'Montgomery County (Pennsylvania)',
 'Caldas da Rainha',
 'Phoenix',
 'Boracay']

#### Cairo City

In [28]:
#Look for Cairo city
cairo = dataset1[dataset1.City.str.contains("Cairo")]
cairo

Unnamed: 0,City,Category,POI,Description
28058,Cairo,sleep,Le Passage,was Iberotel
28059,Cairo,see,Egyptian Museum,Located in the Midan Tahrir area and officiall...
28060,Cairo,see,Ibn Tulun,"Arguably the oldest mosque in Cairo, built bet..."
28061,Cairo,see,Al-Azhar Park,A recently opened landscaped gardens overlooki...
28062,Cairo,see,Khan El Khalily,Cairo's souk area where visitors will find man...
28063,Cairo,see,Abdeen Palace,Located about one Kilometer away from the Mida...
28064,Cairo,see,Pharaonic Village,It is about twenty minutes driving from Downtown.
28065,Cairo,do,The Culture Wheel,The largest independent cultural centre in Cai...
28066,Cairo,do,The Garden Theater,In Al-Azhar Park offers a range of musical per...
28067,Cairo,do,Cairo Opera House,It hosted the Cairo International Film Festiva...


In [30]:
cairo_cities_list = list(cairo.City.unique())
cairo_cities_list

['Cairo',
 'Cairo (Illinois)',
 'Cairo/Dokki and Mohandiseen',
 'Cairo/Downtown',
 'Cairo/Garden City',
 'Cairo/Gezira',
 'Cairo/Giza',
 'Cairo/Heliopolis',
 'Cairo/Islamic Cairo',
 "Cairo/Ma'adi",
 'Cairo/Midan Ramses',
 'Cairo/Midan Tahrir',
 'Cairo/Old Cairo']

In [31]:
cairo_cities_list.remove("Cairo (Illinois)")

In [32]:
#No of dataset samples for Cairo city
len(cairo)

274

#### Dubai City

In [33]:
#Look for Dubai city
dubai = dataset1[dataset1.City.str.contains("Dubai")]
dubai

Unnamed: 0,City,Category,POI,Description
49428,Dubai,do,Yacht charter,An easy way to explore the man-made Palm Islan...
49429,Dubai,do,Burj Khalifa,Visit the tallest building in the world with t...
49430,Dubai,do,Hot Air Balloon Ride,Great fun seeing all the sand dunes and mounta...
49431,Dubai,do,Big Bus Company tour,"You can take a bus tour, both day time and nig..."
49432,Dubai,other,Neighbouring Sharjah,"while dry (no alcohol) and mostly suburban, ha..."
49433,Dubai,other,Abu Dhabi,"capital of the Emirates, is an entirely differ..."
49434,Dubai,other,The city of Al Ain,it is surprisingly a city of lush gardens and ...
49435,Dubai,other,peaceful Umm Al Quwain emirate,"if you want a cosy and relaxing environment, f..."
49436,Dubai,other,The Iranian island of Kish,s a free trade zone that does not require a visa.
49437,Dubai,other,Fjords of Musandam_Peninsula,Explore beautiful caves and enjoy the awesome ...


In [34]:
dubai_cities_list = list(dubai.City.unique())

In [35]:
full_cities_list = cities_list1 + cairo_cities_list + dubai_cities_list
full_cities_list

['Bangalore',
 'Chennai',
 'Chiang Mai',
 'Delhi',
 'Ann Arbor',
 'Indianapolis',
 'Sopron',
 'Roman Empire',
 'Oakland',
 'Turku',
 'Kochi',
 'Swansea',
 'Dublin',
 'Asheville',
 'Rochester (New York)',
 'Brisbane',
 'Las Vegas',
 'Győr',
 'Buffalo/East Side',
 'Birmingham (England)',
 'Buffalo/West Side',
 'Ho Chi Minh City',
 'Cincinnati',
 'Calgary',
 'Transdanubia',
 'Glasgow',
 'Austin',
 'Cape Town',
 'Hyderabad',
 'Aarhus',
 'Phnom Penh',
 'Brussels',
 'Portland (Oregon)',
 'Tampere',
 'Budapest/Pest',
 'Buffalo/North Buffalo',
 'Kaunas',
 'Madrid',
 'Saint Petersburg/Center',
 'Buffalo/Downtown',
 'Pattaya',
 'Vienna/Innere Stadt',
 'Kiev',
 'Budapest/Central Pest',
 'Venice',
 'Yellowstone National Park',
 'Albuquerque',
 'Valencia',
 'Berlin/Mitte',
 'Frankfurt',
 'Vientiane',
 'Beijing/Chaoyang',
 'Ubud',
 'Winnipeg',
 'Bucharest',
 'Baku',
 'Mombasa',
 'Lviv',
 'Montgomery County (Pennsylvania)',
 'Caldas da Rainha',
 'Phoenix',
 'Boracay',
 'Cairo',
 'Cairo/Dokki and Moha

In [36]:
full_cities_list.remove("Roman Empire")

In [37]:
len(full_cities_list)

79

---

## Dataset 2 <a class="anchor" id="three"></a>

**In dataset 2, we filter dataset 1 to have only the cities listed in "full_cities_list".**

In [38]:
dataset2 = dataset1[dataset1.City.isin(full_cities_list)].reset_index(drop=True)

In [39]:
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12658 entries, 0 to 12657
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   City         12658 non-null  object
 1   Category     12658 non-null  object
 2   POI          12658 non-null  object
 3   Description  12658 non-null  object
dtypes: object(4)
memory usage: 395.7+ KB


In [53]:
#EDA
#Show dataframe where city name is specified
dataset2[dataset2["City"] == "Bali"]

Unnamed: 0,City,Category,POI,Description,Country
10144,Bali,other,Bina Wisata,"Not especially helpful, but they always have a...",Indonesia
10145,Bali,see,Goa Gajah,The centerpiece here is a cave dating back to ...,Indonesia
10146,Bali,see,Gunung Kawi,"Dating from the eleventh century, this is pres...",Indonesia
10147,Bali,see,Pura Kehen,One of the most attractive temples in the whol...,Indonesia
10148,Bali,see,Puri Saren Agung,This was the palace of the kings of Ubud until...,Indonesia
10149,Bali,see,Tirta Empul,One of the holiest temples in Bali built aroun...,Indonesia
10150,Bali,see,Yeh Pulu,This complex of rock carvings is close to Goa ...,Indonesia
10151,Bali,see,"Chapel of Mother Goddess, Rajarajeshwari Tripu...",A modern chapel dedicated to the ''feminine as...,Indonesia
10152,Bali,see,Agung Rai Museum of Art,Showcases works by well known Balinese artists...,Indonesia
10153,Bali,see,Blanco Renaissance Museum,"Before he passed away in 1999, Spanish artist ...",Indonesia


In [41]:
pd.value_counts(dataset2.City)

Bangalore                           312
Chennai                             293
Chiang Mai                          278
Delhi                               275
Ann Arbor                           270
Indianapolis                        259
Sopron                              258
Oakland                             242
Turku                               241
Kochi                               238
Swansea                             231
Dublin                              228
Asheville                           225
Brisbane                            223
Rochester (New York)                223
Győr                                221
Las Vegas                           221
Buffalo/East Side                   220
Birmingham (England)                219
Buffalo/West Side                   212
Ho Chi Minh City                    211
Cincinnati                          208
Calgary                             207
Transdanubia                        206
Glasgow                             202


### Modify City Names <a class="anchor" id="three-one"></a>

In [42]:
#Modify cities names
city_rename_dict = {"Ann Arbor":"Michigan",
                   "Asheville":"North Carolina",
                   "Rochester (New York)":"New York",
                   "Buffalo/East Side":"New York",
                   "Birmingham (England)":"Birmingham",
                   "Buffalo/West Side":"New York",
                   "Austin":"Texas",
                   "Portland (Oregon)":"Oregon",
                   "Budapest/Pest":"Budapest",
                   "Buffalo/North Buffalo":"New York",
                   "Saint Petersburg/Center":"Saint Petersburg",
                   "Buffalo/Downtown":"New York",
                   "Vienna/Innere Stadt":"Vienna",
                   "Budapest/Central Pest":"Budapest",
                   "Albuquerque":"New Mexico",
                   "Berlin/Mitte":"Berlin",
                   "Ubud":"Bali",
                   "Beijing/Chaoyang":"Beijing",
                   "Montgomery County (Pennsylvania)":"Pennsylvania",
                   "Phoenix":"Arizona",
                   "Cairo/Downtown":"Cairo",
                   "Dubai/Jumeirah":"Dubai",
                   "Cairo/Gezira":"Cairo",
                   "Dubai/Deira":"Dubai",
                   "Dubai/Jebel Ali":"Dubai",
                   "Dubai/Bur Dubai":"Dubai",
                   "Dubai/Emirates Road":"Dubai",
                   "Cairo/Dokki and Mohandiseen":"Cairo",
                   "Cairo/Garden City":"Cairo",
                   "Cairo/Islamic Cairo":"Cairo",
                   "Cairo/Heliopolis":"Cairo",
                   "Cairo/Ma'adi":"Cairo",
                   "Cairo/Giza":"Cairo",
                   "Cairo/Midan Tahrir":"Cairo",
                   "Cairo/Old Cairo":"Cairo",
                   "Cairo/Midan Ramses":"Cairo"}

In [43]:
dataset2["City"] = dataset2["City"].replace(city_rename_dict)

### Add Country Column <a class="anchor" id="three-two"></a>

In [44]:
#Add Country column
city_country = {"Bangalore": "India", 
               "Chennai":"India",
               "Chiang Mai":"Thailand",
               "Delhi":"India",
               "New York":"USA",
               "Indianapolis":"USA",
               "Sopron":"Hungary",
               "Oakland":"USA",
               "Turku":"Finland",
               "Kochi":"India",
               "Swansea":"Wales",
               "Dublin":"Ireland",
               "North Carolina":"USA",
               "Brisbane":"Australia",
               "Győr":"Hungary",
               "Las Vegas":"USA",
               "Birmingham":"UK",
               "Ho Chi Minh City":"Vietnam",
               "Cincinnati":"USA",
               "Calgary":"Canada",
               "Transdanubia":"Hungary",
               "Glasgow":"Scotland",
               "Texas":"USA",
               "Cape Town":"South Africa",
               "Hyderabad":"India",
               "Aarhus":"Denmark",
               "Phnom Penh":"Cambodia",
               "Brussels":"Belgium",
               "Oregon":"USA",
               "Tampere":"Finland",
               "Budapest":"Hungary",
               "Kaunas":"Lithuania",
               "Saint Petersburg":"Russia",
               "Madrid":"Spain",
               "Pattaya":"Thailand",
               "Vienna":"Austria",
               "Kiev":"Ukraine",
               "Venice":"Italy",
               "Yellowstone National Park":"USA",
               "New Mexico":"USA",
               "Valencia":"Spain",
               "Berlin":"Germany",
               "Frankfurt":"Germany",
               "Vientiane":"Laos",
               "Bali":"Indonesia",
               "Beijing":"China",
               "Winnipeg":"Canada",
               "Bucharest":"Romania",
               "Baku":"Azerbaijan",
               "Mombasa":"Kenya",
               "Lviv":"Ukraine",
               "Pennsylvania":"USA",
               "Caldas da Rainha":"Portugal",
               "Boracay":"Philppines",
               "Arizona":"USA",
               "Cairo":"Egypt",
               "Dubai":"UAE",
               "Michigan":"USA",
               }

In [45]:
dataset2["Country"] = dataset2["City"].map(city_country)

In [46]:
dataset2.head()

Unnamed: 0,City,Category,POI,Description,Country
0,Aarhus,other,Rute 1000,"to Esbjerg, Kolding, København, Vejen, Vejle.",Denmark
1,Aarhus,other,Abildskou,They operate buses from Copenhagen (Valby Stat...,Denmark
2,Aarhus,other,Rødbillet,They operate buses to and from Copenhagen (nex...,Denmark
3,Aarhus,other,Eurolines,"They operate buses to Hamburg, from where you ...",Denmark
4,Aarhus,other,Mols Linien,They operate ferries to Odden and Kalundborg o...,Denmark


### Save Dataset2 <a class="anchor" id="three-three"></a>

In [48]:
#Saved once then commented out, to resave then uncomment & save

# dataset2.to_csv(EXPORT_DATASET2_PATH, index=False)

In [52]:
#Save Description Text in a text file
#Saved once then commented out, to resave then uncomment & save

# dataset2.Description.to_csv(EXPORT_DESC_PATH, sep=' ', index=False)

**Dataset 2 was built intentionally for initial ner annotation using spaCy pretrained model. But as manual NER correction & relations annotation consume a huge time, a mini datset is constructed to allow for lesser manual work & more effiecient full pipeline implementation accomplishment.**

---

## Dataset 3: Mini Dataset for NER Annotations

**In dataset 3, we create a smaller dataset that will be used for Named Entity Recognition annotations. Specific & familiar cities are selected.**

In [91]:
mini_cities_list = ["Cairo", "Abu Dhabi", "Dubai", "Beirut", "Alexandria", "Sharm el-Sheikh","New York", "Dublin", "Las Vegas", 
                    "Madrid", "Saint Petersburg", "Venice", "Delhi"]

# cairo_mini = dataset2[dataset2["City"] == "Cairo"]
# dubai_mini = dataset2[dataset2["City"] == "Dubai"]

* **We look for the selected cities & process the dataset to get a dataframe for each city then concatenate them in one dataframe.**
* **We check below if any city from the list has different formations for its name.**

In [77]:
#Show dataframe where city name is specified
dataset1[dataset1["City"].str.contains("Alexandria")]

Unnamed: 0,City,Category,POI,Description
2600,Alexandria,see,Citadel of Qaitbay,One of the icons of the city at a beautiful lo...
2601,Alexandria,see,Cemetery of Mostafa Kamel,The cemetery includes four tombs dating from t...
2602,Alexandria,see,Kom el-Shouqafa,Kom el-Shouqafa is the Arab translation of the...
2603,Alexandria,see,Pompey's Pillar,"An ancient monument, this 25-meter-high granit..."
2604,Alexandria,see,Roman Theatre,"Built in the 2nd century AD, this Roman amphit..."
2605,Alexandria,see,Montazah Palace,Built in 1892 by Abbas II of Egypt Abbas Hilmi...
2606,Alexandria,see,Tomb of the Unknown Soldier,Egypt has a Tomb of the Unknown Soldier honori...
2607,Alexandria,see,Ras el-Tin Palace,"Not open to visitors, alas."
2608,Alexandria,see,Alexandria National Museum,History Museum with more than 1800 archaeologi...
2609,Alexandria,see,Graeco-Roman Museum,A history museum with a vast collection mostly...


### Beirut City

In [55]:
#Look for Beirut city
beirut = dataset1[dataset1.City.str.contains("Beirut")]
beirut

Unnamed: 0,City,Category,POI,Description
15875,Beirut,see,National Museum Of Beirut,"About 1,300 artifacts are exhibited, ranging i..."
15876,Beirut,see,AUB Museum,"Archeology and History, the Middle East's olde..."
15877,Beirut,other,Atelier Camille Allam Beirut,Gallery and studio of painter sculptor and mus...
15878,Beirut,see,Beirut International Film Festival,"Held annually in October, the Beirut Film Fest..."
15879,Beirut,see,Beirut International Jazz Festival,Held annually during the month of July over a ...
15880,Beirut,see,Festival du cinéma francophone,Held between the month of March and April over...
15881,Beirut,do,NISD,"Solidere Beirut Marina, Downtown Beirut, P.O. ..."
15882,Beirut,do,LD,"Mardelli Building 2nd Floor, Saide Street Ashr..."
15883,Beirut,do,Atlantis,"Bel Azur Hotel, Jounieh, Greater Beirut."
15884,Beirut,buy,ABC Mall,Many international brands are here along with ...


In [56]:
beirut_cities_list = beirut.City.unique()

In [57]:
beirut_cities_list

array(['Beirut', 'Beirut/Ain El Mraiseh', 'Beirut/Ashrafieh',
       'Beirut/Downtown', 'Beirut/Hamra', 'Beirut/Jnah', 'Beirut/Rawcheh',
       'Beirut/Verdun'], dtype=object)

In [58]:
#Modify cities names
beirut_rename_dict = {
    'Beirut':"Beirut", 
    'Beirut/Ain El Mraiseh':"Beirut", 
    'Beirut/Ashrafieh':"Beirut",
    'Beirut/Downtown':"Beirut", 
    'Beirut/Hamra':"Beirut", 
    'Beirut/Jnah':"Beirut", 
    'Beirut/Rawcheh':"Beirut",
    'Beirut/Verdun':"Beirut"
}

In [59]:
beirut["City"] = beirut["City"].replace(beirut_rename_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  beirut["City"] = beirut["City"].replace(beirut_rename_dict)


In [66]:
#Add Country column
beirut_city_country = {"Beirut": "Lebanon"}
beirut["Country"] = beirut["City"].map(beirut_city_country)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  beirut["Country"] = beirut["City"].map(beirut_city_country)


In [67]:
beirut

Unnamed: 0,City,Category,POI,Description,Country
15875,Beirut,see,National Museum Of Beirut,"About 1,300 artifacts are exhibited, ranging i...",Lebanon
15876,Beirut,see,AUB Museum,"Archeology and History, the Middle East's olde...",Lebanon
15877,Beirut,other,Atelier Camille Allam Beirut,Gallery and studio of painter sculptor and mus...,Lebanon
15878,Beirut,see,Beirut International Film Festival,"Held annually in October, the Beirut Film Fest...",Lebanon
15879,Beirut,see,Beirut International Jazz Festival,Held annually during the month of July over a ...,Lebanon
15880,Beirut,see,Festival du cinéma francophone,Held between the month of March and April over...,Lebanon
15881,Beirut,do,NISD,"Solidere Beirut Marina, Downtown Beirut, P.O. ...",Lebanon
15882,Beirut,do,LD,"Mardelli Building 2nd Floor, Saide Street Ashr...",Lebanon
15883,Beirut,do,Atlantis,"Bel Azur Hotel, Jounieh, Greater Beirut.",Lebanon
15884,Beirut,buy,ABC Mall,Many international brands are here along with ...,Lebanon


### Alexandria city

In [80]:
alex = dataset1[dataset1.City == "Alexandria"]

In [84]:
#Add Country column
alex_city_country = {"Alexandria": "Egypt"}
alex["Country"] = alex["City"].map(alex_city_country)
alex

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alex["Country"] = alex["City"].map(alex_city_country)


Unnamed: 0,City,Category,POI,Description,Country
2600,Alexandria,see,Citadel of Qaitbay,One of the icons of the city at a beautiful lo...,Egypt
2601,Alexandria,see,Cemetery of Mostafa Kamel,The cemetery includes four tombs dating from t...,Egypt
2602,Alexandria,see,Kom el-Shouqafa,Kom el-Shouqafa is the Arab translation of the...,Egypt
2603,Alexandria,see,Pompey's Pillar,"An ancient monument, this 25-meter-high granit...",Egypt
2604,Alexandria,see,Roman Theatre,"Built in the 2nd century AD, this Roman amphit...",Egypt
2605,Alexandria,see,Montazah Palace,Built in 1892 by Abbas II of Egypt Abbas Hilmi...,Egypt
2606,Alexandria,see,Tomb of the Unknown Soldier,Egypt has a Tomb of the Unknown Soldier honori...,Egypt
2607,Alexandria,see,Ras el-Tin Palace,"Not open to visitors, alas.",Egypt
2608,Alexandria,see,Alexandria National Museum,History Museum with more than 1800 archaeologi...,Egypt
2609,Alexandria,see,Graeco-Roman Museum,A history museum with a vast collection mostly...,Egypt


### Sharm El-Sheikh City

In [87]:
sharm = dataset1[dataset1.City == "Sharm el-Sheikh"]

In [90]:
#Add Country column
sharm_city_country = {"Sharm el-Sheikh": "Egypt"}
sharm["Country"] = sharm["City"].map(sharm_city_country)
sharm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharm["Country"] = sharm["City"].map(sharm_city_country)


Unnamed: 0,City,Category,POI,Description,Country
154024,Sharm el-Sheikh,do,Oonas Dive Center,"A small, friendly Dive Centre at the quieter e...",Egypt
154025,Sharm el-Sheikh,do,Divers International,PADI five star diving centres offering daily d...,Egypt
154026,Sharm el-Sheikh,do,Stables at Sofitel Hotel,Helpful personnel. Require helmets (provided);...,Egypt
154027,Sharm el-Sheikh,do,Tiba Safari / Tiba Trip,Sold by many agencies. Groups are 10-15 bikes;...,Egypt
154028,Sharm el-Sheikh,buy,Carrefour Express,one of the classic European shops with price t...,Egypt
154029,Sharm el-Sheikh,eat,El Masrien,Offering the finest Egyptian barbeque cuisine ...,Egypt
154030,Sharm el-Sheikh,eat,Fares Seafood,All kinds of seafood fresh from the Red Sea.,Egypt
154031,Sharm el-Sheikh,eat,Fawanes Cafe,Lebanese cuisine. Good place for waterpipes; o...,Egypt
154032,Sharm el-Sheikh,eat,Onions,"Fusion food, good services and reasonable prices.",Egypt
154033,Sharm el-Sheikh,eat,Safsafa,Probably one of the best fish cafes in the tow...,Egypt


## Dataset 3

In [92]:
dataset_temp = dataset2[dataset2.City.isin(mini_cities_list)].reset_index(drop=True)
dataset_mini = pd.concat([dataset_temp, beirut, alex, sharm], axis = 0, ignore_index = True)

In [93]:
dataset_mini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935 entries, 0 to 2934
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   City         2935 non-null   object
 1   Category     2935 non-null   object
 2   POI          2935 non-null   object
 3   Description  2935 non-null   object
 4   Country      2935 non-null   object
dtypes: object(5)
memory usage: 114.8+ KB


In [97]:
dataset_mini.tail()

Unnamed: 0,City,Category,POI,Description,Country
2930,Sharm el-Sheikh,sleep,Hyatt Regency Sharm El Sheikh,5 star resort style hotel with 439 rooms and s...,Egypt
2931,Sharm el-Sheikh,sleep,Noria Resort,It has been built in traditional Roman Style. ...,Egypt
2932,Sharm el-Sheikh,sleep,Ritz Carlton,"Spa, diving and snorkelling from the hotel.",Egypt
2933,Sharm el-Sheikh,sleep,The Cleopatra Luxury Resort Collection,"Spa, diving and snorkelling from the hotel.",Egypt
2934,Sharm el-Sheikh,sleep,Jaz Mirabel Beach,az Mirabel Beach offers stunning views of the ...,Egypt


## Export Dataset 3 & its Descriptions

In [98]:
#Save Description Text in a text file
#Saved once then commented out, to resave then uncomment & save

# dataset_mini.to_csv(EXPORT_DATASET3_MINI_PATH, index=False)
# dataset_mini.Description.to_csv(EXPORT_DESC_MINI_PATH, sep=' ', index=False)