In [1]:
import os
import pandas as pd
import re

In [33]:
def remove_link_from_context(context):
    """Strip every http/https URL from ``context`` and normalise whitespace.

    Args:
        context: Text that may contain URLs.

    Returns:
        The text with each URL removed and every run of whitespace
        (including leading and trailing whitespace) collapsed to one space.
    """
    without_urls = re.sub(r'https?://\S+', '', context)
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so re-joining closes the gaps the URLs left behind.
    tokens = without_urls.split()
    return ' '.join(tokens)

def extract_info_from_file(file_path):
    """Parse one article file into its title, link and body text.

    The file is expected to contain a ``TITLE:`` field followed by a
    ``LINK:`` field. Everything after the ``LINK:`` marker is treated as the
    context, with URLs stripped by ``remove_link_from_context``.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(title, link, context)`` tuple. Each element is ``None`` when the
        corresponding part could not be found.
    """
    # Read everything up front so the file is closed before any parsing.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Title: the text between "TITLE:" and the following "LINK:" marker.
    title_match = re.search(r'TITLE:\s*(.*?)\s*LINK:', text, re.IGNORECASE)
    title = title_match.group(1).strip() if title_match else None

    # Link: the first http/https URL that directly follows a "LINK:" marker.
    link_match = re.search(r'LINK:\s*(https?://\S+)', text, re.IGNORECASE)
    link = link_match.group(1).strip() if link_match else None

    # Context: everything after the marker. The exact-case marker is tried
    # first; if it is absent, fall back to a case-insensitive search so a
    # file using e.g. "Link:" (which the two regexes above already accept)
    # does not end up with a title and link but no context.
    context_start = text.find("LINK:")
    if context_start == -1:
        marker_match = re.search(r'LINK:', text, re.IGNORECASE)
        if marker_match:
            context_start = marker_match.start()

    if context_start != -1:
        context = remove_link_from_context(
            text[context_start + len("LINK:"):].strip()
        )
    else:
        context = None

    return title, link, context

def parse_folder(folder_path):
    """Collect title/link/context records from every ``.txt`` file in a folder.

    Args:
        folder_path: Directory to scan (sub-directories are not descended).

    Returns:
        A list with one ``{'Title', 'Link', 'Context'}`` dict per ``.txt``
        file, in the order ``os.listdir`` yields the file names.
    """
    records = []
    txt_names = [name for name in os.listdir(folder_path) if name.endswith('.txt')]
    for name in txt_names:
        title, link, context = extract_info_from_file(os.path.join(folder_path, name))
        records.append({'Title': title, 'Link': link, 'Context': context})
    return records

# Directory holding the article .txt files, relative to the notebook's
# working directory; change this to point at your own folder.
folder_path = 'data'
parsed_data = parse_folder(folder_path)

# One row per parsed file, with columns Title, Link and Context.
df = pd.DataFrame(parsed_data)

# Last expression in the cell, so the notebook renders the frame.
df


Unnamed: 0,Title,Link,Context
0,Krewe du Vieux 2018: Take a virtual tour of th...,http://www.nola.com/mardi_gras_nola/2018/01/ta...,This area formed the heart of the plantation o...
1,Turkey shells Syrian city as it pushes into Ku...,http://www.eastoregonian.com/turkey-shells-syr...,BEIRUT (AP) � A Kurdish militia spokesman says...
2,"A new development in Gainesville, Va., provide...",https://www.washingtonpost.com/realestate/a-ta...,Southern design Buy Photo : Exteriors are vary...
3,Laura Ingraham takes an Easter break amid Davi...,http://bangordailynews.com/2018/03/31/news/nat...,"Cleve R. Wootson Jr., The Washington Post Marc..."
4,Perm Secretary Launches Book for Africa’s Deve...,http://www.thisdaylive.com/index.php/2018/03/3...,"Former Permanent Secretary, Federal Ministry o..."
...,...,...,...
195,Incoming MFSA chairman accuses Deutsche Bank o...,https://www.maltatoday.com.mt/news/national/85...,Incoming Malta Financial Services Authority ch...
196,EW Nutrition and ICON Form a Strategic Partner...,http://www.adnkronos.com/immediapress/pr-newsw...,EW Nutrition acquires 51% of the shares in ICO...
197,A question of diversity: how fashion is ditchi...,http://www.watoday.com.au/world/a-question-of-...,"""We want to be held accountable,"" she said in ..."
198,Altnagelvin’s new health and well-being hub opens,https://www.derryjournal.com/news/altnagelvin-...,A new Health and Wellbeing Campus has opened a...


In [37]:
# Print the full, untruncated Context text of the row labelled 2.
print(df.loc[2, "Context"])

Southern design Buy Photo : Exteriors are varying shades of brick; window frames are desert sand colored. “Color packages were developed strictly for this community,” Green-Karol said. “The style looks a little like the architecture in Charleston,” said Sue Barusefski, a resident since September. “They tried to bring in that Southern influence in the design.” The residences are three levels. Entrance doors are on the ground floor or lower level adjacent to the garage and lead to a clubroom that can be converted to a guest bedroom. A door opens to a brick-paved outdoor terrace and fenced back yard. Barusefski turned the lower level “into the office I always dreamed of,” she said. “There’s room for deep shelves, a big desk and a credenza. I put in a dry bar where I put my Keurig and a beverage fridge. This has definitely made it a place you can stay till you’re finished with what you’re doing.” The Garays, who have six grandchildren, use the lower level as a rec room. Only one has visite

In [38]:
# Persist the parsed articles. The index is written too (pandas default), which
# is why reloading this file shows an extra "Unnamed: 0" column.
df.to_csv("geolocation_data.csv")

In [2]:
# Reload the saved articles from disk; the index stored in the CSV comes back
# as a regular "Unnamed: 0" column.
df = pd.read_csv("geolocation_data.csv")
df

Unnamed: 0.1,Unnamed: 0,Title,Link,Context
0,0,Krewe du Vieux 2018: Take a virtual tour of th...,http://www.nola.com/mardi_gras_nola/2018/01/ta...,This area formed the heart of the plantation o...
1,1,Turkey shells Syrian city as it pushes into Ku...,http://www.eastoregonian.com/turkey-shells-syr...,BEIRUT (AP) � A Kurdish militia spokesman says...
2,2,"A new development in Gainesville, Va., provide...",https://www.washingtonpost.com/realestate/a-ta...,Southern design Buy Photo : Exteriors are vary...
3,3,Laura Ingraham takes an Easter break amid Davi...,http://bangordailynews.com/2018/03/31/news/nat...,"Cleve R. Wootson Jr., The Washington Post Marc..."
4,4,Perm Secretary Launches Book for Africa’s Deve...,http://www.thisdaylive.com/index.php/2018/03/3...,"Former Permanent Secretary, Federal Ministry o..."
...,...,...,...,...
195,195,Incoming MFSA chairman accuses Deutsche Bank o...,https://www.maltatoday.com.mt/news/national/85...,Incoming Malta Financial Services Authority ch...
196,196,EW Nutrition and ICON Form a Strategic Partner...,http://www.adnkronos.com/immediapress/pr-newsw...,EW Nutrition acquires 51% of the shares in ICO...
197,197,A question of diversity: how fashion is ditchi...,http://www.watoday.com.au/world/a-question-of-...,"""We want to be held accountable,"" she said in ..."
198,198,Altnagelvin’s new health and well-being hub opens,https://www.derryjournal.com/news/altnagelvin-...,A new Health and Wellbeing Campus has opened a...


In [39]:
# Drop the saved-index column and the Link column. Reassigning (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed columns.
df = df.drop(columns=["Unnamed: 0", "Link"], errors="ignore")
df

Unnamed: 0,Title,Context,Locations
0,Krewe du Vieux 2018: Take a virtual tour of th...,This area formed the heart of the plantation o...,{Dubreuil}
1,Turkey shells Syrian city as it pushes into Ku...,BEIRUT (AP) � A Kurdish militia spokesman says...,"{Turkey, Syria, Britain, Afrin}"
2,"A new development in Gainesville, Va., provide...",Southern design Buy Photo : Exteriors are vary...,"{Norbury, Gainesville, Somerville, Va., Best B..."
3,Laura Ingraham takes an Easter break amid Davi...,"Cleve R. Wootson Jr., The Washington Post Marc...","{Florida, Parkland, America, San Marcos, Calif..."
4,Perm Secretary Launches Book for Africa’s Deve...,"Former Permanent Secretary, Federal Ministry o...","{Abuja, Olaopa, Nigeria}"
...,...,...,...
195,Incoming MFSA chairman accuses Deutsche Bank o...,Incoming Malta Financial Services Authority ch...,"{Malta, Germany, London, UK}"
196,EW Nutrition and ICON Form a Strategic Partner...,EW Nutrition acquires 51% of the shares in ICO...,"{Ankara, Brazil, Japan, Ufuk Yilmaz, Germany, ..."
197,A question of diversity: how fashion is ditchi...,"""We want to be held accountable,"" she said in ...","{London, New York, Stockholm, South Africa, Sp..."
198,Altnagelvin’s new health and well-being hub opens,A new Health and Wellbeing Campus has opened a...,"{Northern Ireland, the Republic and Northern T..."


In [9]:
import spacy
from spacy import displacy

# Load spaCy's "en_core_web_sm" pipeline once; contextToloc() calls it to run
# named-entity recognition on each article's text.
NER = spacy.load("en_core_web_sm")

In [10]:
def contextToloc(text:str):
    """Return the entities labelled ``GPE`` that the NER pipeline finds in ``text``.

    Args:
        text: Article text to analyse. A missing value (``None`` or a float
            NaN, as pandas uses for an empty cell) is treated as empty text.

    Returns:
        A list of entity strings in the order they occur; repeated mentions
        are kept. Empty when ``text`` is missing.
    """
    # A file without a LINK: marker yields a None context, which becomes NaN
    # after a CSV round-trip. `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []
    doc = NER(text)
    return [ent.text for ent in doc.ents if ent.label_ == "GPE"]


In [12]:
# Build the Locations column that the later cells read: one list of GPE
# entity strings per article, extracted from its Context. This step must run
# for a fresh Restart & Run All to succeed.
df["Locations"] = df["Context"].apply(contextToloc)
df

Unnamed: 0,Title,Link,Context,Locations
0,Krewe du Vieux 2018: Take a virtual tour of th...,http://www.nola.com/mardi_gras_nola/2018/01/ta...,This area formed the heart of the plantation o...,"[Dubreuil, Dubreuil]"
1,Turkey shells Syrian city as it pushes into Ku...,http://www.eastoregonian.com/turkey-shells-syr...,BEIRUT (AP) � A Kurdish militia spokesman says...,"[Turkey, Syria, Turkey, Turkey, Afrin, Britain]"
2,"A new development in Gainesville, Va., provide...",https://www.washingtonpost.com/realestate/a-ta...,Southern design Buy Photo : Exteriors are vary...,"[Charleston, Norbury, Granville, Somerville, N..."
3,Laura Ingraham takes an Easter break amid Davi...,http://bangordailynews.com/2018/03/31/news/nat...,"Cleve R. Wootson Jr., The Washington Post Marc...","[Parkland, Florida, California, San Marcos, Am..."
4,Perm Secretary Launches Book for Africa’s Deve...,http://www.thisdaylive.com/index.php/2018/03/3...,"Former Permanent Secretary, Federal Ministry o...","[Abuja, Nigeria, Nigeria, Olaopa]"
...,...,...,...,...
195,Incoming MFSA chairman accuses Deutsche Bank o...,https://www.maltatoday.com.mt/news/national/85...,Incoming Malta Financial Services Authority ch...,"[Malta, UK, Malta, Malta, Germany, London, Mal..."
196,EW Nutrition and ICON Form a Strategic Partner...,http://www.adnkronos.com/immediapress/pr-newsw...,EW Nutrition acquires 51% of the shares in ICO...,"[Turkey, Ufuk Yilmaz, Ankara, Turkey, Visbek, ..."
197,A question of diversity: how fashion is ditchi...,http://www.watoday.com.au/world/a-question-of-...,"""We want to be held accountable,"" she said in ...","[Los Angeles, London, Spain, South Africa, Tai..."
198,Altnagelvin’s new health and well-being hub opens,https://www.derryjournal.com/news/altnagelvin-...,A new Health and Wellbeing Campus has opened a...,"[Northern Ireland, the Republic and Northern T..."


In [35]:
def remove_dup(arr_input: list) ->list:
    """Return the distinct items of ``arr_input`` in first-seen order.

    Args:
        arr_input: Items to de-duplicate; each must be hashable.

    Returns:
        A new list holding each distinct item once. The input is not modified.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates
    # deterministically (a plain set() would not preserve order).
    return list(dict.fromkeys(arr_input))
    

Dubreuil


In [37]:
# Collapse repeated mentions: turn each row's Locations into a set.
df["Locations"] = df["Locations"].apply(set)

In [40]:
# Render the frame to inspect the de-duplicated Locations sets.
df

Unnamed: 0,Title,Context,Locations
0,Krewe du Vieux 2018: Take a virtual tour of th...,This area formed the heart of the plantation o...,{Dubreuil}
1,Turkey shells Syrian city as it pushes into Ku...,BEIRUT (AP) � A Kurdish militia spokesman says...,"{Turkey, Syria, Britain, Afrin}"
2,"A new development in Gainesville, Va., provide...",Southern design Buy Photo : Exteriors are vary...,"{Norbury, Gainesville, Somerville, Va., Best B..."
3,Laura Ingraham takes an Easter break amid Davi...,"Cleve R. Wootson Jr., The Washington Post Marc...","{Florida, Parkland, America, San Marcos, Calif..."
4,Perm Secretary Launches Book for Africa’s Deve...,"Former Permanent Secretary, Federal Ministry o...","{Abuja, Olaopa, Nigeria}"
...,...,...,...
195,Incoming MFSA chairman accuses Deutsche Bank o...,Incoming Malta Financial Services Authority ch...,"{Malta, Germany, London, UK}"
196,EW Nutrition and ICON Form a Strategic Partner...,EW Nutrition acquires 51% of the shares in ICO...,"{Ankara, Brazil, Japan, Ufuk Yilmaz, Germany, ..."
197,A question of diversity: how fashion is ditchi...,"""We want to be held accountable,"" she said in ...","{London, New York, Stockholm, South Africa, Sp..."
198,Altnagelvin’s new health and well-being hub opens,A new Health and Wellbeing Campus has opened a...,"{Northern Ireland, the Republic and Northern T..."
