In [3]:
import pandas as pd
import numpy as np

In [38]:

train_data = pd.read_csv('corona_nlp_train.csv', encoding='latin-1')
test_data = pd.read_csv('corona_nlp_test.csv', encoding='latin-1')

train_locations = train_data['Location']
test_locations = test_data['Location']

combined_locations = np.concatenate([train_locations, test_locations], axis=0)

In [39]:
combined_locations[:10]

array(['Coastal Spain', nan, 'everywhere', 'India',
       'Everywhere and Nowhere', 'Karachi', nan, nan, 'Hyderabad, India',
       'Sheffieldish'], dtype=object)

In [40]:
combined_locations.shape

(44952,)

In [41]:
combined_locations = ['unknown' if pd.isna(loc) else loc for loc in combined_locations]

combined_locations[:10]


['Coastal Spain',
 'unknown',
 'everywhere',
 'India',
 'Everywhere and Nowhere',
 'Karachi',
 'unknown',
 'unknown',
 'Hyderabad, India',
 'Sheffieldish']

In [42]:
combined_df = pd.DataFrame({
    'Location': combined_locations,
    'Country': ['unknown'] * len(combined_locations)
})
combined_df.head()

Unnamed: 0,Location,Country
0,Coastal Spain,unknown
1,unknown,unknown
2,everywhere,unknown
3,India,unknown
4,Everywhere and Nowhere,unknown


In [43]:
unique_locations_df = pd.DataFrame({
    'Location': combined_df['Location'].value_counts().index,
    'Country': ['unknown'] * combined_df['Location'].unique().shape[0]
})

In [4]:
unique_locations_df = pd.read_csv('unique_locations.csv', encoding='latin-1')

In [5]:
unique_locations_df.shape

(11521, 3)

In [44]:
unique_locations_df.tail(100)

Unnamed: 0,Location,Country
11421,"2nd Flr, DSM Place, Kijabe St.",unknown
11422,"Odisha, India",unknown
11423,Ames Iowa,unknown
11424,"Columbia, Missouri",unknown
11425,"London, uk",unknown
...,...,...
11516,"Malahide, Co Dublin",unknown
11517,"Ottawa, Canada",unknown
11518,Central,unknown
11519,"Waterbury, CT",unknown


In [21]:
import g4f
import json  # Safer parsing than eval
import time  # For timing
import re
import math


def fix_invalid_escape_sequences(s):
    """
    Fixes invalid escape sequences in a JSON string.
    """
    # Replace backslash followed by any character that is not a valid escape sequence
    # Valid escape sequences in JSON are: \", \\, \/, \b, \f, \n, \r, \t, \uXXXX
    s = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', s)
    return s

def update_country_from_locations_g4f(df, location_column, country_column, batch_size=10):
    """
    Updates the 'Country' column in the DataFrame based on 'Location' using g4f.

    Parameters:
        df (pd.DataFrame): DataFrame containing the 'Location' column.
        location_column (str): Column name containing locations.
        country_column (str): Column name to update with country information.
        batch_size (int): Number of locations to process in one batch.
    """
    # Get unique locations where country is 'unknown'
    unknown_locations = df[df[country_column] == 'unknown'][location_column].unique()
    n=0
    total = math.ceil(len(unknown_locations)/batch_size)
    print(f"TOTAL batches {total}")
    # Batch process locations
    for i in range(0, len(unknown_locations), batch_size):
        start_time = time.time()  # Start timing the loop

        batch = unknown_locations[i:i + batch_size]

        # Create the prompt
        prompt = f"""
        Your output should be only in parsable JSON and nothing else or the world will end.
        You are tasked with identifying countries from Twitter profile 'Location' information. 
        These locations are user-provided and may include:
        - Actual locations (e.g., New York, USA).
        - Fictional places (e.g., Hogwarts).
        - Phrases (e.g., Somewhere over the rainbow).
        - Coordinates (e.g., 37.7749, -122.4194).
        - Full addresses or combinations of locations (e.g., 1600 Pennsylvania Ave, Washington, D.C.).
        - Symbols or nonsensical text (e.g., ***!!!???).
        - Vague terms (e.g., Global, Everywhere, Anywhere).
        
        Your task:
        - For each location, deduce the proper country if possible.
        - If the location cannot be mapped to a real country, mark it as 'unknown_location'.
        - Return the output as a JSON object where the key is the location and the value is the country.
        - Use the official name of the country.

        Locations:
        {batch}
        """

        # Call g4f LLM
        try:
            response = g4f.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ]
            )
            response_cleaned = response.strip().strip('```json').strip('```')
            response_cleaned = fix_invalid_escape_sequences(response_cleaned)

            # Parse the output
            try:
                location_country_mapping = json.loads(response_cleaned)
                print(f"Batch {i // batch_size + 1} processed successfully.")
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON for batch {i // batch_size + 1}: {response_cleaned}")
                print(f"Error: {e}")
                continue

            # Update the DataFrame
            for location, country in location_country_mapping.items():
                df.loc[df[location_column] == location, country_column] = country
            a = df[df['Country'] == 'unknown'].shape[0]
            n=n+1

        except Exception as e:
            print(f"Error processing batch {i // batch_size + 1}: {e}")

        # End timing and print elapsed time
        end_time = time.time()
        print(f"Time taken for batch {i // batch_size + 1}: {end_time - start_time:.2f} seconds.")
        print(f"Completed {n} of {total} batches successfully. {a} Unique locations left.")

In [26]:
import g4f
import json  # Safer parsing than eval
import time  # For timing
import re
import math


def fix_invalid_escape_sequences(s):
    """
    Fixes invalid escape sequences in a JSON string.
    """
    # Replace backslash followed by any character that is not a valid escape sequence
    # Valid escape sequences in JSON are: \", \\, \/, \b, \f, \n, \r, \t, \uXXXX
    s = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', s)
    return s

def update_country_from_locations_g4f(df, location_column, country_column, batch_size=10):
    """
    Updates the 'Country' column in the DataFrame based on 'Location' using g4f.

    Parameters:
        df (pd.DataFrame): DataFrame containing the 'Location' column.
        location_column (str): Column name containing locations.
        country_column (str): Column name to update with country information.
        batch_size (int): Number of locations to process in one batch.
    """
    # Get unique locations where country is 'unknown'
    unknown_locations = df[df[country_column] == 'unknown'][location_column].unique()
    # Batch process locations
    a = df[df['Country'] == 'unknown'].shape[0]
    for i, location in enumerate(unknown_locations, start=1):
        start_time = time.time()  # Start timing the loop


        # Create the prompt
        prompt = f"""
        Your output should be only in parsable JSON and nothing else or the world will end.
        You are tasked with identifying countries from Twitter profile 'Location' information. 
        These locations are user-provided and may include:
        - Actual locations (e.g., New York, USA).
        - Fictional places (e.g., Hogwarts).
        - Phrases (e.g., Somewhere over the rainbow).
        - Coordinates (e.g., 37.7749, -122.4194).
        - Full addresses or combinations of locations (e.g., 1600 Pennsylvania Ave, Washington, D.C.).
        - Symbols or nonsensical text (e.g., ***!!!???).
        - Vague terms (e.g., Global, Everywhere, Anywhere).
        
        Your task:
        - For each location, deduce the proper country if possible.
        - If the location cannot be mapped to a real country, mark it as 'unknown_location'.
        - Return the output as a JSON object where the key is the location and the value is the country.
        - Use the official name of the country.

        Locations:
        {location}
        """

        # Call g4f LLM
        try:
            response = g4f.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ]
            )
            response_cleaned = response.strip().strip('```json').strip('```')
            response_cleaned = fix_invalid_escape_sequences(response_cleaned)

            # Parse the output
            try:
                location_country_mapping = json.loads(response_cleaned)
                print(f"Batch {i // batch_size + 1} processed successfully.")
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON for batch {i // batch_size + 1}: {response_cleaned}")
                print(f"Error: {e}")
                continue

            # Update the DataFrame
            for loc, country in location_country_mapping.items():
                df.loc[df[location_column] == location, country_column] = country

            a = df[df['Country'] == 'unknown'].shape[0]

        except Exception as e:
            print(f"Error processing location{location}: {e}")

        # End timing and print elapsed time
        end_time = time.time()
        print(f"Time taken for location {location}: {end_time - start_time:.2f} seconds.")
        print(f"{a} Unique locations left.")

In [None]:
import time
a = unique_locations_df[unique_locations_df['Country'] == 'unknown'].shape[0]
while a != 0:
    update_country_from_locations_g4f(unique_locations_df, location_column='Location', country_column='Country', batch_size=1)
    a = unique_locations_df[unique_locations_df['Country'] == 'unknown'].shape[0]
    print(f"Finished with unique locations. unique locations left: {a}")

Failed to parse JSON for batch 2: 
{
  "locations": [
    {
      "location": "New York, USA",
      "country": "United States"
    },
    {
      "location": "Hogwarts",
      "country": "unknown_location"
    },
    {
      "location": "Somewhere over the rainbow",
      "country": "unknown_location"
    },
    {
      "location": "37.7749, -122.4194",
      "country": "United States"
    },
    {
      "location": "1600 Pennsylvania Ave, Washington, D.C.",
      "country": "United States"
    },
    {
      "location": "***!!!???",
      "country": "unknown_location"
    },
    {
      "location": "Global",
      "country": "unknown_location"
    },
    {
      "location": "Everywhere",
      "country": "unknown_location"
    },
    {
      "location": "Anywhere",
      "country": "unknown_location"
    }
  ]
}
```Based on the information provided, the country is:

Turkey

Note: TÃÂÃÂ¼rkiye is the Turkish name for Turkey, and ?stanbul is the Turkish spelling for Istanbul.
Error: 

In [28]:
unique_locations_df[unique_locations_df['Country'] == 'unknown'].shape[0]

217

In [82]:
unique_locations_df.to_csv('unique_locations.csv', index=True)