In [3]:
import pandas as pd

# Example: load the Twitter data into a DataFrame.
# Replace this with your actual data-loading code; the rest of this cell
# reads a column named 'location' from the DataFrame.
# NOTE(review): the printed preview shows an 'Unnamed: 0' column, which usually
# means the CSV was written with its index -- consider index_col=0; confirm
# against how the file was produced.
df = pd.read_csv('twitter_data.csv')

# Function to clean and standardize location data
def clean_location(location):
    """Normalise a single raw location value.

    Args:
        location: One cell of the 'location' column. Usually a string, but it
            may be a missing value (None / NaN) or, in a mixed-dtype CSV
            column, a non-string scalar such as a number.

    Returns:
        The trimmed, lower-cased location, replaced by its canonical spelling
        when it appears in ``location_corrections``; ``None`` when the value
        is missing or blank.
    """
    # Handle specific known variations.
    # Keys must already be trimmed and lower-case, because the lookup happens
    # after normalisation. Expand this dictionary based on your dataset.
    location_corrections = {
        #'nyc': 'new york city',
        #'sf': 'san francisco',
        # Add more corrections as needed
    }

    if not isinstance(location, str):
        if pd.isnull(location):
            return None  # Handle missing locations
        # Non-string cells (e.g. a purely numeric location) have no .strip();
        # coerce them to text instead of raising AttributeError.
        location = str(location)

    # Standardize: trim spaces and convert to lowercase
    location = location.strip().lower()
    if not location:
        return None  # Handle empty / whitespace-only locations

    return location_corrections.get(location, location)

# Apply the cleaning function to every value of the 'location' column and
# store the result in a new 'cleaned_location' column (the raw column is kept).
df['cleaned_location'] = df['location'].apply(clean_location)

# Optionally, drop rows whose cleaned location is None if you don't need them
# df = df.dropna(subset=['cleaned_location'])

# Inspect the first rows of the cleaned DataFrame
print(df.head())


   Unnamed: 0         hashed_userid masked_username         location  \
0      415371  19868647935216335990       *****ecot              NaN   
1      415370  95273352056344375133       *****kh59  Terre Haute, IN   
2      415369  42256911176251501556  *******eDuster         Chi-town   
3      415368  98949018742144878760       *****ll42              NaN   
4      415367  83242079331442835051  *******tresist              NaN   

   following  followers  totaltweets usercreateddt              tweetid  \
0       1109        796       189199       9/16/09  1560823411047268352   
1        854        298        16999      10/10/17  1560823347583361024   
2        416       8852        11699       3/26/09  1560823151671488513   
3        603        179        59766       3/14/14  1560822898780213249   
4        257        758       272531       2/16/17  1560822761706094592   

  tweetcreatedts  ...                                           hashtags  \
0        57:54.0  ...  [{'text': 'MyBody

In [16]:
#!python -m spacy download en_core_web_sm
#!python -m spacy download en_core_web_lg

import pandas as pd
import spacy

# Load spaCy's English language model into the module-level `nlp` pipeline
# used by extract_locations(). The commented-out download commands at the top
# of this cell fetch the two models referenced here.
# Alternative model, left disabled:
#nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")

# Function to extract locations using spaCy's NER
def extract_locations(text):
    """Return the text of every entity in ``text`` that spaCy labels "GPE".

    Runs the module-level ``nlp`` pipeline on ``text`` and collects the
    ``.text`` of each entity in ``doc.ents`` whose ``label_`` equals "GPE",
    in the order ``doc.ents`` yields them.

    Args:
        text: The string to analyse.

    Returns:
        A list of entity strings; empty when no "GPE" entity is found.
    """
    gpe_mentions = []
    for entity in nlp(text).ents:
        if entity.label_ == "GPE":
            gpe_mentions.append(entity.text)
    return gpe_mentions

# Function to clean and standardize location data
def clean_location(location):
    """Normalise a single raw location value.

    Note: this re-defines the ``clean_location`` from the earlier cell; keep
    the two definitions in sync (or delete one) when editing.

    Args:
        location: One cell of the 'location' column. Usually a string, but it
            may be a missing value (None / NaN) or, in a mixed-dtype CSV
            column, a non-string scalar such as a number.

    Returns:
        The trimmed, lower-cased location, replaced by its canonical spelling
        when it appears in ``location_corrections``; ``None`` when the value
        is missing or blank.
    """
    # Handle specific known variations.
    # Keys must already be trimmed and lower-case, because the lookup happens
    # after normalisation.
    location_corrections = {
        # Add your corrections here
    }

    if not isinstance(location, str):
        if pd.isnull(location):
            return None  # Handle missing locations
        # Non-string cells (e.g. a purely numeric location) have no .strip();
        # coerce them to text instead of raising AttributeError.
        location = str(location)

    # Standardize: trim spaces and convert to lowercase
    location = location.strip().lower()
    if not location:
        return None  # Handle empty / whitespace-only locations

    # Here, you can also integrate spaCy's NER results if needed
    # For example, using extract_locations(location) to further process the location

    return location_corrections.get(location, location)

# Load your DataFrame.
# This re-reads the CSV and rebinds `df`, discarding any columns added to a
# previously loaded `df`.
df = pd.read_csv('twitter_data.csv')

# Apply the cleaning function to every value of the 'location' column and
# store the result in a new 'cleaned_location' column.
df['cleaned_location'] = df['location'].apply(clean_location)

# New: run NER on the raw (un-cleaned) 'location' text and store the list of
# extracted GPE entities in a new column. Missing values are passed through
# unchanged, so they stay null rather than becoming empty lists.
# NOTE(review): this calls the spaCy pipeline once per row; batching with
# nlp.pipe may be considerably faster on a large frame -- TODO confirm.
df['extracted_locations'] = df['location'].apply(lambda x: extract_locations(x) if pd.notnull(x) else x)

# Optionally, drop rows whose cleaned location is None if you don't need them
# df = df.dropna(subset=['cleaned_location'])

# Inspect the first rows of the cleaned DataFrame
print(df.head())


   Unnamed: 0         hashed_userid masked_username         location  \
0      415371  19868647935216335990       *****ecot              NaN   
1      415370  95273352056344375133       *****kh59  Terre Haute, IN   
2      415369  42256911176251501556  *******eDuster         Chi-town   
3      415368  98949018742144878760       *****ll42              NaN   
4      415367  83242079331442835051  *******tresist              NaN   

   following  followers  totaltweets usercreateddt              tweetid  \
0       1109        796       189199       9/16/09  1560823411047268352   
1        854        298        16999      10/10/17  1560823347583361024   
2        416       8852        11699       3/26/09  1560823151671488513   
3        603        179        59766       3/14/14  1560822898780213249   
4        257        758       272531       2/16/17  1560822761706094592   

  tweetcreatedts  ...  language favorite_count is_retweet  \
0        57:54.0  ...        en              0       Tr

In [24]:
# Function to replace empty lists with None
def remove_empty_lists(location_list):
    """Turn an empty list into None and pass non-empty values through.

    The decision is made on truthiness alone: any falsy value (an empty list,
    None, an empty string, ...) yields None, and any truthy value is returned
    unchanged. A float NaN is truthy, so it is returned as-is.

    Args:
        location_list: The value to check, typically a list of locations.

    Returns:
        ``location_list`` itself when it is truthy, otherwise ``None``.
    """
    return location_list or None

# Replace empty NER results ([]) with None, overwriting the column in place,
# so rows with no extracted location count as missing.
df['extracted_locations'] = df['extracted_locations'].apply(remove_empty_lists)
# Inspect the first rows after the replacement
print(df.head())

   Unnamed: 0         hashed_userid masked_username         location  \
0      415371  19868647935216335990       *****ecot              NaN   
1      415370  95273352056344375133       *****kh59  Terre Haute, IN   
2      415369  42256911176251501556  *******eDuster         Chi-town   
3      415368  98949018742144878760       *****ll42              NaN   
4      415367  83242079331442835051  *******tresist              NaN   

   following  followers  totaltweets usercreateddt              tweetid  \
0       1109        796       189199       9/16/09  1560823411047268352   
1        854        298        16999      10/10/17  1560823347583361024   
2        416       8852        11699       3/26/09  1560823151671488513   
3        603        179        59766       3/14/14  1560822898780213249   
4        257        758       272531       2/16/17  1560822761706094592   

  tweetcreatedts  ...  language favorite_count is_retweet  \
0        57:54.0  ...        en              0       Tr

In [25]:
# Persist the DataFrame, including the added location columns, without
# writing the row index as an extra column.
# NOTE(review): 'extracted_locations' holds Python lists, which CSV stores as
# their string form -- they will need parsing when read back; confirm this is
# acceptable for downstream use.
df.to_csv('cleaned_locations.csv', index=False)

In [13]:
# Count rows with no extracted location (isna() treats both None and NaN as missing).
# NOTE(review): this cell's execution count (13) is lower than that of the
# cells above it (24, 25), so the stored result may predate the
# empty-list -> None replacement; re-run the notebook in order to confirm.
none_count = df['extracted_locations'].isna().sum()

In [14]:
# Show how many rows have a missing 'extracted_locations' value
print(none_count)

140531
