# Tweet Cleaning

### Import Libraries

In [361]:
import numpy as np
import pandas as pd
import os

# Word processing libraries
import re
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from shapely.geometry import LineString
from geopandas import GeoDataFrame
from shapely.geometry import Point, Polygon
import folium

#Nan
from cmath import nan

### Read Tweets from CSV - Provided by scraping

In [362]:
tweets = pd.read_csv('CSV/Scrape_Tweets.csv')

In [363]:
tweets.shape

(21686, 13)

In [364]:
tweets = tweets.drop(['Unnamed: 0'], axis=1)
tweets.head()

Unnamed: 0,tweet_id,author_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox
0,1575571139358691329,1353804645580353537,1436,Go shota R1000 ya registration \n\nBlesser: +R...,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,"[16.4475932, -34.8342468, 32.8922934, -22.1247..."
1,1575489974316584961,1258457287120818182,1326,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa","[28.2722463, -26.2315204, 28.4449594, -26.0681..."
2,1575482837989474304,2496356957,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa","[27.9483035, -25.9157727, 28.4198285, -25.5894..."
3,1575442792800722946,1309123451433746437,6455,At least offer ukumbhalela ama board for that ...,2022-09-29 11:10:06+00:00,0,0,1,0,{'place_id': '52e073e7724385c3'},"Pietermaritzburg, South Africa","[30.2563496, -29.6999989, 30.4512134, -29.5328..."
4,1575438290962075648,177685467,1880,We used to catch a train from Ikwezi to Mzimhl...,2022-09-29 10:52:13+00:00,1,0,1,0,{'place_id': '3e46a98adcf05e59'},"Meadowlands, South Africa","[27.8651453, -26.2359005, 27.9219768, -26.1964..."


In [365]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21686 entries, 0 to 21685
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          21686 non-null  int64 
 1   author_id         21686 non-null  int64 
 2   author_followers  21686 non-null  int64 
 3   text              21686 non-null  object
 4   created_at        21686 non-null  object
 5   retweets          21686 non-null  int64 
 6   replies           21686 non-null  int64 
 7   likes             21686 non-null  int64 
 8   quote_count       21686 non-null  int64 
 9   place_id          21686 non-null  object
 10  place_name        21686 non-null  object
 11  bbox              21686 non-null  object
dtypes: int64(7), object(5)
memory usage: 2.0+ MB


### Remove Duplicates
If entry is the same then drop it</br>
There should't be duplicates

In [366]:
print('Initial size of dataset before dropping duplicated rows:', tweets.shape)
tweets.drop_duplicates(keep = False, inplace = True)

print('Current size of dataset after dropping duplicated rows, if any, is:', tweets.shape)

Initial size of dataset before dropping duplicated rows: (21686, 12)
Current size of dataset after dropping duplicated rows, if any, is: (21686, 12)


### Remove Empty Tweets
If tweet content is empty/Nan then drop it

In [367]:
tweets.dropna(subset = ['text'], inplace = True)

In [368]:
len(tweets)

21686

### Collect @Users in Text
Identify all mentions of other users using @ </br>
Create new feature containg all mentions (@s)</br>
Remove all mentions from text - done in next section

In [369]:
def mentioned_users(string):
    usernames = re.findall('@[^\s]+', string)
    if usernames == []:
        return nan
    return usernames

In [370]:
tweets['mentioned_users'] = tweets['text'].apply(lambda x: mentioned_users(x))
tweets.head()

Unnamed: 0,tweet_id,author_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox,mentioned_users
0,1575571139358691329,1353804645580353537,1436,Go shota R1000 ya registration \n\nBlesser: +R...,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",
1,1575489974316584961,1258457287120818182,1326,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa","[28.2722463, -26.2315204, 28.4449594, -26.0681...",
2,1575482837989474304,2496356957,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa","[27.9483035, -25.9157727, 28.4198285, -25.5894...",[@JohnPerlman]
3,1575442792800722946,1309123451433746437,6455,At least offer ukumbhalela ama board for that ...,2022-09-29 11:10:06+00:00,0,0,1,0,{'place_id': '52e073e7724385c3'},"Pietermaritzburg, South Africa","[30.2563496, -29.6999989, 30.4512134, -29.5328...",
4,1575438290962075648,177685467,1880,We used to catch a train from Ikwezi to Mzimhl...,2022-09-29 10:52:13+00:00,1,0,1,0,{'place_id': '3e46a98adcf05e59'},"Meadowlands, South Africa","[27.8651453, -26.2359005, 27.9219768, -26.1964...",


### Collect #Hashtags in Text
Identify all hashtags using # </br>
Create new feature containg all hashtags (#s)</br>
Remove all hashtags from text - done in next section

In [371]:
def hashtags(string):
    hashtags = re.findall('#[^\s]+', string)
    if hashtags == []:
        return nan
    return hashtags

In [372]:
tweets['hashtags'] = tweets['text'].apply(lambda x: hashtags(x))
tweets.head()

Unnamed: 0,tweet_id,author_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox,mentioned_users,hashtags
0,1575571139358691329,1353804645580353537,1436,Go shota R1000 ya registration \n\nBlesser: +R...,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",,
1,1575489974316584961,1258457287120818182,1326,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa","[28.2722463, -26.2315204, 28.4449594, -26.0681...",,
2,1575482837989474304,2496356957,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa","[27.9483035, -25.9157727, 28.4198285, -25.5894...",[@JohnPerlman],[#702drive]
3,1575442792800722946,1309123451433746437,6455,At least offer ukumbhalela ama board for that ...,2022-09-29 11:10:06+00:00,0,0,1,0,{'place_id': '52e073e7724385c3'},"Pietermaritzburg, South Africa","[30.2563496, -29.6999989, 30.4512134, -29.5328...",,
4,1575438290962075648,177685467,1880,We used to catch a train from Ikwezi to Mzimhl...,2022-09-29 10:52:13+00:00,1,0,1,0,{'place_id': '3e46a98adcf05e59'},"Meadowlands, South Africa","[27.8651453, -26.2359005, 27.9219768, -26.1964...",,


### Collect Emojis in text
Identify all emojis using unicode value</br>
Create new feature containg all emojis</br>
Remove all emojis from text - done in next section</br>
Note: We could identify our own emojis that could be useful instead of all emojis and put them in a dictionary

In [373]:
from cmath import nan
import advertools as adv
def extract_emojis(string):
    list = [string]
    emoji_dict = adv.extract_emoji(list)
    emojis = emoji_dict['emoji'][0]
    if(emojis == []):
        return nan
    return emojis

In [374]:
# For if an error saying float can not be changed to lower is called!
#tweets.text=tweets.text.astype(str)

In [375]:
tweets['emojis'] = tweets['text'].apply(lambda x: extract_emojis(x))
tweets.head()

Unnamed: 0,tweet_id,author_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox,mentioned_users,hashtags,emojis
0,1575571139358691329,1353804645580353537,1436,Go shota R1000 ya registration \n\nBlesser: +R...,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",,,
1,1575489974316584961,1258457287120818182,1326,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa","[28.2722463, -26.2315204, 28.4449594, -26.0681...",,,
2,1575482837989474304,2496356957,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa","[27.9483035, -25.9157727, 28.4198285, -25.5894...",[@JohnPerlman],[#702drive],
3,1575442792800722946,1309123451433746437,6455,At least offer ukumbhalela ama board for that ...,2022-09-29 11:10:06+00:00,0,0,1,0,{'place_id': '52e073e7724385c3'},"Pietermaritzburg, South Africa","[30.2563496, -29.6999989, 30.4512134, -29.5328...",,,"[🤣, 🙌]"
4,1575438290962075648,177685467,1880,We used to catch a train from Ikwezi to Mzimhl...,2022-09-29 10:52:13+00:00,1,0,1,0,{'place_id': '3e46a98adcf05e59'},"Meadowlands, South Africa","[27.8651453, -26.2359005, 27.9219768, -26.1964...",,,


### Collect Links in text
Identify all links using a URL</br>
Create new feature containg all Links</br>
Remove all links from text - done in next section</br>
Maybe look into if 'www' syntax must also be used

In [376]:
def find_urls(string):
    try:
        urls = re.search("(?P<url>https?://[^\s]+)", string).group("url")
    except:
        return nan
    return urls

In [377]:
tweets['urls'] = tweets['text'].apply(lambda x: find_urls(x))
tweets.head()

Unnamed: 0,tweet_id,author_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox,mentioned_users,hashtags,emojis,urls
0,1575571139358691329,1353804645580353537,1436,Go shota R1000 ya registration \n\nBlesser: +R...,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",,,,
1,1575489974316584961,1258457287120818182,1326,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa","[28.2722463, -26.2315204, 28.4449594, -26.0681...",,,,https://t.co/hvAYVEs6NU
2,1575482837989474304,2496356957,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa","[27.9483035, -25.9157727, 28.4198285, -25.5894...",[@JohnPerlman],[#702drive],,
3,1575442792800722946,1309123451433746437,6455,At least offer ukumbhalela ama board for that ...,2022-09-29 11:10:06+00:00,0,0,1,0,{'place_id': '52e073e7724385c3'},"Pietermaritzburg, South Africa","[30.2563496, -29.6999989, 30.4512134, -29.5328...",,,"[🤣, 🙌]",https://t.co/MpbRqL0gpU
4,1575438290962075648,177685467,1880,We used to catch a train from Ikwezi to Mzimhl...,2022-09-29 10:52:13+00:00,1,0,1,0,{'place_id': '3e46a98adcf05e59'},"Meadowlands, South Africa","[27.8651453, -26.2359005, 27.9219768, -26.1964...",,,,https://t.co/DeR5KoJPXb


In [378]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21686 entries, 0 to 21685
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          21686 non-null  int64 
 1   author_id         21686 non-null  int64 
 2   author_followers  21686 non-null  int64 
 3   text              21686 non-null  object
 4   created_at        21686 non-null  object
 5   retweets          21686 non-null  int64 
 6   replies           21686 non-null  int64 
 7   likes             21686 non-null  int64 
 8   quote_count       21686 non-null  int64 
 9   place_id          21686 non-null  object
 10  place_name        21686 non-null  object
 11  bbox              21686 non-null  object
 12  mentioned_users   10662 non-null  object
 13  hashtags          6331 non-null   object
 14  emojis            6464 non-null   object
 15  urls              10516 non-null  object
dtypes: int64(7), object(9)
memory usage: 2.8+ MB


### Remove Unwanted Information and Clean Tweet text
To Clean Text:
* Convert to Lowercase
* Tokenise
* Tag Text
* Lemmatise Text

This includes:
* @mentions
* URLs
* Hashtags
* Emojis
* Punctuation
* Numbers
* Stop Words
* Single Letter Words
* Empty Tokens


In [379]:
# Define Emoji_patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

In [380]:
# Define the function to implement POS tagging:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


# Define the main function to clean text in various ways:
def clean_text(text):
    
    # Apply regex expressions first before converting string to list of tokens/words:
    # 1. remove @usernames
    text = re.sub('@[^\s]+', '', text)
    
    # 2. remove URLs
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', text)
    
    # 3. remove hashtags entirely i.e. #hashtags
    text = re.sub(r'#([^\s]+)', '', text)
    
    # 4. remove emojis
    text = emoji_pattern.sub(r'', text)
    
    # 5. Convert text to lowercase
    text = text.lower()
    
    # 6. tokenize text and remove punctuation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    
    # 7. remove numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    
    # 8. remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    
    # 9. remove empty tokens
    text = [t for t in text if len(t) > 0]
    
    # 10. pos tag text and lemmatize text
    pos_tags = pos_tag(text)
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    
    # 11. remove words with only one letter
    text = [t for t in text if len(t) > 1]
    
    # join all
    text = " ".join(text)
    
    return(text)

In [381]:
# Apply function on the column 'text':
tweets['cleaned_text'] = tweets['text'].apply(lambda x: clean_text(x))
tweets.head()

Unnamed: 0,tweet_id,author_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox,mentioned_users,hashtags,emojis,urls,cleaned_text
0,1575571139358691329,1353804645580353537,1436,Go shota R1000 ya registration \n\nBlesser: +R...,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",,,,,go shota ya registration \n\nblesser babe lesa...
1,1575489974316584961,1258457287120818182,1326,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa","[28.2722463, -26.2315204, 28.4449594, -26.0681...",,,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go
2,1575482837989474304,2496356957,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa","[27.9483035, -25.9157727, 28.4198285, -25.5894...",[@JohnPerlman],[#702drive],,,driverless car work south africa.the car would...
3,1575442792800722946,1309123451433746437,6455,At least offer ukumbhalela ama board for that ...,2022-09-29 11:10:06+00:00,0,0,1,0,{'place_id': '52e073e7724385c3'},"Pietermaritzburg, South Africa","[30.2563496, -29.6999989, 30.4512134, -29.5328...",,,"[🤣, 🙌]",https://t.co/MpbRqL0gpU,least offer ukumbhalela ama board strike🤣
4,1575438290962075648,177685467,1880,We used to catch a train from Ikwezi to Mzimhl...,2022-09-29 10:52:13+00:00,1,0,1,0,{'place_id': '3e46a98adcf05e59'},"Meadowlands, South Africa","[27.8651453, -26.2359005, 27.9219768, -26.1964...",,,,https://t.co/DeR5KoJPXb,use catch train ikwezi mzimhlophe go church in...


We can now remove the original tweet text because the cleaned text is all that is needed.

In [382]:
tweets = tweets.drop('text', axis = 1)

In [383]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21686 entries, 0 to 21685
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          21686 non-null  int64 
 1   author_id         21686 non-null  int64 
 2   author_followers  21686 non-null  int64 
 3   created_at        21686 non-null  object
 4   retweets          21686 non-null  int64 
 5   replies           21686 non-null  int64 
 6   likes             21686 non-null  int64 
 7   quote_count       21686 non-null  int64 
 8   place_id          21686 non-null  object
 9   place_name        21686 non-null  object
 10  bbox              21686 non-null  object
 11  mentioned_users   10662 non-null  object
 12  hashtags          6331 non-null   object
 13  emojis            6464 non-null   object
 14  urls              10516 non-null  object
 15  cleaned_text      21686 non-null  object
dtypes: int64(7), object(9)
memory usage: 2.8+ MB


### Convert Boundry Box to a set of coordinates of Latitude and Longitude
There are a few ways of doing this
* Take an average and find the middle of the Boundry Box
* Indentify where the location is using another API based on place_name
* Keep the location as a polygon and then place each user into a munucipality in hich the area is largest

The way we will do it here is using centroid of the boundry box

In [384]:
def bbox_to_coords(bbox):
    all_coords = bbox.split()
    coords = []
    for coordinate in all_coords:
        coordinate = coordinate.replace('[','')
        coordinate = coordinate.replace(']','')
        coordinate = coordinate.replace(',','')
        coord = float(coordinate)
        coords.append(coord)

    return coords

In [385]:
tweets['coords'] = tweets['bbox'].apply(lambda x: bbox_to_coords(x))

In [386]:
tweets['longitude_1'] = tweets['coords'].apply(lambda x: x[0])
tweets['latitude_1'] = tweets['coords'].apply(lambda x: x[1])
tweets['longitude_2'] = tweets['coords'].apply(lambda x: x[2])
tweets['latitude_2'] = tweets['coords'].apply(lambda x: x[3])

In [387]:
tweets.head(3)

Unnamed: 0,tweet_id,author_id,author_followers,created_at,retweets,replies,likes,quote_count,place_id,place_name,...,mentioned_users,hashtags,emojis,urls,cleaned_text,coords,longitude_1,latitude_1,longitude_2,latitude_2
0,1575571139358691329,1353804645580353537,1436,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,...,,,,,go shota ya registration \n\nblesser babe lesa...,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",16.447593,-34.834247,32.892293,-22.124724
1,1575489974316584961,1258457287120818182,1326,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa",...,,,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go,"[28.2722463, -26.2315204, 28.4449594, -26.0681...",28.272246,-26.23152,28.444959,-26.068193
2,1575482837989474304,2496356957,1436,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa",...,[@JohnPerlman],[#702drive],,,driverless car work south africa.the car would...,"[27.9483035, -25.9157727, 28.4198285, -25.5894...",27.948304,-25.915773,28.419829,-25.589438


In [388]:
tweets = tweets.drop('bbox', axis=1)

In [389]:
def find_centroid(coords):
    geometry = LineString([(coords[0], coords[1]),(coords[2], coords[3])])
    centroid = geometry.centroid
    return centroid

In [390]:
tweets['centroid'] = tweets['coords'].apply(lambda x: find_centroid(x))

In [391]:
tweets['centroid_long'] = tweets['centroid'].apply(lambda z: z.x)
tweets['centroid_lat'] = tweets['centroid'].apply(lambda z: z.y)

In [392]:
tweets = tweets.drop('centroid', axis =1)
tweets.head(3)

Unnamed: 0,tweet_id,author_id,author_followers,created_at,retweets,replies,likes,quote_count,place_id,place_name,...,emojis,urls,cleaned_text,coords,longitude_1,latitude_1,longitude_2,latitude_2,centroid_long,centroid_lat
0,1575571139358691329,1353804645580353537,1436,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,...,,,go shota ya registration \n\nblesser babe lesa...,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",16.447593,-34.834247,32.892293,-22.124724,24.669943,-28.479485
1,1575489974316584961,1258457287120818182,1326,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa",...,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go,"[28.2722463, -26.2315204, 28.4449594, -26.0681...",28.272246,-26.23152,28.444959,-26.068193,28.358603,-26.149857
2,1575482837989474304,2496356957,1436,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa",...,,,driverless car work south africa.the car would...,"[27.9483035, -25.9157727, 28.4198285, -25.5894...",27.948304,-25.915773,28.419829,-25.589438,28.184066,-25.752605


### Fix Edge Cases
Such as Cape Town

In [393]:
tweets.loc[tweets.place_name == 'Cape Town, South Africa', ['centroid_long', 'centroid_lat']] = 18.4241, -33.9249

In [394]:
geometry = [Point(xy) for xy in zip(tweets['centroid_long'], tweets['centroid_lat'])]
gdf = GeoDataFrame(tweets, geometry=geometry)  

In [395]:
tweets = tweets.drop('longitude_1', axis=1)
tweets = tweets.drop('longitude_2', axis=1)
tweets = tweets.drop('latitude_1', axis=1)
tweets = tweets.drop('latitude_2', axis=1)
tweets = tweets.drop('centroid_long', axis=1)
tweets = tweets.drop('centroid_lat', axis=1)
tweets.head(2)

Unnamed: 0,tweet_id,author_id,author_followers,created_at,retweets,replies,likes,quote_count,place_id,place_name,mentioned_users,hashtags,emojis,urls,cleaned_text,coords,geometry
0,1575571139358691329,1353804645580353537,1436,2022-09-29 19:40:06+00:00,0,0,1,0,{'place_id': 'dd9c0d7d7e07eb49'},South Africa,,,,,go shota ya registration \n\nblesser babe lesa...,"[16.4475932, -34.8342468, 32.8922934, -22.1247...",POINT (24.66994 -28.47949)
1,1575489974316584961,1258457287120818182,1326,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa",,,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go,"[28.2722463, -26.2315204, 28.4449594, -26.0681...",POINT (28.35860 -26.14986)


### Remove Useless locations
Drop row where location is = South Africa, since this location is not specfic enough and offers little value.

In [396]:
tweets.shape

(21686, 17)

In [397]:
tweets = tweets[tweets['place_name'] != 'South Africa']

In [398]:
tweets.shape

(19409, 17)

In [399]:
tweets.head(2)

Unnamed: 0,tweet_id,author_id,author_followers,created_at,retweets,replies,likes,quote_count,place_id,place_name,mentioned_users,hashtags,emojis,urls,cleaned_text,coords,geometry
1,1575489974316584961,1258457287120818182,1326,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa",,,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go,"[28.2722463, -26.2315204, 28.4449594, -26.0681...",POINT (28.35860 -26.14986)
2,1575482837989474304,2496356957,1436,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa",[@JohnPerlman],[#702drive],,,driverless car work south africa.the car would...,"[27.9483035, -25.9157727, 28.4198285, -25.5894...",POINT (28.18407 -25.75261)


In [400]:
tweets.to_csv('CSV/Clean_Tweets_All_Info.csv')

### Set up tweet dataframe into a normalised relational database
Use the following databases:
* Tweet Database - Contains all the information about each tweet
* Location Database - Contains all the information about each location
* Author Database - Contains all the information specfic to each author
* Relational Database - Contains the relations between all the above databases

#### Location Database

In [404]:
locations = GeoDataFrame()
locations['place_id'] = tweets['place_id']
locations['place_name'] = tweets['place_name']
locations['coords'] = tweets['coords']
locations['geometry'] = tweets['geometry']
locations = locations.loc[locations.astype(str).drop_duplicates().index]
locations = locations.set_index('place_id')
locations.head(3)


Unnamed: 0_level_0,place_name,coords,geometry
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa","[28.2722463, -26.2315204, 28.4449594, -26.0681...",POINT (28.35860 -26.14986)
{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa","[27.9483035, -25.9157727, 28.4198285, -25.5894...",POINT (28.18407 -25.75261)
{'place_id': '52e073e7724385c3'},"Pietermaritzburg, South Africa","[30.2563496, -29.6999989, 30.4512134, -29.5328...",POINT (30.35378 -29.61640)


In [412]:
locations.shape

(1084, 3)

#### Tweet Database

In [414]:
just_tweets = tweets
just_tweets = just_tweets.drop('place_name', axis = 1)
just_tweets = just_tweets.drop('coords', axis = 1)
just_tweets = just_tweets.drop('geometry', axis = 1)
just_tweets = just_tweets.drop('place_id', axis = 1)
just_tweets = just_tweets.drop('author_id', axis = 1)
just_tweets = just_tweets.drop('author_followers', axis = 1)
just_tweets = just_tweets.set_index('tweet_id')
just_tweets.head(3)

Unnamed: 0_level_0,created_at,retweets,replies,likes,quote_count,mentioned_users,hashtags,emojis,urls,cleaned_text
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1575489974316584961,2022-09-29 14:17:35+00:00,0,0,1,0,,,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go
1575482837989474304,2022-09-29 13:49:14+00:00,1,0,0,0,[@JohnPerlman],[#702drive],,,driverless car work south africa.the car would...
1575442792800722946,2022-09-29 11:10:06+00:00,0,0,1,0,,,"[🤣, 🙌]",https://t.co/MpbRqL0gpU,least offer ukumbhalela ama board strike🤣


In [411]:
just_tweets.shape

(19409, 10)

#### Author Database

In [407]:
authors = pd.DataFrame()
authors['author_id'] = tweets['author_id']
authors['author_followers'] = tweets['author_followers']
authors = authors.loc[authors.astype(str).drop_duplicates().index]
authors = authors.set_index('author_id')
authors.head()

Unnamed: 0_level_0,author_followers
author_id,Unnamed: 1_level_1
1258457287120818182,1326
2496356957,1436
1309123451433746437,6455
177685467,1880
1438481501327400964,966


In [410]:
authors.shape

(8275, 1)

#### Relational Database

In [417]:
relations = pd.DataFrame()
relations['tweet_id'] = tweets['tweet_id']
relations['author_id'] = tweets['author_id']
relations['place_id'] = tweets['place_id']
relations = relations.loc[relations.astype(str).drop_duplicates().index]
relations = relations.set_index('author_id')
relations.head()

Unnamed: 0_level_0,tweet_id,place_id
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1258457287120818182,1575489974316584961,{'place_id': 'a02e6c261fa62b42'}
2496356957,1575482837989474304,{'place_id': '0e587c59401d0a27'}
1309123451433746437,1575442792800722946,{'place_id': '52e073e7724385c3'}
177685467,1575438290962075648,{'place_id': '3e46a98adcf05e59'}
1438481501327400964,1575418000047472641,{'place_id': '2cef54f8b7d99a87'}


In [418]:
relations.shape

(19409, 2)

#### Create csv for each database

In [420]:
just_tweets.to_csv('CSV/Relational_Databases/just_tweets.csv')
authors.to_csv('CSV/Relational_Databases/authors.csv')
locations.to_csv('CSV/Relational_Databases/locations.csv')
relations.to_csv('CSV/Relational_Databases/relations.csv')

### Possible Still to do Cleaning
ensure that there are no NaN values, this can be done by either creating a custom value possibly using an average or alike. </br>
Or filling in a value such as n/a indicating that no value is available or provided.