# Tweet Cleaning

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os

# Word processing libraries
import re
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from shapely.geometry import LineString
from geopandas import GeoDataFrame
from shapely.geometry import Point, Polygon
import folium

#Nan
from cmath import nan

### Read Tweets from CSV - Provided by scraping

In [3]:
tweets = pd.read_csv('DATA/Scrape Tweets.csv')

In [202]:
tweets.shape

(61, 12)

In [203]:
# Remove the leftover index column that came in with the CSV.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
tweets = tweets.drop(columns=['Unnamed: 0'], errors='ignore')
tweets.head()

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox
0,923322201125916672,1074,Idk if these niggas are singing struggle songs...,2017-10-25 22:55:41+00:00,0,0,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa","[27.7518557, -26.5126489, 28.1843404, -26.0396..."
1,923320610373107712,3938,@Pali_Matli So nawe stop associating Wits with...,2017-10-25 22:49:22+00:00,0,0,0,0,{'place_id': '810e12ec4af983de'},"Bloemfontein, South Africa","[26.0019056, -29.2319585, 26.4243136, -28.9812..."
2,923287512633798656,60191,Please RT for Awareness.This kid deserves free...,2017-10-25 20:37:51+00:00,0,0,1,0,{'place_id': 'd3d46bdc072ad347'},"Vryburg, South Africa","[24.6565185, -27.0016931, 24.7703747, -26.894228]"
3,923282795048460288,2322,No vision for what or where the institution is...,2017-10-25 20:19:06+00:00,0,1,1,0,{'place_id': '8b9ec16fdc0d7e55'},"Cape Town, South Africa","[18.3180332, -34.35839, 18.6600898, -33.8849254]"
4,923243015740837893,796552,UCT students have disrupted some classes on th...,2017-10-25 17:41:02+00:00,1,0,3,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa","[27.7518557, -26.5126489, 28.1843404, -26.0396..."


In [204]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          61 non-null     int64 
 1   author_followers  61 non-null     int64 
 2   text              61 non-null     object
 3   created_at        61 non-null     object
 4   retweets          61 non-null     int64 
 5   replies           61 non-null     int64 
 6   likes             61 non-null     int64 
 7   quote_count       61 non-null     int64 
 8   place_id          61 non-null     object
 9   place_name        61 non-null     object
 10  bbox              61 non-null     object
dtypes: int64(6), object(5)
memory usage: 5.4+ KB


### Remove Duplicates
If an entry is identical to another one, drop the duplicate<br>
There shouldn't be any duplicates

In [205]:
print('Initial size of dataset before dropping duplicated rows:', tweets.shape)
# keep='first' retains one copy of every duplicated row.
# keep=False would delete *all* copies, so a tweet that was scraped twice
# would disappear from the dataset entirely.
tweets = tweets.drop_duplicates(keep='first')

print('Current size of dataset after dropping duplicated rows, if any, is:', tweets.shape)

Initial size of dataset before dropping duplicated rows: (61, 11)
Current size of dataset after dropping duplicated rows, if any, is: (61, 11)


### Remove Empty Tweets
If tweet content is empty/Nan then drop it

In [206]:
# Discard every row whose 'text' value is missing.
tweets = tweets.dropna(subset=['text'])

In [207]:
len(tweets)

61

### Collect @Users in Text
Identify all mentions of other users using @<br>
Create a new feature containing all mentions (@s)<br>
Remove all mentions from text - done in next section

In [208]:
def mentioned_users(string):
    """Return every @mention found in a tweet.

    Parameters
    ----------
    string : str
        Raw tweet text.

    Returns
    -------
    list of str or float
        The mentions in order of appearance. Each one starts with ``@`` and
        runs up to the next whitespace character, so trailing punctuation is
        included. ``nan`` is returned when there are no mentions or when
        ``string`` is not text (for example a missing value).
    """
    # A non-string value cannot contain mentions; report it the same way
    # as "nothing found" instead of letting re raise TypeError.
    if not isinstance(string, str):
        return nan
    # Raw string: \S must reach the regex engine untouched rather than be
    # read as an (invalid) Python string escape.
    usernames = re.findall(r'@\S+', string)
    return usernames if usernames else nan

In [209]:
# tweets['mentioned_users'] = tweets['text'].apply(lambda x: mentioned_users(x))
# tweets.head()

### Collect #Hashtags in Text
Identify all hashtags using #<br>
Create a new feature containing all hashtags (#s)<br>
Remove all hashtags from text - done in next section

In [210]:
def hashtags(string):
    """Return every #hashtag found in a tweet.

    Parameters
    ----------
    string : str
        Raw tweet text.

    Returns
    -------
    list of str or float
        The hashtags in order of appearance. Each one starts with ``#`` and
        runs up to the next whitespace character. ``nan`` is returned when
        there are none or when ``string`` is not text (for example a
        missing value).
    """
    # A non-string value cannot contain hashtags; report it the same way
    # as "nothing found" instead of letting re raise TypeError.
    if not isinstance(string, str):
        return nan
    # Raw string: \S must reach the regex engine untouched rather than be
    # read as an (invalid) Python string escape.
    found = re.findall(r'#\S+', string)
    return found if found else nan

In [211]:
# tweets['hashtags'] = tweets['text'].apply(lambda x: hashtags(x))
# tweets.head()

### Collect Emojis in text
Identify all emojis using their Unicode values<br>
Create a new feature containing all emojis<br>
Remove all emojis from text - done in next section<br>
Note: Instead of collecting all emojis, we could define our own dictionary of the emojis that are likely to be useful

In [212]:
from cmath import nan
import advertools as adv
def extract_emojis(string):
    """Collect the emojis that occur in a piece of text.

    The text is wrapped in a one-element list and handed to
    ``adv.extract_emoji``; the first entry of the result's ``'emoji'`` field
    is taken as the emoji list for that text.

    Returns that list, or ``nan`` when it is empty.
    """
    found = adv.extract_emoji([string])['emoji'][0]
    return nan if found == [] else found

In [213]:
# For if an error saying float can not be changed to lower is called!
#tweets.text=tweets.text.astype(str)

In [214]:
# tweets['emojis'] = tweets['text'].apply(lambda x: extract_emojis(x))
# tweets.head()

### Collect Links in text
Identify all links by matching URLs<br>
Create a new feature containing all links<br>
Remove all links from text - done in next section<br>
TODO: check whether links that start with 'www' (no http/https prefix) should be matched as well

In [215]:
def find_urls(string):
    """Return the first http(s) URL found in a tweet.

    Parameters
    ----------
    string : str
        Raw tweet text.

    Returns
    -------
    str or float
        The first substring that starts with ``http://`` or ``https://`` and
        runs up to the next whitespace character. ``nan`` is returned when
        no URL is present or when ``string`` is not text.
    """
    # Non-text input used to be absorbed by a bare ``except``; handle it
    # explicitly so that unrelated errors are no longer silenced.
    if not isinstance(string, str):
        return nan
    # Raw string: \S must reach the regex engine untouched.
    match = re.search(r"(?P<url>https?://\S+)", string)
    if match is None:
        return nan
    return match.group("url")

In [216]:
# tweets['urls'] = tweets['text'].apply(lambda x: find_urls(x))
# tweets.head()

In [217]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 0 to 60
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          61 non-null     int64 
 1   author_followers  61 non-null     int64 
 2   text              61 non-null     object
 3   created_at        61 non-null     object
 4   retweets          61 non-null     int64 
 5   replies           61 non-null     int64 
 6   likes             61 non-null     int64 
 7   quote_count       61 non-null     int64 
 8   place_id          61 non-null     object
 9   place_name        61 non-null     object
 10  bbox              61 non-null     object
dtypes: int64(6), object(5)
memory usage: 5.7+ KB


### Remove Unwanted Information and Clean Tweet text
To Clean Text:
* Convert to Lowercase
* Tokenise
* Tag Text
* Lemmatise Text

This includes:
* @mentions
* URLs
* Hashtags
* Emojis
* Punctuation
* Numbers
* Stop Words
* Single Letter Words
* Empty Tokens


In [218]:
# Compiled character class that matches one or more consecutive code points
# from the ranges listed below; used to strip emojis from tweet text.
# NOTE(review): the last range runs from U+24C2 all the way to U+1F251, so it
# also matches every code point in between (CJK ideographs among them) and
# already contains the U+2702-U+27B0 and U+1F1E0-U+1F1FF ranges listed before it.
emoji_pattern = re.compile(
    "[{}]+".format("".join([
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    ])),
    flags=re.UNICODE,
)

In [219]:
# Translate a POS tag into the matching WordNet part-of-speech constant:
def get_wordnet_pos(pos_tag):
    """Map a POS tag string to a ``wordnet`` part-of-speech constant.

    Tags starting with 'J' map to ``wordnet.ADJ``, 'V' to ``wordnet.VERB``
    and 'R' to ``wordnet.ADV``. Every other tag, including those starting
    with 'N', maps to ``wordnet.NOUN``.
    """
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    if pos_tag.startswith('V'):
        return wordnet.VERB
    if pos_tag.startswith('R'):
        return wordnet.ADV
    # Nouns and anything unrecognised share the same result.
    return wordnet.NOUN


# Define the main function to clean text in various ways:
def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lemmas.

    Steps, in order: strip @mentions, URLs, hashtags and emojis; lowercase;
    split into tokens; trim surrounding ASCII punctuation; drop tokens that
    contain a digit, English stop words and empty tokens; POS-tag and
    lemmatise; finally drop one-character tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (possibly empty).
    """
    # Regex clean-up runs on the raw string, before it is split into tokens.
    # Patterns are raw strings so \S and \. reach the regex engine untouched.
    # 1. remove @usernames
    text = re.sub(r'@\S+', '', text)

    # 2. remove URLs
    text = re.sub(r'((www\.\S+)|(https?://\S+))', '', text)

    # 3. remove hashtags entirely i.e. #hashtags
    text = re.sub(r'#(\S+)', '', text)

    # 4. remove emojis
    text = emoji_pattern.sub(r'', text)

    # 5. convert text to lowercase
    text = text.lower()

    # 6. tokenize and trim punctuation. split() with no argument breaks on
    #    any run of whitespace, so words separated by a newline or tab no
    #    longer end up glued together in one token.
    tokens = [word.strip(string.punctuation) for word in text.split()]

    # 7. remove tokens that contain a digit
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]

    # 8. remove stop words (a set gives constant-time membership tests)
    stop = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop]

    # 9. remove empty tokens left behind by the punctuation trim
    tokens = [word for word in tokens if len(word) > 0]

    # 10. pos tag text and lemmatize text, reusing a single lemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # 11. remove words with only one letter
    tokens = [word for word in tokens if len(word) > 1]

    # join all
    return " ".join(tokens)

In [220]:
# Build the 'cleaned_text' column by running clean_text over every raw tweet:
tweets['cleaned_text'] = tweets['text'].apply(clean_text)
tweets.head()

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox,cleaned_text
0,923322201125916672,1074,Idk if these niggas are singing struggle songs...,2017-10-25 22:55:41+00:00,0,0,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa","[27.7518557, -26.5126489, 28.1843404, -26.0396...",idk nigga sing struggle song coz they’re drunk...
1,923320610373107712,3938,@Pali_Matli So nawe stop associating Wits with...,2017-10-25 22:49:22+00:00,0,0,0,0,{'place_id': '810e12ec4af983de'},"Bloemfontein, South Africa","[26.0019056, -29.2319585, 26.4243136, -28.9812...","nawe stop associate wit ufs,because also know ..."
2,923287512633798656,60191,Please RT for Awareness.This kid deserves free...,2017-10-25 20:37:51+00:00,0,0,1,0,{'place_id': 'd3d46bdc072ad347'},"Vryburg, South Africa","[24.6565185, -27.0016931, 24.7703747, -26.894228]",please rt awareness.this kid deserve free educ...
3,923282795048460288,2322,No vision for what or where the institution is...,2017-10-25 20:19:06+00:00,0,1,1,0,{'place_id': '8b9ec16fdc0d7e55'},"Cape Town, South Africa","[18.3180332, -34.35839, 18.6600898, -33.8849254]",vision institution go struggle locate pulse uc...
4,923243015740837893,796552,UCT students have disrupted some classes on th...,2017-10-25 17:41:02+00:00,1,0,3,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa","[27.7518557, -26.5126489, 28.1843404, -26.0396...",uct student disrupt class institution's main c...


We could now remove the original tweet text, because the cleaned text is all that the later steps need.
However, we keep it for content analysis, so the drop below stays commented out.

In [221]:
#tweets = tweets.drop('text', axis = 1)

In [222]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 0 to 60
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          61 non-null     int64 
 1   author_followers  61 non-null     int64 
 2   text              61 non-null     object
 3   created_at        61 non-null     object
 4   retweets          61 non-null     int64 
 5   replies           61 non-null     int64 
 6   likes             61 non-null     int64 
 7   quote_count       61 non-null     int64 
 8   place_id          61 non-null     object
 9   place_name        61 non-null     object
 10  bbox              61 non-null     object
 11  cleaned_text      61 non-null     object
dtypes: int64(6), object(6)
memory usage: 6.2+ KB


### Convert Bounding Box to a set of coordinates of Latitude and Longitude
There are a few ways of doing this:
* Take an average and find the middle of the bounding box
* Identify where the location is using another API based on place_name
* Keep the location as a polygon and then place each user into the municipality in which the area is largest

The way we will do it here is by using the centroid of the bounding box

In [225]:
def bbox_to_coords(bbox):
    """Parse a bounding box stored as text into a list of floats.

    Parameters
    ----------
    bbox : str
        Text such as ``"[27.75, -26.51, 28.18, -26.03]"``.

    Returns
    -------
    list of float
        The numbers in the order they appear; an empty list when the text
        holds no numbers.

    Raises
    ------
    ValueError
        If a piece between separators is not a valid float.
    """
    # Turn brackets and commas into separators rather than deleting them, so
    # that "[1,2,3,4]" (no space after the commas) yields four numbers
    # instead of being fused into the single value 1234.0.
    separated = bbox.replace('[', ' ').replace(']', ' ').replace(',', ' ')
    return [float(piece) for piece in separated.split()]

In [226]:
# Parse every bbox string into a list of floats.
tweets['coords'] = tweets['bbox'].apply(bbox_to_coords)

In [227]:
# Spread the first four entries of each 'coords' list over their own columns.
corner_columns = ['longitude_1', 'latitude_1', 'longitude_2', 'latitude_2']
for position, column in enumerate(corner_columns):
    # i=position binds the current index into the lambda.
    tweets[column] = tweets['coords'].apply(lambda x, i=position: x[i])

In [228]:
tweets.head(3)

Unnamed: 0.1,Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,bbox,cleaned_text,coords,longitude_1,latitude_1,longitude_2,latitude_2
0,0,923322201125916672,1074,Idk if these niggas are singing struggle songs...,2017-10-25 22:55:41+00:00,0,0,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa","[27.7518557, -26.5126489, 28.1843404, -26.0396...",idk nigga sing struggle song coz they’re drunk...,"[27.7518557, -26.5126489, 28.1843404, -26.0396...",27.751856,-26.512649,28.18434,-26.039628
1,1,923320610373107712,3938,@Pali_Matli So nawe stop associating Wits with...,2017-10-25 22:49:22+00:00,0,0,0,0,{'place_id': '810e12ec4af983de'},"Bloemfontein, South Africa","[26.0019056, -29.2319585, 26.4243136, -28.9812...","nawe stop associate wit ufs,because also know ...","[26.0019056, -29.2319585, 26.4243136, -28.9812...",26.001906,-29.231959,26.424314,-28.981218
2,2,923287512633798656,60191,Please RT for Awareness.This kid deserves free...,2017-10-25 20:37:51+00:00,0,0,1,0,{'place_id': 'd3d46bdc072ad347'},"Vryburg, South Africa","[24.6565185, -27.0016931, 24.7703747, -26.894228]",please rt awareness.this kid deserve free educ...,"[24.6565185, -27.0016931, 24.7703747, -26.894228]",24.656519,-27.001693,24.770375,-26.894228


In [229]:
# The raw bbox text is no longer needed once 'coords' has been derived from it.
tweets = tweets.drop(columns=['bbox'])

In [230]:
def find_centroid(coords):
    """Return the centroid of the line joining two corner points.

    ``coords[0], coords[1]`` is taken as the first point and
    ``coords[2], coords[3]`` as the second. The result is the ``centroid``
    of the ``LineString`` drawn between them.
    """
    first_corner = (coords[0], coords[1])
    second_corner = (coords[2], coords[3])
    return LineString([first_corner, second_corner]).centroid

In [231]:
# Compute one centroid per row from its 'coords' list.
tweets['centroid'] = tweets['coords'].apply(find_centroid)

In [232]:
# Pull the x and y attributes of each centroid out into plain columns.
tweets['centroid_long'] = [point.x for point in tweets['centroid']]
tweets['centroid_lat'] = [point.y for point in tweets['centroid']]

In [233]:
# The centroid objects have been unpacked into centroid_long / centroid_lat.
tweets = tweets.drop(columns=['centroid'])
tweets.head(3)

Unnamed: 0.1,Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,cleaned_text,coords,longitude_1,latitude_1,longitude_2,latitude_2,centroid_long,centroid_lat
0,0,923322201125916672,1074,Idk if these niggas are singing struggle songs...,2017-10-25 22:55:41+00:00,0,0,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",idk nigga sing struggle song coz they’re drunk...,"[27.7518557, -26.5126489, 28.1843404, -26.0396...",27.751856,-26.512649,28.18434,-26.039628,27.968098,-26.276139
1,1,923320610373107712,3938,@Pali_Matli So nawe stop associating Wits with...,2017-10-25 22:49:22+00:00,0,0,0,0,{'place_id': '810e12ec4af983de'},"Bloemfontein, South Africa","nawe stop associate wit ufs,because also know ...","[26.0019056, -29.2319585, 26.4243136, -28.9812...",26.001906,-29.231959,26.424314,-28.981218,26.21311,-29.106588
2,2,923287512633798656,60191,Please RT for Awareness.This kid deserves free...,2017-10-25 20:37:51+00:00,0,0,1,0,{'place_id': 'd3d46bdc072ad347'},"Vryburg, South Africa",please rt awareness.this kid deserve free educ...,"[24.6565185, -27.0016931, 24.7703747, -26.894228]",24.656519,-27.001693,24.770375,-26.894228,24.713447,-26.947961


### Fix Edge Cases
* Cape Town
* Betty's Bay
* Bloubergstrand
* Mdumbi Beach 
<br>
Coordinates taken from: https://www.distancesto.com/coordinates/za/bloubergstrand-latitude-longitude/history/76385.html

In [234]:
# Hand-entered (centroid_long, centroid_lat) values that replace the computed
# centroid for the places listed here.
centroid_overrides = {
    'Cape Town, South Africa': (18.4241, -33.9249),
    'Mdumbi Beach': (29.215369, -31.933896),
    "Betty's Bay, South Africa": (18.92051, -34.34747),
    'Bloubergstrand': (18.46173, -33.800418),
}
for place, long_lat in centroid_overrides.items():
    tweets.loc[tweets.place_name == place, ['centroid_long', 'centroid_lat']] = long_lat

In [235]:
# One shapely Point per tweet, built from the (longitude, latitude) centroid.
geometry = [Point(xy) for xy in zip(tweets['centroid_long'], tweets['centroid_lat'])]
gdf = GeoDataFrame(tweets, geometry=geometry)
# Later cells read tweets['geometry']. Copy the column over explicitly rather
# than relying on the GeoDataFrame constructor adding it to `tweets` as a
# side effect, which is not guaranteed to happen.
tweets['geometry'] = gdf.geometry

In [236]:
# Remove the intermediate coordinate columns in a single call.
tweets = tweets.drop(columns=[
    'longitude_1',
    'longitude_2',
    'latitude_1',
    'latitude_2',
    'centroid_long',
    'centroid_lat',
])
tweets.head(2)

Unnamed: 0.1,Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,cleaned_text,coords,geometry
0,0,923322201125916672,1074,Idk if these niggas are singing struggle songs...,2017-10-25 22:55:41+00:00,0,0,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",idk nigga sing struggle song coz they’re drunk...,"[27.7518557, -26.5126489, 28.1843404, -26.0396...",POINT (27.96810 -26.27614)
1,1,923320610373107712,3938,@Pali_Matli So nawe stop associating Wits with...,2017-10-25 22:49:22+00:00,0,0,0,0,{'place_id': '810e12ec4af983de'},"Bloemfontein, South Africa","nawe stop associate wit ufs,because also know ...","[26.0019056, -29.2319585, 26.4243136, -28.9812...",POINT (26.21311 -29.10659)


### Remove Useless locations
Drop rows where the location is just "South Africa", since this location is not specific enough and offers little value.

In [237]:
tweets.shape

(61, 14)

In [238]:
# Keep only rows whose place_name is something other than the bare country name.
tweets = tweets.loc[tweets['place_name'] != 'South Africa']

In [239]:
tweets.shape

(61, 14)

In [240]:
tweets.head(2)

Unnamed: 0.1,Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,cleaned_text,coords,geometry
0,0,923322201125916672,1074,Idk if these niggas are singing struggle songs...,2017-10-25 22:55:41+00:00,0,0,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",idk nigga sing struggle song coz they’re drunk...,"[27.7518557, -26.5126489, 28.1843404, -26.0396...",POINT (27.96810 -26.27614)
1,1,923320610373107712,3938,@Pali_Matli So nawe stop associating Wits with...,2017-10-25 22:49:22+00:00,0,0,0,0,{'place_id': '810e12ec4af983de'},"Bloemfontein, South Africa","nawe stop associate wit ufs,because also know ...","[26.0019056, -29.2319585, 26.4243136, -28.9812...",POINT (26.21311 -29.10659)


In [241]:
# Drop the parsed coordinate lists before the frame is written out.
tweets = tweets.drop(columns=['coords'])

In [242]:
# Save the fully cleaned tweets. No index=False is passed, so the index is
# written as well and comes back as an unnamed leading column on re-read.
tweets.to_csv('DATA/Clean_Tweets_All_Info.csv')

### Set up the tweet dataframe as a normalised relational database
Use the following tables:
* Tweet Database - Contains all the information about each tweet
* Location Database - Contains all the information about each location
* Author Database - Contains all the information specific to each author
* Relational Database - Contains the relations between all the above databases

#### Location Database

In [243]:
# Location table: place_id, place_name and geometry copied from `tweets`,
# reduced to distinct rows and then indexed by place_id.
locations = GeoDataFrame()
locations['place_id'] = tweets['place_id']
locations['place_name'] = tweets['place_name']
# locations['coords'] = tweets['coords']
locations['geometry'] = tweets['geometry']
print(locations.shape)
# De-duplicate on the string form of each row, then select the surviving rows
# from the original frame by index label so the real (non-string) values are kept.
# NOTE(review): presumably astype(str) is used because the geometry values
# cannot be de-duplicated directly - confirm.
locations = locations.loc[locations.astype(str).drop_duplicates().index]
print(locations.shape)
locations = locations.set_index('place_id')
locations.head(3)


(61, 3)
(15, 3)


Unnamed: 0_level_0,place_name,geometry
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1
{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",POINT (27.96810 -26.27614)
{'place_id': '810e12ec4af983de'},"Bloemfontein, South Africa",POINT (26.21311 -29.10659)
{'place_id': 'd3d46bdc072ad347'},"Vryburg, South Africa",POINT (24.71345 -26.94796)


In [244]:
locations.shape

(15, 2)

#### Tweet Database

In [245]:
# Tweet table: every column except the location ones, indexed by tweet_id.
location_columns = ['place_name', 'geometry', 'place_id']
just_tweets = tweets.drop(columns=location_columns).set_index('tweet_id')
just_tweets.head(3)

Unnamed: 0_level_0,Unnamed: 0,author_followers,text,created_at,retweets,replies,likes,quote_count,cleaned_text
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
923322201125916672,0,1074,Idk if these niggas are singing struggle songs...,2017-10-25 22:55:41+00:00,0,0,0,0,idk nigga sing struggle song coz they’re drunk...
923320610373107712,1,3938,@Pali_Matli So nawe stop associating Wits with...,2017-10-25 22:49:22+00:00,0,0,0,0,"nawe stop associate wit ufs,because also know ..."
923287512633798656,2,60191,Please RT for Awareness.This kid deserves free...,2017-10-25 20:37:51+00:00,0,0,1,0,please rt awareness.this kid deserve free educ...


In [246]:
just_tweets.shape

(61, 9)

#### Relational Database

In [247]:
# Link table: which place each tweet belongs to, indexed by tweet_id.
relations = pd.DataFrame({
    'tweet_id': tweets['tweet_id'],
    'place_id': tweets['place_id'],
})
# Keep the first occurrence of each distinct (tweet_id, place_id) pair,
# compared on their string form.
unique_rows = relations.astype(str).drop_duplicates().index
relations = relations.loc[unique_rows].set_index('tweet_id')
relations.head()

Unnamed: 0_level_0,place_id
tweet_id,Unnamed: 1_level_1
923322201125916672,{'place_id': 'e564d30dc173d2a8'}
923320610373107712,{'place_id': '810e12ec4af983de'}
923287512633798656,{'place_id': 'd3d46bdc072ad347'}
923282795048460288,{'place_id': '8b9ec16fdc0d7e55'}
923243015740837893,{'place_id': 'e564d30dc173d2a8'}


In [248]:
relations.shape

(61, 1)

#### Create csv for each database

In [249]:
# Write each table to its own CSV. All three frames were given an explicit
# index (tweet_id or place_id) above, and that index is written as the first column.
# NOTE(review): assumes DATA/Relational_Databases/ already exists - confirm.
just_tweets.to_csv('DATA/Relational_Databases/just_tweets.csv')
locations.to_csv('DATA/Relational_Databases/locations.csv')
relations.to_csv('DATA/Relational_Databases/relations.csv')

### Possible Cleaning Still To Do
Ensure that there are no NaN values. This can be done either by filling in a computed value (for example an average or similar),<br>
or by filling in a placeholder such as n/a to indicate that no value is available or was provided.