# Tweet Cleaning

### Import Libraries

In [286]:
import numpy as np
import pandas as pd
import os

# Word processing libraries
import re
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

# Widen the size of each cell
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

#Nan
from cmath import nan

  from IPython.core.display import display, HTML


### Read Tweets from CSV - Provided by scraping

In [340]:
df = pd.read_csv('Dummy_Tweets2.csv')

In [342]:
df.shape

(9887, 20)

In [343]:
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,likes,quote_count,place_id,place_id_check,place_name,country_code,country,place_type,bbox
0,1284693824887906304,LonnaMarie726,76,2441,Organized Chaos. Wild Unconditional Love. A Fi...,"Seattle, WA",@seattletimes Martin Luther King Jr was arrest...,2022-09-16 23:59:51+00:00,0,0,0,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]"
1,1483912172,ajthompson13,4312,91662,"Investor NZ Manufacturing, author 'Life Changi...",Auckland,@Pongochch @MatthewHootonNZ @simonwilson I don...,2022-09-16 23:59:42+00:00,0,2,0,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]"
2,1525319286985609217,DennaouiHibe,9,1580,Ey azgin Kamalistler..!\nEger secmek zorunda k...,,@PalEvePlus You have to also protest Israel fo...,2022-09-16 23:59:40+00:00,0,0,0,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]"
3,1502792954640887811,Fuckronaut666,5,668,Gymnosophist. Ekranoplan enthusiast. Here. Now.,,@Pal_action Aww it's so cute you think you shu...,2022-09-16 23:59:29+00:00,0,0,1,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]"
4,1466199598515441668,oz_quality,31,1088,winning,,@XRPPHOENIXX But is it going to be a protest f...,2022-09-16 23:59:26+00:00,0,1,6,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]"


In [344]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9887 entries, 0 to 9886
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   author_id           9887 non-null   int64 
 1   username            9887 non-null   object
 2   author_followers    9887 non-null   int64 
 3   author_tweets       9887 non-null   int64 
 4   author_description  8273 non-null   object
 5   author_location     6289 non-null   object
 6   text                9887 non-null   object
 7   created_at          9887 non-null   object
 8   retweets            9887 non-null   int64 
 9   replies             9887 non-null   int64 
 10  likes               9887 non-null   int64 
 11  quote_count         9887 non-null   int64 
 12  place_id            160 non-null    object
 13  place_id_check      9887 non-null   object
 14  place_name          9887 non-null   object
 15  country_code        9887 non-null   object
 16  country             9887

### Remove Duplicates
If entry is the same then drop it</br>
There should't be duplicates

In [345]:
print('Initial size of dataset before dropping duplicated rows:', df.shape)
df.drop_duplicates(keep = False, inplace = True)

print('Current size of dataset after dropping duplicated rows, if any, is:', df.shape)
print(df.head())

Initial size of dataset before dropping duplicated rows: (9887, 19)
Current size of dataset after dropping duplicated rows, if any, is: (9887, 19)
             author_id       username  author_followers  author_tweets  \
0  1284693824887906304  LonnaMarie726                76           2441   
1           1483912172   ajthompson13              4312          91662   
2  1525319286985609217   DennaouiHibe                 9           1580   
3  1502792954640887811  Fuckronaut666                 5            668   
4  1466199598515441668     oz_quality                31           1088   

                                  author_description author_location  \
0  Organized Chaos. Wild Unconditional Love. A Fi...     Seattle, WA   
1  Investor NZ Manufacturing, author 'Life Changi...        Auckland   
2  Ey azgin Kamalistler..!\nEger secmek zorunda k...             NaN   
3    Gymnosophist. Ekranoplan enthusiast. Here. Now.             NaN   
4                                            win

### Remove Empty Tweets
If tweet content is empty/Nan then drop it

In [346]:
df.dropna(subset = ['text'], inplace = True)

In [347]:
len(df)

9887

### Collect @Users in Text
Identify all mentions of other users using @ </br>
Create new feature containg all mentions (@s)</br>
Remove all mentions from text - done in next section

In [348]:
def mentioned_users(string):
    usernames = re.findall('@[^\s]+', string)
    if usernames == []:
        return nan
    return usernames

In [349]:
df['mentioned_users'] = df['text'].apply(lambda x: mentioned_users(x))
df.head()

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,likes,quote_count,place_id,place_id_check,place_name,country_code,country,place_type,bbox,mentioned_users
0,1284693824887906304,LonnaMarie726,76,2441,Organized Chaos. Wild Unconditional Love. A Fi...,"Seattle, WA",@seattletimes Martin Luther King Jr was arrest...,2022-09-16 23:59:51+00:00,0,0,0,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@seattletimes]
1,1483912172,ajthompson13,4312,91662,"Investor NZ Manufacturing, author 'Life Changi...",Auckland,@Pongochch @MatthewHootonNZ @simonwilson I don...,2022-09-16 23:59:42+00:00,0,2,0,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]","[@Pongochch, @MatthewHootonNZ, @simonwilson]"
2,1525319286985609217,DennaouiHibe,9,1580,Ey azgin Kamalistler..!\nEger secmek zorunda k...,,@PalEvePlus You have to also protest Israel fo...,2022-09-16 23:59:40+00:00,0,0,0,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@PalEvePlus]
3,1502792954640887811,Fuckronaut666,5,668,Gymnosophist. Ekranoplan enthusiast. Here. Now.,,@Pal_action Aww it's so cute you think you shu...,2022-09-16 23:59:29+00:00,0,0,1,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@Pal_action]
4,1466199598515441668,oz_quality,31,1088,winning,,@XRPPHOENIXX But is it going to be a protest f...,2022-09-16 23:59:26+00:00,0,1,6,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@XRPPHOENIXX]


### Collect #Hashtags in Text
Identify all hashtags using # </br>
Create new feature containg all hashtags (#s)</br>
Remove all hashtags from text - done in next section

In [350]:
def hashtags(string):
    hashtags = re.findall('#[^\s]+', string)
    if hashtags == []:
        return nan
    return hashtags

In [351]:
df['hashtags'] = df['text'].apply(lambda x: hashtags(x))
df.head()

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,...,quote_count,place_id,place_id_check,place_name,country_code,country,place_type,bbox,mentioned_users,hashtags
0,1284693824887906304,LonnaMarie726,76,2441,Organized Chaos. Wild Unconditional Love. A Fi...,"Seattle, WA",@seattletimes Martin Luther King Jr was arrest...,2022-09-16 23:59:51+00:00,0,0,...,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@seattletimes],
1,1483912172,ajthompson13,4312,91662,"Investor NZ Manufacturing, author 'Life Changi...",Auckland,@Pongochch @MatthewHootonNZ @simonwilson I don...,2022-09-16 23:59:42+00:00,0,2,...,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]","[@Pongochch, @MatthewHootonNZ, @simonwilson]",
2,1525319286985609217,DennaouiHibe,9,1580,Ey azgin Kamalistler..!\nEger secmek zorunda k...,,@PalEvePlus You have to also protest Israel fo...,2022-09-16 23:59:40+00:00,0,0,...,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@PalEvePlus],[#SabraAndShatilla]
3,1502792954640887811,Fuckronaut666,5,668,Gymnosophist. Ekranoplan enthusiast. Here. Now.,,@Pal_action Aww it's so cute you think you shu...,2022-09-16 23:59:29+00:00,0,0,...,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@Pal_action],
4,1466199598515441668,oz_quality,31,1088,winning,,@XRPPHOENIXX But is it going to be a protest f...,2022-09-16 23:59:26+00:00,0,1,...,0,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@XRPPHOENIXX],


### Collect Emojis in text
Identify all emojis using unicode value</br>
Create new feature containg all emojis</br>
Remove all emojis from text - done in next section</br>
Note: We could identify our own emojis that could be useful instead of all emojis and put them in a dictionary

In [352]:
from cmath import nan
import advertools as adv
def extract_emojis(string):
    list = [string]
    emoji_dict = adv.extract_emoji(list)
    emojis = emoji_dict['emoji'][0]
    if(emojis == []):
        return nan
    return emojis

In [353]:
# For if an error saying float can not be changed to lower is called!
#df.text=df.text.astype(str)

In [354]:
df['emojis'] = df['text'].apply(lambda x: extract_emojis(x))
df.head()

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,...,place_id,place_id_check,place_name,country_code,country,place_type,bbox,mentioned_users,hashtags,emojis
0,1284693824887906304,LonnaMarie726,76,2441,Organized Chaos. Wild Unconditional Love. A Fi...,"Seattle, WA",@seattletimes Martin Luther King Jr was arrest...,2022-09-16 23:59:51+00:00,0,0,...,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@seattletimes],,
1,1483912172,ajthompson13,4312,91662,"Investor NZ Manufacturing, author 'Life Changi...",Auckland,@Pongochch @MatthewHootonNZ @simonwilson I don...,2022-09-16 23:59:42+00:00,0,2,...,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]","[@Pongochch, @MatthewHootonNZ, @simonwilson]",,
2,1525319286985609217,DennaouiHibe,9,1580,Ey azgin Kamalistler..!\nEger secmek zorunda k...,,@PalEvePlus You have to also protest Israel fo...,2022-09-16 23:59:40+00:00,0,0,...,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@PalEvePlus],[#SabraAndShatilla],
3,1502792954640887811,Fuckronaut666,5,668,Gymnosophist. Ekranoplan enthusiast. Here. Now.,,@Pal_action Aww it's so cute you think you shu...,2022-09-16 23:59:29+00:00,0,0,...,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@Pal_action],,
4,1466199598515441668,oz_quality,31,1088,winning,,@XRPPHOENIXX But is it going to be a protest f...,2022-09-16 23:59:26+00:00,0,1,...,,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@XRPPHOENIXX],,


### Collect Links in text
Identify all links using a URL</br>
Create new feature containg all Links</br>
Remove all links from text - done in next section</br>
Maybe look into if 'www' syntax must also be used

In [355]:
def find_urls(string):
    try:
        urls = re.search("(?P<url>https?://[^\s]+)", string).group("url")
    except:
        return nan
    return urls

In [356]:
df['urls'] = df['text'].apply(lambda x: find_urls(x))
df.head()

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,...,place_id_check,place_name,country_code,country,place_type,bbox,mentioned_users,hashtags,emojis,urls
0,1284693824887906304,LonnaMarie726,76,2441,Organized Chaos. Wild Unconditional Love. A Fi...,"Seattle, WA",@seattletimes Martin Luther King Jr was arrest...,2022-09-16 23:59:51+00:00,0,0,...,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@seattletimes],,,
1,1483912172,ajthompson13,4312,91662,"Investor NZ Manufacturing, author 'Life Changi...",Auckland,@Pongochch @MatthewHootonNZ @simonwilson I don...,2022-09-16 23:59:42+00:00,0,2,...,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]","[@Pongochch, @MatthewHootonNZ, @simonwilson]",,,
2,1525319286985609217,DennaouiHibe,9,1580,Ey azgin Kamalistler..!\nEger secmek zorunda k...,,@PalEvePlus You have to also protest Israel fo...,2022-09-16 23:59:40+00:00,0,0,...,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@PalEvePlus],[#SabraAndShatilla],,
3,1502792954640887811,Fuckronaut666,5,668,Gymnosophist. Ekranoplan enthusiast. Here. Now.,,@Pal_action Aww it's so cute you think you shu...,2022-09-16 23:59:29+00:00,0,0,...,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@Pal_action],,,
4,1466199598515441668,oz_quality,31,1088,winning,,@XRPPHOENIXX But is it going to be a protest f...,2022-09-16 23:59:26+00:00,0,1,...,5d058f2e9fe1516c,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@XRPPHOENIXX],,,


In [357]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9887 entries, 0 to 9886
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   author_id           9887 non-null   int64 
 1   username            9887 non-null   object
 2   author_followers    9887 non-null   int64 
 3   author_tweets       9887 non-null   int64 
 4   author_description  8273 non-null   object
 5   author_location     6289 non-null   object
 6   text                9887 non-null   object
 7   created_at          9887 non-null   object
 8   retweets            9887 non-null   int64 
 9   replies             9887 non-null   int64 
 10  likes               9887 non-null   int64 
 11  quote_count         9887 non-null   int64 
 12  place_id            160 non-null    object
 13  place_id_check      9887 non-null   object
 14  place_name          9887 non-null   object
 15  country_code        9887 non-null   object
 16  country             9887

### Remove Unwanted Information and Clean Tweet text
To Clean Text:
* Convert to Lowercase
* Tokenise
* Tag Text
* Lemmatise Text

This includes:
* @mentions
* URLs
* Hashtags
* Emojis
* Punctuation
* Numbers
* Stop Words
* Single Letter Words
* Empty Tokens


In [358]:
# Define Emoji_patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

In [359]:
# Define the function to implement POS tagging:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


# Define the main function to clean text in various ways:
def clean_text(text):
    
    # Apply regex expressions first before converting string to list of tokens/words:
    # 1. remove @usernames
    text = re.sub('@[^\s]+', '', text)
    
    # 2. remove URLs
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', text)
    
    # 3. remove hashtags entirely i.e. #hashtags
    text = re.sub(r'#([^\s]+)', '', text)
    
    # 4. remove emojis
    text = emoji_pattern.sub(r'', text)
    
    # 5. Convert text to lowercase
    text = text.lower()
    
    # 6. tokenize text and remove punctuation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    
    # 7. remove numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    
    # 8. remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    
    # 9. remove empty tokens
    text = [t for t in text if len(t) > 0]
    
    # 10. pos tag text and lemmatize text
    pos_tags = pos_tag(text)
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    
    # 11. remove words with only one letter
    text = [t for t in text if len(t) > 1]
    
    # join all
    text = " ".join(text)
    
    return(text)

In [360]:
# Apply function on the column 'text':
df['cleaned_text'] = df['text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,...,place_name,country_code,country,place_type,bbox,mentioned_users,hashtags,emojis,urls,cleaned_text
0,1284693824887906304,LonnaMarie726,76,2441,Organized Chaos. Wild Unconditional Love. A Fi...,"Seattle, WA",@seattletimes Martin Luther King Jr was arrest...,2022-09-16 23:59:51+00:00,0,0,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@seattletimes],,,,martin luther king jr arrest jailed time persp...
1,1483912172,ajthompson13,4312,91662,"Investor NZ Manufacturing, author 'Life Changi...",Auckland,@Pongochch @MatthewHootonNZ @simonwilson I don...,2022-09-16 23:59:42+00:00,0,2,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]","[@Pongochch, @MatthewHootonNZ, @simonwilson]",,,,don’t blame reason pull valid protest wise tha...
2,1525319286985609217,DennaouiHibe,9,1580,Ey azgin Kamalistler..!\nEger secmek zorunda k...,,@PalEvePlus You have to also protest Israel fo...,2022-09-16 23:59:40+00:00,0,0,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@PalEvePlus],[#SabraAndShatilla],,,also protest israel let jewish fiction player ...
3,1502792954640887811,Fuckronaut666,5,668,Gymnosophist. Ekranoplan enthusiast. Here. Now.,,@Pal_action Aww it's so cute you think you shu...,2022-09-16 23:59:29+00:00,0,0,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@Pal_action],,,,aww cute think shut israeli arm company anyway...
4,1466199598515441668,oz_quality,31,1088,winning,,@XRPPHOENIXX But is it going to be a protest f...,2022-09-16 23:59:26+00:00,0,1,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@XRPPHOENIXX],,,,go protest people don’t energy heating protest...


In [361]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9887 entries, 0 to 9886
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   author_id           9887 non-null   int64 
 1   username            9887 non-null   object
 2   author_followers    9887 non-null   int64 
 3   author_tweets       9887 non-null   int64 
 4   author_description  8273 non-null   object
 5   author_location     6289 non-null   object
 6   text                9887 non-null   object
 7   created_at          9887 non-null   object
 8   retweets            9887 non-null   int64 
 9   replies             9887 non-null   int64 
 10  likes               9887 non-null   int64 
 11  quote_count         9887 non-null   int64 
 12  place_id            160 non-null    object
 13  place_id_check      9887 non-null   object
 14  place_name          9887 non-null   object
 15  country_code        9887 non-null   object
 16  country             9887

In [375]:
df.drop('place_id', axis=1)
df.head(3)

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,...,place_name,country_code,country,place_type,bbox,mentioned_users,hashtags,emojis,urls,cleaned_text
0,1284693824887906304,LonnaMarie726,76,2441,Organized Chaos. Wild Unconditional Love. A Fi...,"Seattle, WA",@seattletimes Martin Luther King Jr was arrest...,2022-09-16 23:59:51+00:00,0,0,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@seattletimes],,,,martin luther king jr arrest jailed time persp...
1,1483912172,ajthompson13,4312,91662,"Investor NZ Manufacturing, author 'Life Changi...",Auckland,@Pongochch @MatthewHootonNZ @simonwilson I don...,2022-09-16 23:59:42+00:00,0,2,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]","[@Pongochch, @MatthewHootonNZ, @simonwilson]",,,,don’t blame reason pull valid protest wise tha...
2,1525319286985609217,DennaouiHibe,9,1580,Ey azgin Kamalistler..!\nEger secmek zorunda k...,,@PalEvePlus You have to also protest Israel fo...,2022-09-16 23:59:40+00:00,0,0,...,Halifax,CA,Canada,city,"[-64.237659, 43.366298, -59.385802, 45.27617]",[@PalEvePlus],[#SabraAndShatilla],,,also protest israel let jewish fiction player ...


In [374]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9887 entries, 0 to 9886
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   author_id           9887 non-null   int64 
 1   username            9887 non-null   object
 2   author_followers    9887 non-null   int64 
 3   author_tweets       9887 non-null   int64 
 4   author_description  8273 non-null   object
 5   author_location     6289 non-null   object
 6   text                9887 non-null   object
 7   created_at          9887 non-null   object
 8   retweets            9887 non-null   int64 
 9   replies             9887 non-null   int64 
 10  likes               9887 non-null   int64 
 11  quote_count         9887 non-null   int64 
 12  place_id            160 non-null    object
 13  place_id_check      9887 non-null   object
 14  place_name          9887 non-null   object
 15  country_code        9887 non-null   object
 16  country             9887

### Convert Boundry Box to a set of coordinates of Latitude and Longitude
There are a few ways of doing this
* Take an average and find the middle of the Boundry Box
* Indentify where the location is using another API based on place_name
* Keep the location as a polygon and then place each user into a munucipality in hich the area is largest

In [372]:
df.to_csv('Dummy_Tweets2_cleaned.csv')

### Possible Still to do Cleaning
ensure that there are no NaN values, this can be done by either creating a custom value possibly using an average or alike. </br>
Or filling in a value such as n/a indicating that no value is available or provided.