# Twitter analysis for social unrest


In [16]:
import numpy as np
import pandas as pd

#Displaying data as dataframe
#pd.read_csv reads a comma-separated values (csv) file into dataframe.
df = pd.read_csv("data/protest.csv")

#Displaying certain columns from the dataframe to display - 
# df.loc accesses a group of rows and columns by label(s) from dataframe.
df = df.loc[0:,['text']]

#Output the dataframe 
df

Unnamed: 0,text
0,@dijdowell I had a very similar thought earlie...
1,Could have made her point without bringing up ...
2,"In Rage Over Sarah Everard Killing, ‘Women’s B..."
3,If you thought the right to protest was inalie...
4,@MrHarryCole @ColonelShotover Of course not th...
...,...
95,Inspiring to hear. It's about not allowing bul...
96,So why are the police having to protect the st...
97,@tomhiggo @LBC And you would trust any governm...
98,@melllsy @Nick71914256 @gavinessewing @metpoli...


### Preprocessing the tweets

#### Dropping duplicate tweets

In [17]:
# Removing duplicates from the dataframe
# pd.DataFrame.drop_duplicates - returns DataFrame with duplicate rows removed.
print('Dataframe size before removing duplicates', df.shape)
df.drop_duplicates(subset=['text'])
print('Dataframe size after removing duplicates', df.shape)

#Output the dataframe
df

Dataframe size before removing duplicates (100, 1)
Dataframe size after removing duplicates (100, 1)


Unnamed: 0,text
0,@dijdowell I had a very similar thought earlie...
1,Could have made her point without bringing up ...
2,"In Rage Over Sarah Everard Killing, ‘Women’s B..."
3,If you thought the right to protest was inalie...
4,@MrHarryCole @ColonelShotover Of course not th...
...,...
95,Inspiring to hear. It's about not allowing bul...
96,So why are the police having to protect the st...
97,@tomhiggo @LBC And you would trust any governm...
98,@melllsy @Nick71914256 @gavinessewing @metpoli...


#### Dropping empty tweets

In [18]:
# Removing empty tweets from the dataframe
# pd.DataFrame.dropna - removes missing tweets.
print('Dataframe size before removing empty tweets', df.shape)
df.dropna(subset = ['text'], inplace = True)
print('Dataframe size after removing empty tweets', df.shape)

#Output the dataframe
df

Dataframe size before removing empty tweets (100, 1)
Dataframe size after removing empty tweets (100, 1)


Unnamed: 0,text
0,@dijdowell I had a very similar thought earlie...
1,Could have made her point without bringing up ...
2,"In Rage Over Sarah Everard Killing, ‘Women’s B..."
3,If you thought the right to protest was inalie...
4,@MrHarryCole @ColonelShotover Of course not th...
...,...
95,Inspiring to hear. It's about not allowing bul...
96,So why are the police having to protect the st...
97,@tomhiggo @LBC And you would trust any governm...
98,@melllsy @Nick71914256 @gavinessewing @metpoli...


#### Extracting mentioned users from the tweets

In [19]:
# Remove all non english characters including emojis from tweets
def fetch_users_mentioned(tweet):
    words = tweet.split(' ')
    users = []
    processed_tweet = ''
    for word in words:
        if len(word):
            if (ord(word[0])) == 64:
                users.append(word[1:])
            else:
                processed_tweet += word
    
    return processed_tweet, users

for index, val in df.iterrows():
    df.at[index,'mentioned_users'] = fetch_users_mentioned(val['text'])[1]

#Output the dataframe
df

Unnamed: 0,text,mentioned_users
0,@dijdowell I had a very similar thought earlie...,dijdowell
1,Could have made her point without bringing up ...,[]
2,"In Rage Over Sarah Everard Killing, ‘Women’s B...",[]
3,If you thought the right to protest was inalie...,[]
4,@MrHarryCole @ColonelShotover Of course not th...,"[MrHarryCole, ColonelShotover]"
...,...,...
95,Inspiring to hear. It's about not allowing bul...,[]
96,So why are the police having to protect the st...,[]
97,@tomhiggo @LBC And you would trust any governm...,"[tomhiggo, LBC]"
98,@melllsy @Nick71914256 @gavinessewing @metpoli...,"[melllsy, Nick71914256, gavinessewing, metpoli..."


#### Dropping non-english words from tweets

In [20]:
# Remove all non english characters including emojis from tweets
def remove_nonenglish_char(tweet):
    words = tweet.split(' ')
    processed_tweet = ''
    for word in words:
        if len(word):
            if (ord(word[0]) >= 97 and ord(word[0]) <= 122) or ord(word[0]) >= 65 and ord(word[0]) <= 90:
                processed_tweet += f'{word} '
    
    return processed_tweet

for index, val in df.iterrows():
    df.at[index,'text'] = remove_nonenglish_char(val['text'])

#Output the dataframe
df

Unnamed: 0,text,mentioned_users
0,I had a very similar thought earlier. I've bee...,dijdowell
1,Could have made her point without bringing up ...,[]
2,"In Rage Over Sarah Everard Killing, Bargain’ I...",[]
3,If you thought the right to protest was inalie...,[]
4,Of course not this simple but optics kind of a...,"[MrHarryCole, ColonelShotover]"
...,...,...
95,Inspiring to hear. It's about not allowing bul...,[]
96,So why are the police having to protect the st...,[]
97,And you would trust any government not to decl...,"[tomhiggo, LBC]"
98,"Sure, you've been able to have a doorstep cand...","[melllsy, Nick71914256, gavinessewing, metpoli..."


#### Converting tweets to lowercase

In [21]:
#Converting tweet text to lowercase for tokenization
def convert_to_lowercase(tweet):
    return tweet.lower()

for index, val in df.iterrows():
    df.at[index,'text'] = convert_to_lowercase(val['text'])

#Output the dataframe
df

Unnamed: 0,text,mentioned_users
0,i had a very similar thought earlier. i've bee...,dijdowell
1,could have made her point without bringing up ...,[]
2,"in rage over sarah everard killing, bargain’ i...",[]
3,if you thought the right to protest was inalie...,[]
4,of course not this simple but optics kind of a...,"[MrHarryCole, ColonelShotover]"
...,...,...
95,inspiring to hear. it's about not allowing bul...,[]
96,so why are the police having to protect the st...,[]
97,and you would trust any government not to decl...,"[tomhiggo, LBC]"
98,"sure, you've been able to have a doorstep cand...","[melllsy, Nick71914256, gavinessewing, metpoli..."


#### Implement tokenizer