# Twitter analysis for social unrest


In [68]:
import numpy as np
import pandas as pd

# Load the tweet export from CSV and keep only the columns used below.
# Note: .loc slices by index label and includes BOTH endpoints, so 0:10000
# selects labels 0 through 10000 inclusive.
df = pd.read_csv("data/dummydata.csv").loc[
    0:10000,
    ['date', 'user_name', 'text', 'likes', 'retweets', 'location_name']
]

# Display the resulting dataframe
df

Unnamed: 0,date,user_name,text,likes,retweets,location_name
0,2021-01-06,nick stripe,we can adjust by population to get crude exces...,255.0,108.0,
1,2021-01-06,femi,turning to labour or remainers like what would...,877.0,116.0,
2,2021-01-06,nick stripe,numbers of deaths are affected by population s...,213.0,75.0,
3,2021-01-06,,here is a list of governors who preside over s...,0.0,7612.0,
4,2021-01-06,,convince me with facts on death rates and infe...,2.0,0.0,
...,...,...,...,...,...,...
9996,2021-01-06,,i propose we swap all forms of travel with lau...,0.0,67.0,
9997,2021-01-06,,ive started my travel lets gooooo,0.0,0.0,
9998,2021-01-06,,who wants to drink the africa piss stain out o...,3.0,0.0,
9999,2021-01-06,,fill in the answer my favourite city is everyo...,3.0,0.0,


### Preprocessing the tweets

#### Dropping duplicate tweets

In [69]:
# Remove tweets whose text duplicates an earlier tweet (the first occurrence is kept).
# DataFrame.drop_duplicates returns a NEW dataframe and leaves the original
# untouched, so the result has to be assigned back to df; calling it without
# using the return value removes nothing.
print('Dataframe size before removing duplicates', df.shape)
df = df.drop_duplicates(subset=['text'])
print('Dataframe size after removing duplicates', df.shape)

#Output the dataframe
df

Dataframe size before removing duplicates (10001, 6)
Dataframe size after removing duplicates (10001, 6)


Unnamed: 0,date,user_name,text,likes,retweets,location_name
0,2021-01-06,nick stripe,we can adjust by population to get crude exces...,255.0,108.0,
1,2021-01-06,femi,turning to labour or remainers like what would...,877.0,116.0,
2,2021-01-06,nick stripe,numbers of deaths are affected by population s...,213.0,75.0,
3,2021-01-06,,here is a list of governors who preside over s...,0.0,7612.0,
4,2021-01-06,,convince me with facts on death rates and infe...,2.0,0.0,
...,...,...,...,...,...,...
9996,2021-01-06,,i propose we swap all forms of travel with lau...,0.0,67.0,
9997,2021-01-06,,ive started my travel lets gooooo,0.0,0.0,
9998,2021-01-06,,who wants to drink the africa piss stain out o...,3.0,0.0,
9999,2021-01-06,,fill in the answer my favourite city is everyo...,3.0,0.0,


#### Dropping tweets with missing text, user name or location

In [70]:
# Discard incomplete rows: DataFrame.dropna removes every row that has a
# missing value in ANY of 'text', 'location_name' or 'user_name' -- not only
# rows whose tweet text is missing.
# NOTE(review): in the recorded run this step reduced the frame from 10001
# rows to 2; confirm that requiring location_name and user_name is intended.
print('Dataframe size before removing empty tweets', df.shape)
df = df.dropna(subset=['text', 'location_name', 'user_name'])
print('Dataframe size after removing empty tweets', df.shape)

# Display the remaining rows
df

Dataframe size before removing empty tweets (10001, 6)
Dataframe size after removing empty tweets (2, 6)


Unnamed: 0,date,user_name,text,likes,retweets,location_name
118,2021-01-06,heerak christian kim for us congress virginia8,breaking news many americans left new york and...,1.0,1.0,"Arlington, VA"
6910,2021-01-06,mike valerio,from dc attorney general in the midst of this ...,183.0,92.0,"Washington, DC"


#### Dropping non-English words from tweets

In [72]:
# Keep only the words of a tweet that start with an ASCII letter
def remove_nonenglish_char(tweet):
    """Return `tweet` without the words that do not start with an ASCII letter.

    The tweet is split on single space characters. A word is kept when its
    first character is in a-z or A-Z; only that first character is inspected,
    so any later characters of a kept word are passed through unchanged.
    Empty fragments produced by consecutive spaces are skipped.

    Each kept word is followed by one space, so a non-empty result always
    ends with a trailing space. The result is '' when no word is kept.
    """
    kept_words = [
        f'{token} '
        for token in tweet.split(' ')
        if token and ('a' <= token[0] <= 'z' or 'A' <= token[0] <= 'Z')
    ]
    return ''.join(kept_words)

# Clean the text of every tweet in one column-wise operation.
# Series.apply calls remove_nonenglish_char once per value and returns a new
# Series aligned on the same index, which then replaces the 'text' column.
# This replaces a row-by-row iterrows() loop that wrote back through df.at:
# the result is the same for a unique index, but it avoids building a Series
# per row and does not rely on index labels being unique.
df['text'] = df['text'].apply(remove_nonenglish_char)

#Output the dataframe
df

Unnamed: 0,date,user_name,text,likes,retweets,location_name
118,2021-01-06,heerak christian kim for us congress virginia8,breaking news many americans left new york and...,1.0,1.0,"Arlington, VA"
6910,2021-01-06,mike valerio,from dc attorney general in the midst of this ...,183.0,92.0,"Washington, DC"
