# Twitter analysis for social unrest


In [72]:
import numpy as np
import pandas as pd

# Load the raw tweets and keep only the part used in this analysis.
# pd.read_csv parses the comma-separated file into a DataFrame; .loc then
# selects by label. Label slices are end-inclusive, so 0:100 keeps row
# labels 0 through 100, restricted to the three columns listed.
df = pd.read_csv("tweets.csv").loc[0:100, ['text', 'location', 'target']]

# Display the dataframe
df

Unnamed: 0,text,location,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",,1
1,Telangana: Section 144 has been imposed in Bha...,,1
2,Arsonist sets cars ablaze at dealership https:...,New York City,1
3,Arsonist sets cars ablaze at dealership https:...,"Morgantown, WV",1
4,"""Lord Jesus, your love brings freedom and pard...",,0
...,...,...,...
96,nah I need calm down ðŸ’€ https://t.co/eOKphShN5R,DEPRESSED VIBES,0
97,thinking about how some1 spat blood at me duri...,,0
98,"If two quakes have about the same magnitude, y...",,0
99,An M4+~M5+ aftershock happens within 15 days a...,,1


### Preprocessing the tweets

#### Dropping duplicate tweets

In [73]:
# Remove duplicate tweets, keeping the first occurrence of each distinct text.
# DataFrame.drop_duplicates returns a new DataFrame and leaves the frame it is
# called on untouched, so the result must be assigned back to take effect.
print('Dataframe size before removing duplicates', df.shape)
df = df.drop_duplicates(subset=['text'])
print('Dataframe size after removing duplicates', df.shape)

# Display the dataframe
df

Dataframe size before removing duplicates (101, 3)
Dataframe size after removing duplicates (101, 3)


Unnamed: 0,text,location,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",,1
1,Telangana: Section 144 has been imposed in Bha...,,1
2,Arsonist sets cars ablaze at dealership https:...,New York City,1
3,Arsonist sets cars ablaze at dealership https:...,"Morgantown, WV",1
4,"""Lord Jesus, your love brings freedom and pard...",,0
...,...,...,...
96,nah I need calm down ðŸ’€ https://t.co/eOKphShN5R,DEPRESSED VIBES,0
97,thinking about how some1 spat blood at me duri...,,0
98,"If two quakes have about the same magnitude, y...",,0
99,An M4+~M5+ aftershock happens within 15 days a...,,1


#### Dropping empty tweets

In [74]:
# Remove tweets whose text is missing (NaN).
# DataFrame.dropna returns a new frame; assigning it back (rather than using
# inplace=True) keeps the cell safe to re-run and matches how the location
# column is cleaned later in the notebook.
print('Dataframe size before removing empty tweets', df.shape)
df = df.dropna(subset=['text'])
print('Dataframe size after removing empty tweets', df.shape)

# Display the dataframe
df

Dataframe size before removing empty tweets (101, 3)
Dataframe size after removing empty tweets (101, 3)


Unnamed: 0,text,location,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",,1
1,Telangana: Section 144 has been imposed in Bha...,,1
2,Arsonist sets cars ablaze at dealership https:...,New York City,1
3,Arsonist sets cars ablaze at dealership https:...,"Morgantown, WV",1
4,"""Lord Jesus, your love brings freedom and pard...",,0
...,...,...,...
96,nah I need calm down ðŸ’€ https://t.co/eOKphShN5R,DEPRESSED VIBES,0
97,thinking about how some1 spat blood at me duri...,,0
98,"If two quakes have about the same magnitude, y...",,0
99,An M4+~M5+ aftershock happens within 15 days a...,,1


#### Dropping non-English words from tweets

In [75]:
# Drop every word that does not start with an English (ASCII) letter,
# which removes emojis, numbers and words that open with punctuation.
def remove_nonenglish_char(tweet):
    """Keep only the space-separated words of ``tweet`` that begin with an ASCII letter.

    The tweet is split on single spaces. Empty pieces (produced by consecutive
    spaces) and words whose first character is outside A-Z / a-z are dropped
    whole. Only the first character is inspected; the remainder of a kept word
    is left exactly as it was.

    Returns:
        The kept words, each followed by one space. A non-empty result
        therefore ends with a trailing space, and a tweet with no kept
        words yields ''.
    """
    kept = [
        word
        for word in tweet.split(' ')
        if word and ('a' <= word[0] <= 'z' or 'A' <= word[0] <= 'Z')
    ]
    return ''.join(f'{word} ' for word in kept)

# Clean every tweet in one pass: Series.map calls the helper once per value
# and the result replaces the 'text' column, instead of writing cells one at
# a time through iterrows() and .at.
df['text'] = df['text'].map(remove_nonenglish_char)

# Display the dataframe
df

Unnamed: 0,text,location,target
0,"Communal violence in Bhainsa, Telangana. were ...",,1
1,Telangana: Section has been imposed in Bhainsa...,,1
2,Arsonist sets cars ablaze at dealership https:...,New York City,1
3,Arsonist sets cars ablaze at dealership https:...,"Morgantown, WV",1
4,"Jesus, your love brings freedom and pardon. Fi...",,0
...,...,...,...
96,nah I need calm down https://t.co/eOKphShN5R,DEPRESSED VIBES,0
97,thinking about how some1 spat blood at me duri...,,0
98,"If two quakes have about the same magnitude, y...",,0
99,An M4+~M5+ aftershock happens within days afte...,,1


#### Converting tweets to lowercase

In [76]:
# Lowercase the tweet text ahead of tokenization
def convert_to_lowercase(tweet):
    """Return ``tweet`` with every cased character converted to lowercase."""
    lowered = tweet.lower()
    return lowered

# Lowercase every tweet in one pass: Series.map calls the helper once per
# value, instead of writing cells one at a time through iterrows() and .at.
df['text'] = df['text'].map(convert_to_lowercase)

# Display the dataframe
df

Unnamed: 0,text,location,target
0,"communal violence in bhainsa, telangana. were ...",,1
1,telangana: section has been imposed in bhainsa...,,1
2,arsonist sets cars ablaze at dealership https:...,New York City,1
3,arsonist sets cars ablaze at dealership https:...,"Morgantown, WV",1
4,"jesus, your love brings freedom and pardon. fi...",,0
...,...,...,...
96,nah i need calm down https://t.co/eokphshn5r,DEPRESSED VIBES,0
97,thinking about how some1 spat blood at me duri...,,0
98,"if two quakes have about the same magnitude, y...",,0
99,an m4+~m5+ aftershock happens within days afte...,,1


#### Remove hyperlinks

In [77]:
# Remove hyperlinks from the tweet text
def remove_hyperlinks(tweets):
    """Return a list with link tokens removed from each tweet in ``tweets``.

    A tweet that does not contain the substring 'http' is passed through
    unchanged. Otherwise it is split on single spaces, every piece containing
    'http' is discarded, and the remaining pieces are re-joined with one space
    appended after each (so the rebuilt tweet ends with a trailing space).

    Args:
        tweets: an iterable of tweet strings.

    Returns:
        A list of processed tweets, in the same order as the input.
    """
    def strip_links(tweet):
        # Tweets without any link are returned exactly as received.
        if 'http' not in tweet:
            return tweet
        return ''.join(
            f'{word} ' for word in tweet.split(' ') if 'http' not in word
        )

    return [strip_links(tweet) for tweet in tweets]


# Strip link tokens from every tweet; `result` is a plain list in row order
# that then replaces the 'text' column.
result = remove_hyperlinks(df['text'].tolist())
df['text'] = result

# Display the dataframe
df

Unnamed: 0,text,location,target
0,"communal violence in bhainsa, telangana. were ...",,1
1,telangana: section has been imposed in bhainsa...,,1
2,arsonist sets cars ablaze at dealership,New York City,1
3,arsonist sets cars ablaze at dealership,"Morgantown, WV",1
4,"jesus, your love brings freedom and pardon. fi...",,0
...,...,...,...
96,nah i need calm down,DEPRESSED VIBES,0
97,thinking about how some1 spat blood at me duri...,,0
98,"if two quakes have about the same magnitude, y...",,0
99,an m4+~m5+ aftershock happens within days afte...,,1


#### Remove tweets with missing (NaN) locations

In [78]:
# Drop tweets whose location is missing (NaN), then renumber the rows 0..n-1.
# reset_index returns a new frame, so it is chained into the assignment;
# calling it on its own line would only display the renumbered frame and
# leave df with the old, gapped index.
df = df.dropna(subset=["location"]).reset_index(drop=True)

# Display the dataframe
df

Unnamed: 0,text,location,target
0,arsonist sets cars ablaze at dealership,New York City,1
1,arsonist sets cars ablaze at dealership,"Morgantown, WV",1
2,"if this child was chinese, this tweet would ha...",OC,0
3,several houses have been set ablaze in ngemsib...,"London, England",1
4,asansol: a bjp office in salanpur village was ...,Bharat,1
...,...,...,...
62,darkake(7 yeek possessor) was killed by posses...,"Planet Eyal, Shandral System",1
63,"stay informed, stay engaged with whats going o...","Brooklyn, NY",0
64,"aftershock vapor pro review aussie, aussie, au...","Melbourne, Victoria",0
65,this is an unfathomable number. i lived throug...,Piscataway land // DC,0
