# Twitter analysis for social unrest


In [1]:
import numpy as np
import pandas as pd

# Load the labelled tweets from "tweets.csv" into a DataFrame, then keep only
# the 'text' and 'target' columns.
# NOTE: .loc slices by label and includes the end label, so ":100" keeps the
# rows labelled up to and including 100 (not the first 100 rows).
trainingData = pd.read_csv("tweets.csv").loc[:100, ['text', 'target']]

### Preprocessing the tweets

#### Dropping duplicate tweets

In [2]:
# Removing duplicates from the dataframe
def remove_duplicate_tweets(dataframe):
    """Drop rows whose 'text' repeats an earlier row, keeping the first one.

    The frame is modified in place and also returned, so the function works
    both when the result is re-assigned and when it is ignored.

    Args:
        dataframe: pandas DataFrame with a 'text' column.

    Returns:
        The same DataFrame object, with duplicate tweets removed.
    """
    # drop_duplicates returns a new frame unless inplace=True. The previous
    # code discarded that return value, so no row was ever removed.
    dataframe.drop_duplicates(subset=['text'], inplace=True)
    return dataframe

#### Dropping empty tweets

In [3]:
# Removing empty tweets from the dataframe
def remove_empty_tweets(dataframe):
    """Discard every row whose 'text' value is missing.

    Rows are removed from the given frame in place; the same frame is then
    returned for convenience.

    Args:
        dataframe: pandas DataFrame with a 'text' column.

    Returns:
        The same DataFrame object, without the rows that had no text.
    """
    required_columns = ['text']
    # Only a missing 'text' disqualifies a row; other columns are not checked.
    dataframe.dropna(axis='index', subset=required_columns, inplace=True)
    return dataframe

#### Converting tweets to lowercase

In [4]:
#Converting tweet text to lowercase for tokenization
def convert_to_lowercase(dataframe):
    """Lowercase the 'text' column of the frame.

    Args:
        dataframe: pandas DataFrame whose 'text' column holds strings.

    Returns:
        The same DataFrame object, with every tweet lowercased.

    Raises:
        AttributeError: if a 'text' value is not a string (e.g. NaN).
    """
    # Transform the whole column in one assignment instead of writing cells
    # back with .at while iterrows() is running: that was slow and, when
    # index labels repeat, overwrote every row sharing the label.
    dataframe['text'] = dataframe['text'].map(lambda text: text.lower())
    return dataframe

#### Removing hyperlinks

In [5]:
#Remove hyperlinks from the tweet text
def remove_hyperlinks(dataframe):
    """Remove every space-separated word containing 'http' from each tweet.

    Args:
        dataframe: pandas DataFrame whose 'text' column holds strings.

    Returns:
        The same DataFrame object, with hyperlink words removed from 'text'.

    Raises:
        AttributeError: if a 'text' value is not a string (e.g. NaN).
    """
    def strip_links(text):
        # join() puts separators only between kept words, so the result no
        # longer ends with the stray space the old "+=" loop appended.
        return ' '.join(word for word in text.split(' ') if 'http' not in word)

    # Transform the whole column in one assignment instead of writing cells
    # back with .at while iterrows() is running: that overwrote every row
    # sharing an index label when labels repeat.
    dataframe['text'] = dataframe['text'].map(strip_links)
    return dataframe

#### Removing hashtags

In [6]:
def remove_hashtags(dataframe):
    """Remove every space-separated word containing '#' from each tweet.

    Args:
        dataframe: pandas DataFrame whose 'text' column holds strings.

    Returns:
        The same DataFrame object, with hashtag words removed from 'text'.

    Raises:
        AttributeError: if a 'text' value is not a string (e.g. NaN).
    """
    def strip_hashtags(text):
        # join() puts separators only between kept words, so the result no
        # longer ends with the stray space the old "+=" loop appended.
        return ' '.join(word for word in text.split(' ') if '#' not in word)

    # Transform the whole column in one assignment instead of writing cells
    # back with .at while iterrows() is running: that overwrote every row
    # sharing an index label when labels repeat.
    dataframe['text'] = dataframe['text'].map(strip_hashtags)
    return dataframe

#### Tokenizing words

In [29]:
from nltk.tokenize import word_tokenize

def tokenize_words(dataframe):
    """Tokenize each tweet and keep only tokens longer than two characters.

    Each tweet is split with nltk's word_tokenize; the surviving tokens are
    joined back into a single space-separated string.

    Args:
        dataframe: pandas DataFrame whose 'text' column holds strings.

    Returns:
        The same DataFrame object, with 'text' replaced by the filtered tokens.
    """
    def keep_long_tokens(text):
        return ' '.join(token for token in word_tokenize(text) if len(token) > 2)

    # Transform the whole column in one assignment instead of writing cells
    # back with .at while iterrows() is running: that overwrote every row
    # sharing an index label when labels repeat.
    dataframe['text'] = dataframe['text'].map(keep_long_tokens)
    return dataframe

#### Removing stopwords

In [30]:
from nltk.corpus import stopwords

def remove_stopwords(dataframe):
    """Remove English stopwords (per nltk's stopword corpus) from each tweet.

    Tweets are split on single spaces; words found in the stopword list are
    dropped and the rest are joined back with single spaces.

    Args:
        dataframe: pandas DataFrame whose 'text' column holds strings.

    Returns:
        The same DataFrame object, with stopwords removed from 'text'.
    """
    # Build the lookup once as a set: membership tests are O(1) instead of
    # scanning the whole stopword list for every word of every tweet.
    stopword_set = frozenset(stopwords.words('english'))

    def strip_stopwords(text):
        return ' '.join(word for word in text.split(' ') if word not in stopword_set)

    # Transform the whole column in one assignment instead of writing cells
    # back with .at while iterrows() is running: that overwrote every row
    # sharing an index label when labels repeat.
    dataframe['text'] = dataframe['text'].map(strip_stopwords)
    return dataframe

#### Implementing stemming

In [31]:
from nltk.stem import PorterStemmer

def stemming(dataframe):
    """Replace every word of each tweet with its Porter stem.

    Tweets are split on single spaces, each word is passed through nltk's
    PorterStemmer, and the stems are joined back with single spaces.

    Args:
        dataframe: pandas DataFrame whose 'text' column holds strings.

    Returns:
        The same DataFrame object, with 'text' replaced by the stemmed words.
    """
    stemmer = PorterStemmer()

    def stem_words(text):
        return ' '.join(stemmer.stem(word) for word in text.split(' '))

    # Transform the whole column in one assignment instead of writing cells
    # back with .at while iterrows() is running: that overwrote every row
    # sharing an index label when labels repeat.
    dataframe['text'] = dataframe['text'].map(stem_words)
    return dataframe

### Processing training dataset

In [32]:
# Run the cleaning steps over the training tweets, in order.
trainingData = (
    trainingData
    .pipe(remove_duplicate_tweets)
    .pipe(remove_empty_tweets)
    .pipe(convert_to_lowercase)
    .pipe(remove_hyperlinks)
    .pipe(remove_hashtags)
)

# Renumber the rows from 0 after the removals, discarding the old index
# rather than keeping it as an extra column.
trainingData.reset_index(drop=True, inplace=True)
trainingData.head()

Unnamed: 0,text,target
0,"commun violenc bhainsa, telangana. ""stone pelt...",1
1,telangana: section 144 impo bhainsa januari 13...,1
2,arsonist set car ablaz dealership,1
3,arsonist set car ablaz dealership,1
4,"""lord jesus, love bring freedom pardon. fill h...",0


### Processing testing dataset

In [33]:
# Load the unlabelled tweets, keep only the 'text' column, and run the same
# cleaning steps as for the training data, in order.
testData = (
    pd.read_csv("data/protest.csv")
    .loc[:, ['text']]
    .pipe(remove_duplicate_tweets)
    .pipe(remove_empty_tweets)
    .pipe(convert_to_lowercase)
    .pipe(remove_hyperlinks)
    .pipe(remove_hashtags)
)

# Renumber the rows from 0 after the removals, discarding the old index
# rather than keeping it as an extra column.
testData.reset_index(drop=True, inplace=True)
testData.head()

Unnamed: 0,text
0,@0x49fa98 @fahadmalam are ancaps that protest ...
1,(now: counter-protesting non-violent protest a...
2,@philippinestar @paolosromero the pen is might...
3,modi ji it’s time you start being honest with ...
4,(now: counter-protesting non-violent protest a...
