# Create training dataset

In [994]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from statistics import multimode
from cmath import nan
pd.set_option("display.max_rows", None, "display.max_columns", None)



# Functions

In [995]:
def split_string(string):
    """Parse a stringified list such as "['a', 'b']" into a list of strings.

    Parameters
    ----------
    string : str, list or NaN
        Cell value to parse. A list is returned unchanged so that the
        conversion cell can be re-run safely; NaN/None marks a missing value.

    Returns
    -------
    list of str, or NaN
        The parsed items, an empty list for "[]", or NaN for a missing value.
    """
    # Already parsed (for example the conversion cell was executed twice).
    if isinstance(string, list):
        return string
    # NaN is the only value that is not equal to itself.
    if string is None or string != string:
        return float("nan")

    inner = string.replace("'", "").strip('][')
    # "[]" used to come back as [''], which polluted the keyword modes.
    if not inner:
        return []
    return inner.split(', ')

In [996]:
def get_subregion(dist1, dist2, place):
    """Return the rows of ``dist2`` whose geometry lies within a ``dist1`` region.

    Parameters
    ----------
    dist1 : frame with 'Place0' and 'geometry' columns
        The first row whose 'Place0' equals ``place`` supplies the shape.
    dist2 : frame with a 'geometry' column supporting ``.within(shape)``
        Candidate sub-regions (presumably a GeoDataFrame - confirm with caller).
    place : value compared against ``dist1['Place0']``.

    Returns
    -------
    A copy of the matching ``dist2`` rows with an extra all-True 'res' column
    (kept for backward compatibility). ``dist2`` itself is left untouched.

    Raises
    ------
    IndexError
        If no row of ``dist1`` matches ``place``.
    """
    shapes = dist1.loc[dist1['Place0'] == place, 'geometry'].to_list()
    if not shapes:
        raise IndexError(f"no row in dist1 has Place0 == {place!r}")
    shape = shapes[0]

    # A plain list of booleans selects positionally, independent of the index.
    inside = dist2['geometry'].within(shape).to_list()

    # Work on a copy: the original wrote the 'res' column onto the caller's frame.
    subregion = dist2.loc[inside].copy()
    subregion['res'] = True
    return subregion

### tweets

In [997]:
def plot_Tweets(dist, tweets, n):
    """Draw a choropleth of tweet counts per region.

    For every row of ``dist`` the tweets whose 'Place<n>' value matches the
    row's 'Place<n>' value are counted. The counts are stored on ``dist`` in a
    'Tweets' column (the caller's frame is modified) and the result of
    ``dist.explore(...)`` is returned.
    """
    place_col = 'Place' + str(n)

    # One count per region, in row order.
    dist['Tweets'] = [
        len(tweets.loc[tweets[place_col] == region])
        for region in dist[place_col]
    ]

    return dist.explore(
        column="Tweets",                 # colour regions by tweet count
        tooltip=[place_col, 'Tweets'],   # values shown on hover
        popup=True,                      # show all values on click
        tiles="CartoDB positron",        # background tiles
        cmap="Paired",                   # https://matplotlib.org/stable/tutorials/colors/colormaps.html
        style_kwds=dict(color="black"),  # black outline
    )

In [998]:
def tweet_rate_av2(dates, tweets):
    """Plot the mean daily tweet count around a set of reference dates.

    For each date the tweets of the ten days from ``date - 5`` up to and
    including ``date + 4`` are counted; the counts are averaged over all dates
    and plotted together with their overall mean and a marker on the reference
    day.

    Parameters
    ----------
    dates : sequence of datetime-like values (protest or non-protest days).
    tweets : frame with a 'Date' column comparable to those values.

    Nothing is plotted when ``dates`` is empty (there is nothing to average).
    """
    half_window = 5
    window = 2 * half_window

    if len(dates) == 0:
        return

    totals = [0] * window
    for centre_day in dates:
        first_day = centre_day - timedelta(days=half_window)
        # Separate index name: the original reused `i` for both loops.
        for offset in range(window):
            day = first_day + timedelta(days=offset)
            totals[offset] += len(tweets.loc[tweets['Date'] == day])

    # Average over the reference dates.
    tweet_day = [total / len(dates) for total in totals]

    # Flat line at the mean of the averaged window.
    average1 = [sum(tweet_day) / len(tweet_day)] * window

    # The x axis is just the slot number 1..10.
    date = list(range(1, window + 1))

    # Slot `offset` is drawn at x = offset + 1 and the reference day has
    # offset `half_window`, so it sits at x = 6 (the marker used to be at 5,
    # i.e. on the day before the event).
    centre_x = half_window + 1

    plt.plot(date, tweet_day, "-r", label="Tweets")
    plt.plot(date, average1, "-b", label="Average Over All Set Dates")
    plt.axvline(x=centre_x, color='y', label='Centre (location of protest/non-protests)')
    plt.legend(loc="upper left")
    plt.ylabel('Number of Tweets')
    plt.xlabel('Date')
    plt.show()

In [999]:
def tweet_rate2(dates, tweets):
    """Plot one normalised daily tweet curve per reference date.

    For each date the tweets of the ten days from ``date - 5`` up to and
    including ``date + 4`` are counted and divided by the largest daily count
    of that window, then drawn against the slot number 0..9.

    Parameters
    ----------
    dates : iterable of datetime-like values.
    tweets : frame with a 'Date' column comparable to those values.

    A window without any tweets is drawn as a flat zero line instead of
    raising ZeroDivisionError.
    """
    half_window = 5
    offsets = list(range(2 * half_window))

    for centre_day in dates:
        first_day = centre_day - timedelta(days=half_window)
        counts = [
            len(tweets.loc[tweets['Date'] == first_day + timedelta(days=offset)])
            for offset in offsets
        ]

        # Normalise by the busiest day; guard the all-zero window.
        peak = max(counts)
        if peak:
            tweet_day = [count / peak for count in counts]
        else:
            tweet_day = [0.0] * len(counts)

        plt.plot(offsets, tweet_day)

    # Axis labels do not depend on the date, so set them once.
    plt.ylabel('Number of Tweets')
    plt.xlabel('Date')

### Dates

In [1032]:
def get_dates(protests, tweets):
    """Split the period covered by the tweets into protest and quiet days.

    Parameters
    ----------
    protests : frame with a 'Date' column (one row per recorded protest).
    tweets : frame with a 'Date' column.

    Returns
    -------
    (yes, no) : tuple of two lists
        ``yes`` holds every day on which at least one protest is recorded,
        ``no`` every other day. Days run from the earliest tweet date up to,
        but not including, the latest tweet date (unchanged from the original
        half-open range).

    The earliest/latest dates are taken with min/max, so the result no longer
    depends on the tweets being sorted newest-first; with that ordering the
    output is identical to the previous implementation.
    """
    tweet_dates = tweets['Date'].dropna()
    if tweet_dates.empty:
        return ([], [])

    start = tweet_dates.min()
    end = tweet_dates.max()

    # Set membership replaces one full scan of `protests` per calendar day.
    protest_days = set(protests['Date'])

    yes = []
    no = []
    for offset in range((end - start).days):
        day = start + timedelta(days=offset)
        if day in protest_days:
            yes.append(day)
        else:
            no.append(day)
    return (yes, no)

### Training set

In [1033]:
def get_start(protests, tweets):
    """Build the balanced, shuffled label table for the training set.

    Parameters
    ----------
    protests, tweets : frames passed straight through to ``get_dates``.

    Returns
    -------
    DataFrame with columns 'Protest' (1 = protest day, 0 = quiet day) and
    'Date', a fresh 0..n-1 index and rows in a fixed pseudo-random order.

    Quiet days are down-sampled to the number of protest days. When there are
    fewer quiet days than protest days all quiet days are kept (the original
    raised ValueError from ``DataFrame.sample`` in that case).
    """
    protest_days, quiet_days = get_dates(protests, tweets)

    positives = pd.DataFrame({'Protest': [1] * len(protest_days), 'Date': protest_days})
    negatives = pd.DataFrame({'Protest': [0] * len(quiet_days), 'Date': quiet_days})

    # Equal number of protest and non-protest rows, capped by what is available.
    n_negatives = min(len(positives), len(negatives))
    negatives = negatives.sample(n_negatives, random_state=1)

    # Join, shuffle the rows and rebuild the index.
    data = pd.concat([positives, negatives])
    data = data.sample(frac=1, random_state=2)
    return data.reset_index(drop=True)

In [1034]:
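# Return values of the helpers defined below:
# get_tweets    -> (tweet_day, average, specifiedTweets, length)
# tweet_metrics -> (likes, followers, retweets, replies, Subjectivity, Polarity, <ten keyword-mode lists>)
# get_centre    -> place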

In [1035]:
def get_tweets(date, tweets, half_window=5):
    """Collect the tweet activity around one reference date.

    Parameters
    ----------
    date : datetime-like reference day.
    tweets : frame with a 'Date' column comparable to ``date``.
    half_window : int, default 5
        Number of days on each side of ``date``; must be at least 1. The
        default reproduces the original ten-day window.

    Returns
    -------
    (tweet_day, average, specifiedTweets, length)
        tweet_day : list of ``2 * half_window`` daily counts for the days
            ``date - half_window`` .. ``date + half_window - 1``, divided by
            the largest count (all zeros when the window holds no tweets).
        average : mean of the raw (un-normalised) daily counts.
        specifiedTweets : tweets with ``d1 < Date < d2`` (both bounds
            exclusive), index reset.
        length : number of rows in ``specifiedTweets``.

    NOTE(review): the daily counts include the day ``d1`` while the strict
    ``>`` filter excludes it, so ``length`` can be smaller than the sum of the
    counts. Left unchanged because the intended window is unclear - confirm.
    """
    window = 2 * half_window
    d1 = date - timedelta(days=half_window)
    d2 = date + timedelta(days=half_window)

    counts = [
        len(tweets.loc[tweets['Date'] == d1 + timedelta(days=offset)])
        for offset in range(window)
    ]

    # Average of the raw counts, taken before normalising.
    average = sum(counts) / len(counts)

    # Normalise by the busiest day; a silent window used to divide by zero.
    peak = max(counts)
    if peak:
        tweet_day = [count / peak for count in counts]
    else:
        tweet_day = [0.0] * window

    # Tweets strictly between the two window bounds.
    in_window = (tweets['Date'] > d1) & (tweets['Date'] < d2)
    specifiedTweets = tweets[in_window].reset_index(drop=True)
    length = len(specifiedTweets)

    return (tweet_day, average, specifiedTweets, length)

In [1036]:
def concat_lists_to_list(column_name, df):
    """Flatten the list-valued cells of one column into a single list.

    Cells that are not equal to themselves (NaN, i.e. missing values) are
    skipped; the items of every other cell are appended in row order.
    """
    return [
        item
        for cell in df[column_name]
        if cell == cell  # False only for NaN
        for item in cell
    ]

In [1037]:
def tweet_metrics(tweets):
    """Summarise a set of tweets.

    Returns a 16-tuple: the per-tweet averages (column sum divided by the
    number of rows) of likes, followers, retweets, replies, Subjectivity and
    Polarity, followed by the most common value(s) - as returned by
    ``statistics.multimode`` - of each keyword column, in the order
    grievances, triggers, tactics, actors, locations, weapons, eventualities,
    curiosities, non_protests, universities.
    """
    n_tweets = len(tweets)

    def per_tweet(column):
        # Sum over all rows divided by the row count.
        return (tweets[column].sum()) / n_tweets

    followers = per_tweet('author_followers')
    retweets = per_tweet('retweets')
    replies = per_tweet('replies')
    likes = per_tweet('likes')
    Subjectivity = per_tweet('Subjectivity')
    Polarity = per_tweet('Polarity')

    keyword_columns = (
        'grievances', 'triggers', 'tactics', 'actors', 'locations',
        'weapons', 'eventualities', 'curiosities', 'non_protests',
        'universities',
    )
    # One list of modes per keyword column, in the order above.
    keyword_modes = tuple(
        multimode(concat_lists_to_list(column, tweets))
        for column in keyword_columns
    )

    return (likes, followers, retweets, replies, Subjectivity, Polarity) + keyword_modes

In [1038]:
def get_centre(tweets):
    """Return the most frequent 'Place1' value among the given tweets.

    Parameters
    ----------
    tweets : frame with a 'Place1' column.

    Returns
    -------
    The 'Place1' value with the highest count, or NaN when there are no
    tweets (or no non-missing places) to count. The original raised
    ValueError from ``idxmax`` in that case.
    """
    place_counts = tweets['Place1'].value_counts()
    if place_counts.empty:
        return float("nan")
    return place_counts.idxmax()
    

# Training Data

In [1039]:
def get_training(data, tweets):
    """Attach tweet-derived features to every labelled date.

    Parameters
    ----------
    data : frame with a 'Date' column plus the label column(s), one row per
        training example (as produced by ``get_start``).
    tweets : tweet frame handed to ``get_tweets``.

    Returns
    -------
    DataFrame with the columns of ``data`` except 'Date', followed by
    NuTweets, d1..d10, average, likes, followers, retweets, replies, sub,
    pol, place and the ten keyword-mode columns. It keeps the index of
    ``data``.

    The feature rows are built in row order and given ``data``'s own index.
    The original created them with a fresh 0..n-1 index and then aligned by
    label, which produced NaN rows whenever ``data`` was not indexed 0..n-1.
    """
    day_columns = ['d%d' % slot for slot in range(1, 11)]
    average_columns = ['likes', 'followers', 'retweets', 'replies', 'sub', 'pol']
    keyword_columns = ['grievances', 'triggers', 'tactics', 'actors', 'locations',
                       'weapons', 'eventualities', 'curiosities', 'non_protests',
                       'universities']
    columns = (['NuTweets'] + day_columns + ['average'] + average_columns
               + ['place'] + keyword_columns)

    records = []
    for _, row in data.iterrows():
        tweet_day, average, window_tweets, n_tweets = get_tweets(row['Date'], tweets)
        summary = tweet_metrics(window_tweets)
        place = get_centre(window_tweets)

        record = {'NuTweets': n_tweets, 'average': average, 'place': place}
        # Exactly ten normalised daily values are expected from get_tweets.
        for slot, name in enumerate(day_columns):
            record[name] = tweet_day[slot]
        # tweet_metrics returns the six averages first, then the ten modes.
        for offset, name in enumerate(average_columns):
            record[name] = summary[offset]
        for offset, name in enumerate(keyword_columns):
            record[name] = summary[len(average_columns) + offset]
        records.append(record)

    metrics = pd.DataFrame(records, index=data.index, columns=columns)

    training = pd.concat([data, metrics], axis=1)
    training = training.drop(['Date'], axis=1)
    return training

# Models

In [1040]:
def logistic_regression(training):
    """Fit a logistic-regression classifier on the numeric training columns.

    The label is 'Protest'; the label, 'place' and the ten keyword-mode
    columns are removed from the features. 30% of the rows are held out with
    a fixed seed.

    Returns
    -------
    (model, score) : the fitted ``LogisticRegression`` and its ``score`` on
    the held-out rows.
    """
    non_feature_columns = ['Protest', 'place', 'grievances', 'triggers', 'tactics',
                           'actors', 'locations', 'weapons', 'eventualities',
                           'curiosities', 'non_protests', 'universities']
    features = training.drop(non_feature_columns, axis=1)
    labels = training['Protest']

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=7
    )

    # Train on the larger split, evaluate on the held-out rows.
    logreg = LogisticRegression(solver='lbfgs')
    logreg.fit(X_train, y_train)
    return (logreg, logreg.score(X_test, y_test))

In [1041]:
def niave(training, test_size=0.2, random_state=7):
    """Fit a Gaussian Naive Bayes classifier on the numeric training columns.

    Parameters
    ----------
    training : frame with a 'Protest' label column, 'place', the ten
        keyword-mode columns and the numeric feature columns.
    test_size : float, default 0.2
        Share of rows held out for scoring.
    random_state : int or None, default 7
        Seed for the train/test split so the reported score is reproducible
        (same seed as ``logistic_regression``). Pass None for the previous
        unseeded behaviour.

    Returns
    -------
    (model, score) : the fitted ``GaussianNB`` and its ``score`` on the
    held-out rows.
    """
    non_feature_columns = ['Protest', 'place', 'grievances', 'triggers', 'tactics',
                           'actors', 'locations', 'weapons', 'eventualities',
                           'curiosities', 'non_protests', 'universities']
    X_train, X_test, y_train, y_test = train_test_split(
        training.drop(non_feature_columns, axis=1),
        training['Protest'],
        test_size=test_size,
        random_state=random_state,
    )
    naive = GaussianNB()
    naive.fit(X_train, y_train)
    score = naive.score(X_test, y_test)
    return (naive, score)

In [1042]:
def linearSVM(training, test_size=0.2, random_state=7):
    """Fit a linear support-vector classifier on the numeric training columns.

    Parameters
    ----------
    training : frame with a 'Protest' label column, 'place', the ten
        keyword-mode columns and the numeric feature columns.
    test_size : float, default 0.2
        Share of rows held out for scoring.
    random_state : int or None, default 7
        Seed for both the train/test split and ``LinearSVC`` so the reported
        score is reproducible (same seed as ``logistic_regression``). Pass
        None for the previous unseeded behaviour.

    Returns
    -------
    (model, score) : the fitted ``LinearSVC`` and its ``score`` on the
    held-out rows.
    """
    # LinearSVC comes from the notebook's import cell; the duplicate
    # function-level import was removed.
    non_feature_columns = ['Protest', 'place', 'grievances', 'triggers', 'tactics',
                           'actors', 'locations', 'weapons', 'eventualities',
                           'curiosities', 'non_protests', 'universities']
    X_train, X_test, y_train, y_test = train_test_split(
        training.drop(non_feature_columns, axis=1),
        training['Protest'],
        test_size=test_size,
        random_state=random_state,
    )
    svm = LinearSVC(random_state=random_state)
    svm.fit(X_train, y_train)
    score = svm.score(X_test, y_test)
    return (svm, score)

# MAIN

### Loading tweets and protests

In [1043]:
# Read the tweet table from a CSV file; the path is relative to the notebook's working directory.
tweets = pd.read_csv('DATA/tweets.csv')

In [1044]:
# Run every keyword column through split_string so that each cell becomes a
# list of strings (missing cells stay NaN).
for keyword_column in ['grievances', 'triggers', 'tactics', 'actors', 'locations',
                       'weapons', 'eventualities', 'curiosities', 'non_protests',
                       'universities']:
    tweets[keyword_column] = tweets[keyword_column].apply(split_string)

In [1045]:

# Turn the WKT strings into geometries, parse the dates and drop the columns
# that are not used further on.
tweets = tweets.assign(
    geometry=gpd.GeoSeries.from_wkt(tweets['geometry']),
    Date=pd.to_datetime(tweets['Date'], format='%Y %m %d'),
).drop(columns=['Unnamed: 0', 'Analysis'])

In [1046]:
# Read the protest table, convert geometry and dates like for the tweets, drop
# the saved index column and keep only the first protest per (Date, Place0).
protests = pd.read_csv('DATA/protests.csv')
protests = (
    protests
    .assign(
        geometry=gpd.GeoSeries.from_wkt(protests['geometry']),
        Date=pd.to_datetime(protests['Date'], format='%Y %m %d'),
    )
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset=['Date', 'Place0'], keep='first')
)

In [1047]:
# Show the first tweet row to check the parsed columns.
tweets.head(1)

Unnamed: 0,text,Date,geometry,author_followers,retweets,replies,likes,quote_count,Subjectivity,Polarity,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests,universities,Place0,Place1,Place2,Place3
0,#KZN We are on thee way! Hitting up Richards B...,2017-12-31,POINT (28.23140 -25.75450),437807,0,1,21,0,0.066667,0.0,,,[Gathering],[Political Party],,,,,,[University of Pretoria],Gauteng,City of Tshwane,City of Tshwane,56


In [1048]:
# Show the first protest row to check the parsed columns.
protests.head(1)

Unnamed: 0,Date,notes,geometry,Place0,Place1,Place2,Place3
0,2017-10-25,A few hundred Fees Must Fall protesters attemp...,POINT (18.47640 -33.96330),Western Cape,City of Cape Town,City of Cape Town,58


# Let's test protests in a chosen place

In [1049]:
# Distinct Place0 values found in the tweets.
# NOTE(review): the value '0' is presumably a placeholder for tweets without a matched region - confirm.
places = tweets['Place0'].unique()
print(places)

['Gauteng' 'KwaZulu-Natal' 'Western Cape' 'Free State' 'Eastern Cape'
 'Mpumalanga' 'North West' 'Limpopo' 'Nothern Cape' '0']


In [1050]:
# Distinct Place0 values found in the protests, for comparison with the tweet places above.
print(protests['Place0'].unique())

['Western Cape' 'KwaZulu-Natal' 'Gauteng' 'Eastern Cape' 'Free State'
 'North West' 'Limpopo']


In [1051]:
# Place0 values used by the optional filter in the next cell.
place = 'Gauteng'
place2 = 'KwaZulu-Natal'

In [1052]:
# Use the two lines below if you want protests in all of South Africa
# protests = protests.loc[protests['Place0'].notna()]
# tweets = tweets.loc[tweets['Place0'].notna()]

# Use the two lines below if you only want the selected provinces (`place` and `place2`)
# protests = protests.loc[(protests['Place0'] == place) | (protests['Place0'] == place2)]
# tweets = tweets.loc[(tweets['Place0'] == place) | (tweets['Place0'] == place2)]

In [None]:
# Build the balanced label table, then attach the tweet features to every date.
# Expected run time for 20k tweets: seconds (NOTE(review): no exact figure was recorded).
data = get_start(protests,tweets)
training = get_training(data,tweets)
training.head(5)

count =  730
66


Unnamed: 0,Protest,NuTweets,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,average,likes,followers,retweets,replies,sub,pol,place,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests,universities
0,1,182,0.48,0.44,0.26,0.18,0.18,1.0,0.56,0.34,0.28,0.4,20.6,1.06044,38626.78022,1.807692,0.313187,0.357232,0.065136,City of Tshwane,[Education],[Court hearing],[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[Police attack],[Special Keywords],"[Election campaigns, Exhibitions]",[University of Pretoria]
1,1,171,0.25,1.0,0.71875,0.40625,0.59375,0.34375,0.65625,0.53125,0.75,0.34375,17.9,0.754386,75824.216374,1.017544,0.298246,0.431467,0.092549,City of Tshwane,[Education],[Working conditions],[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[Police attack],[Special Keywords],[Election campaigns],[University of Pretoria]
2,0,147,0.48,0.44,0.96,1.0,0.52,0.6,0.56,0.44,0.52,0.84,15.9,0.843537,57772.62585,1.435374,0.292517,0.336645,-0.020014,City of Tshwane,[Mining],[Court hearing],[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[],[Special Keywords],[Election campaigns],[University of Pretoria]
3,1,155,1.0,0.71875,0.40625,0.59375,0.34375,0.65625,0.53125,0.75,0.34375,0.5,18.7,0.890323,64687.264516,1.225806,0.290323,0.430132,0.084657,City of Tshwane,[Education],"[Arrests, Working conditions, Court hearing]",[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[Police attack],[Special Keywords],"[16 Days of activism, Election campaigns]",[University of Pretoria]
4,0,146,0.714286,0.714286,1.0,0.857143,0.714286,0.714286,0.761905,0.666667,0.666667,0.857143,16.1,1.554795,55115.760274,1.068493,0.328767,0.325272,0.035308,City of Tshwane,[Education],[Court hearing],[Gathering],[Political Party],[School],[Police weapons],[],[Special Keywords],[Election campaigns],[University of Pretoria]


In [None]:
# Number of training examples (protest rows plus sampled non-protest rows).
len(training)

66

# Creating a simple model

In [None]:
# logistic_regression returns (model, score); display the score on the held-out rows.
log = logistic_regression(training)
log[1]

0.8

In [None]:
# log = niave(training)  # the Naive Bayes helper above is defined under the name `niave`
# log[1]

In [None]:
# linearSVM returns (model, score); display the score on the held-out rows.
# Note that this reuses (overwrites) the `log` variable from the logistic-regression cell.
log = linearSVM(training)
log[1]



0.6428571428571429

In [None]:
# Preview the first rows of the training table before saving it.
training.head()

Unnamed: 0,Protest,NuTweets,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,average,likes,followers,retweets,replies,sub,pol,place,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests,universities
0,1,182,0.48,0.44,0.26,0.18,0.18,1.0,0.56,0.34,0.28,0.4,20.6,1.06044,38626.78022,1.807692,0.313187,0.357232,0.065136,City of Tshwane,[Education],[Court hearing],[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[Police attack],[Special Keywords],"[Election campaigns, Exhibitions]",[University of Pretoria]
1,1,171,0.25,1.0,0.71875,0.40625,0.59375,0.34375,0.65625,0.53125,0.75,0.34375,17.9,0.754386,75824.216374,1.017544,0.298246,0.431467,0.092549,City of Tshwane,[Education],[Working conditions],[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[Police attack],[Special Keywords],[Election campaigns],[University of Pretoria]
2,0,147,0.48,0.44,0.96,1.0,0.52,0.6,0.56,0.44,0.52,0.84,15.9,0.843537,57772.62585,1.435374,0.292517,0.336645,-0.020014,City of Tshwane,[Mining],[Court hearing],[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[],[Special Keywords],[Election campaigns],[University of Pretoria]
3,1,155,1.0,0.71875,0.40625,0.59375,0.34375,0.65625,0.53125,0.75,0.34375,0.5,18.7,0.890323,64687.264516,1.225806,0.290323,0.430132,0.084657,City of Tshwane,[Education],"[Arrests, Working conditions, Court hearing]",[Gathering],[Political Party],[Tertiary Edu],[Police weapons],[Police attack],[Special Keywords],"[16 Days of activism, Election campaigns]",[University of Pretoria]
4,0,146,0.714286,0.714286,1.0,0.857143,0.714286,0.714286,0.761905,0.666667,0.666667,0.857143,16.1,1.554795,55115.760274,1.068493,0.328767,0.325272,0.035308,City of Tshwane,[Education],[Court hearing],[Gathering],[Political Party],[School],[Police weapons],[],[Special Keywords],[Election campaigns],[University of Pretoria]


In [None]:
# Save the training table. With to_csv's defaults the index is written as the
# first, unnamed CSV column.
training.to_csv('DATA/training_data.csv')