# Create training dataset

In [442]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from statistics import multimode
from cmath import nan
pd.set_option("display.max_rows", None, "display.max_columns", None)



In [443]:
sa_training = pd.read_csv('DATA/super_data.csv')

# Functions

In [444]:
def split_string(string):
    if string == string:
        string = string.replace("'", "")
        out = string.strip('][').split(', ')
        return out
    else:
        return nan

In [445]:
def get_subregion(dist1, dist2, place):
    row = dist1.loc[dist1['Place0']==place]
    shape = row['geometry'].to_list()
    shape = shape[0]

    places = dist2['geometry']
    res = places.within(shape)
    res = res.to_list()
    temp = dist2
    temp['res'] = res
    temp = temp.loc[temp['res']==True]
    return(temp)

### tweets

In [446]:
def plot_Tweets(dist, tweets, n):    
    Nu_Tweets = []
    for index, row in dist.iterrows():
        size = len(tweets.loc[tweets['Place' + str(n)]==row['Place'+ str(n)]])

        Nu_Tweets.append(size)

    Nu_Tweets
    dist['Tweets'] = Nu_Tweets

    plot = dist.explore(
        column="Tweets", # make choropleth based on "Data" column
        tooltip=['Place'+ str(n),'Tweets'], # show "Province, Data" value in tooltip (on hover)
        popup=True, # show all values in popup (on click)
        tiles="CartoDB positron", # use "CartoDB positron" tiles
        cmap="Paired", # https://matplotlib.org/stable/tutorials/colors/colormaps.html
        style_kwds=dict(color="black") # use black outline
        )
    return(plot)

In [447]:
def tweet_rate_av2(dates, tweets):
    tweet_day = [0]*2
    for i, date in enumerate(dates):
        day = date
        d1 = day - timedelta(days=2) 
        d2 = day + timedelta(days=2) 
        delta = (d2-d1).days
        for i in range(delta):
            day = d1+timedelta(days=i)
            NuTweets = len(tweets.loc[tweets['Date']==day]) 
            tweet_day[i]=tweet_day[i]+NuTweets

    # getting the average
    tweet_day =[x/len(dates) for x in tweet_day]

    # normalising
    #tweet_day =[x/max(tweet_day) for x in tweet_day]

    # Getting the average of the sets
    average1 = [sum(tweet_day)/len(tweet_day)]*2

    # this is just for the x axis
    date = list(range(1,10+1))


    plt.plot(date,tweet_day, "-r", label="Tweets")
    plt.plot(date,average1,"-b", label="Average Over All Set Dates")
    plt.axvline(x = 5, color = 'y', label = 'Centre (location of protest/non-protests)')
    plt.legend(loc="upper left")
    plt.ylabel('Number of Tweets')
    plt.xlabel('Date')
    plt.show()

In [448]:
def tweet_rate2(dates, tweets):
    for date in dates:
        day = date
        d1 = day - timedelta(days=5) 
        d2 = day + timedelta(days=5) 
        date2 = []
        tweet_day = []
        delta = (d2-d1).days

        for i in range(delta):
            day = d1+timedelta(days=i)
            NuTweets = len(tweets.loc[tweets['Date']==day]) 
            date2.append(i)
            tweet_day.append(NuTweets)

        # Normalising
        
        if len(tweet_day) == 0:
            tweet_day = [0]*(int(5*2))
        else:
            if max(tweet_day) == 0:
                tweet_day = [0]*(int(5*2))
            else:
                tweet_day =[x/max(tweet_day) for x in tweet_day]


        plt.plot(date2,tweet_day)
        plt.ylabel('Number of Tweets')
        plt.xlabel('Date') 

### Dates

In [449]:
def get_dates(protests,tweets):
    start = tweets.tail(1)['Date'].to_list()
    start = start[0]
    end = tweets.head(1)['Date'].to_list()
    end = end[0]

    delta = (end-start).days
    delta = delta
    yes = []
    no = []
    protests2 = []

    for i in range(delta):
        date = start + timedelta(days=i)
        temp = protests.loc[protests['Date']==date]
        if temp.empty:
            no.append(date)
        else:
            yes.append(date)
    return(yes, no)

### Training set

In [450]:
def get_start(protests, tweets, x):
    dates = get_dates(protests, tweets)
    yes = dates[0]
    no = dates[1]


    res = [1]*len(yes)
    data = {'Protest':res,'Date':yes}
    yes = pd.DataFrame(data)

    res = [0]*len(no)
    data = {'Protest':res,'Date':no}
    no = pd.DataFrame(data)

    # randomly sampling len(yes) samples from no such that there is an equal number of 
    # protests to non protest
    #x = 46 # 0.83 & 0.75
    # x = 54 # 0.87 0.53


    no = no.sample(len(yes), random_state = x)

    # Joining the dataframes, resetting the index and randomising the rows. 
    data = pd.concat([yes,no])
    data = data.sample(frac=1, random_state = x)
    data = data.reset_index(drop=True)

    return(data)

In [451]:
# return(tweet_day,average,specifiedTweets,length)
# return(likes,followers,retweets,replies,Subjectivity,Polarity)
# return(centre)

In [452]:
def get_tweets(date, tweets):
    day = date
    d1 = day - timedelta(days=5) 
    d2 = day + timedelta(days=5) 
    delta = (d2-d1).days
    delta = int(delta)
    tweet_day = [0]*(int(5*2))
    tweet_day_zero = tweet_day
    for i in range(10):
        day = d1+timedelta(days=i)
        NuTweets = len(tweets.loc[tweets['Date']==day]) 
        tweet_day[i]=tweet_day[i]+NuTweets
    # average 
    average = sum(tweet_day)/len(tweet_day)
    # normalising
    if len(tweet_day) == 0:
        tweet_day = tweet_day_zero
    else:
        if max(tweet_day) == 0:
            tweet_day = [0]*(int(5*2))
        else:
            tweet_day =[x/max(tweet_day) for x in tweet_day]
    # # average 
    # average = sum(tweet_day)/len(tweet_day)
    # specific tweets.
    specifiedTweets = tweets[((tweets['Date'] >d1)&(tweets['Date'] <d2))]
    specifiedTweets = specifiedTweets.reset_index(drop=True)
    length = len(specifiedTweets)

    return(tweet_day,average,specifiedTweets,length)

In [453]:
def concat_lists_to_list(column_name, df):
    content_list  = []
    for iter_t, tweet in df.iterrows():
        if tweet[column_name] == tweet[column_name]:
            for content in tweet[column_name]:
                content_list.append(content)
    return content_list

In [454]:
def tweet_metrics(tweets):
    length = len(tweets)
    if length == 0:
        length = 1
    followers = (tweets['author_followers'].sum())/length
    retweets = (tweets['retweets'].sum())/length
    replies = (tweets['replies'].sum())/length
    likes = (tweets['likes'].sum())/length
    Subjectivity = (tweets['Subjectivity'].sum())/length
    Polarity = (tweets['Polarity'].sum())/length
    

    grievance_list = concat_lists_to_list('grievances', tweets)
    greivance_mode = multimode(grievance_list)

    triggers_list = concat_lists_to_list('triggers', tweets)
    triggers_mode = multimode(triggers_list)

    tactics_list = concat_lists_to_list('tactics', tweets)
    tactics_mode = multimode(tactics_list)

    actors_list = concat_lists_to_list('actors', tweets)
    actors_mode = multimode(actors_list)

    locations_list = concat_lists_to_list('locations', tweets)
    locations_mode = multimode(locations_list)

    weapons_list = concat_lists_to_list('weapons', tweets)
    weapons_mode = multimode(weapons_list)

    eventualities_list = concat_lists_to_list('eventualities', tweets)
    eventualities_mode = multimode(eventualities_list)

    curiosities_list = concat_lists_to_list('curiosities', tweets)
    curiosities_mode = multimode(curiosities_list)

    non_protests_list = concat_lists_to_list('non_protests', tweets)
    non_protests_mode = multimode(non_protests_list)

    universities_list = concat_lists_to_list('universities', tweets)
    universities_mode = multimode(universities_list)


    return(likes,followers,retweets,replies,Subjectivity,Polarity,greivance_mode,triggers_mode,
            tactics_mode,actors_mode,locations_mode,weapons_mode,eventualities_mode,
            curiosities_mode,non_protests_mode,universities_mode)

In [455]:
def get_centre(tweets):
    # locations = tweets['geometry'].centroid
    # print(locations.head())
    # locations = locations.reset_index(drop=True)
    # centre = locations.centriod

    # its not working for now
    # centre = 'too bad'
    # return(centre)
    

    # We are going to return the place
    if len(tweets['Place1'].value_counts()) == 0:
        place = nan
    else:
        place = tweets['Place1'].value_counts().idxmax()
    return(place)
    

# Training Data

In [456]:
def get_training(data, tweets):
    NuTweets = []
    average = []
    d1 = []
    d2 = []
    d3 = []
    d4 = []
    d5 = []
    d6 = []
    d7 = []
    d8 = []
    d9 = []
    d10 = []

    likes = []
    followers = []
    retweets = []
    replies = []
    sub = []
    pol = []
    griev = []
    trigg = []
    tact = []
    act = []
    loca = []
    weap = []
    even = []
    curi = []
    nonp = []
    uni = []

    centre = []


    for index, row in data.iterrows():
        date = row['Date']
        
        res1 = get_tweets(date,tweets)
        res2 = tweet_metrics(res1[2])
        res3 = get_centre(res1[2])


        average.append(res1[1])
        NuTweets.append(res1[3])
        d1.append(res1[0][0])
        d2.append(res1[0][1])
        d3.append(res1[0][2])
        d4.append(res1[0][3])
        d5.append(res1[0][4])
        d6.append(res1[0][5])
        d7.append(res1[0][6])
        d8.append(res1[0][7])
        d9.append(res1[0][8])
        d10.append(res1[0][9])


        likes.append(res2[0])
        followers.append(res2[1])
        retweets.append(res2[2])
        replies.append(res2[3])
        sub.append(res2[4])
        pol.append(res2[5])
        griev.append(res2[6])
        trigg.append(res2[7])
        tact.append(res2[8])
        act.append(res2[9])
        loca.append(res2[10])
        weap.append(res2[11])
        even.append(res2[12])
        curi.append(res2[13])
        nonp.append(res2[14])
        uni.append(res2[15])
        

        centre.append(res3)

    

    metrics = {'NuTweets':NuTweets,'d1':d1,'d2':d2,'d3':d3,'d4':d4,
               'd5':d5,'d6':d6,'d7':d7,'d8':d8,'d9':d9,'d10':d10,
               'average':average,'likes':likes,'followers':followers,
               'retweets':retweets,'replies':replies,'sub':sub,'pol':pol,
               'place':centre,'grievances':griev, 'triggers': trigg,
               'tactics': tact, 'actors': act, 'locations': loca,
               'weapons': weap, 'eventualities': even, 'curiosities': curi,
               'non_protests':nonp, 'universities':uni}
    metrics = pd.DataFrame(metrics)

        
    training = pd.concat([data,metrics.reindex(data.index)], axis=1)
    training = training.drop(['Date'], axis = 1)
    return(training)

# Models

In [457]:
def logistic_regression(training):
    X_train,X_test,y_train,y_test = train_test_split(training.drop(['Protest','place', 'grievances', 'triggers', 'tactics', 'actors', 'locations', 'weapons', 'eventualities', 'curiosities', 'non_protests', 'universities'],axis = 1), training['Protest'],test_size = 0.3,random_state=1)
    # training the model
    logreg =  LogisticRegression(solver='lbfgs')
    logreg.fit(X_train,y_train)
    score = logreg.score(X_test,y_test)
    return(logreg,score)

In [458]:
def niave(training):
    from sklearn.naive_bayes import GaussianNB
    X_train,X_test,y_train,y_test = train_test_split(training.drop(['Protest','place', 'grievances', 'triggers', 'tactics', 'actors', 'locations', 'weapons', 'eventualities', 'curiosities', 'non_protests', 'universities'],axis = 1), training['Protest'],test_size = 0.2, random_state = 0)
    naive = GaussianNB()
    naive.fit(X_train,y_train)
    score = naive.score(X_test,y_test)
    return(naive,score)

In [459]:
def linearSVM(training):
    from sklearn.svm import LinearSVC
    X_train,X_test,y_train,y_test = train_test_split(training.drop(['Protest','place', 'grievances', 'triggers', 'tactics', 'actors', 'locations', 'weapons', 'eventualities', 'curiosities', 'non_protests', 'universities'],axis = 1), training['Protest'],test_size = 0.2, random_state = 0)
    svm = LinearSVC()
    svm.fit(X_train,y_train)
    score = svm.score(X_test,y_test)
    return(svm,score)

In [460]:
def random_forest_classifier(training):
    from sklearn.ensemble import RandomForestClassifier
    X_train,X_test,y_train,y_test = train_test_split(training.drop(['Protest','place', 'grievances', 'triggers', 'tactics', 'actors', 'locations', 'weapons', 'eventualities', 'curiosities', 'non_protests', 'universities'],axis = 1), training['Protest'],test_size = 0.2, random_state = 0)
    forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    forest.fit(X_train, y_train)
    score = forest.score(X_test,y_test)
    return(forest,score)

In [461]:
def decision_tree_classifier(training):
    from sklearn.tree import DecisionTreeClassifier
    X_train,X_test,y_train,y_test = train_test_split(training.drop(['Protest','place', 'grievances', 'triggers', 'tactics', 'actors', 'locations', 'weapons', 'eventualities', 'curiosities', 'non_protests', 'universities'],axis = 1), training['Protest'],test_size = 0.2, random_state = 0)
    tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    tree.fit(X_train, y_train)
    score = tree.score(X_test,y_test)
    return(tree,score)

In [462]:
def k_means(training):
    from sklearn import datasets
    from sklearn.cluster import KMeans
    X_train,X_test,y_train,y_test = train_test_split(training.drop(['Protest','place', 'grievances', 'triggers', 'tactics', 'actors', 'locations', 'weapons', 'eventualities', 'curiosities', 'non_protests', 'universities'],axis = 1), training['Protest'],test_size = 0.2, random_state = 0)
    model = KMeans(n_clusters=3)
    model.fit(X_train, y_train)
    score = model.score(X_test,y_test)
    return(model,score)

# MAIN

### Downloading tweets and protests

In [463]:
tweets = pd.read_csv('DATA/tweets.csv')

In [464]:
tweets['grievances'] = tweets['grievances'].apply(lambda x: split_string(x))
tweets['triggers'] = tweets['triggers'].apply(lambda x: split_string(x)) 
tweets['tactics'] = tweets['tactics'].apply(lambda x: split_string(x)) 
tweets['actors'] = tweets['actors'].apply(lambda x: split_string(x)) 
tweets['locations'] = tweets['locations'].apply(lambda x: split_string(x)) 
tweets['weapons'] = tweets['weapons'].apply(lambda x: split_string(x)) 
tweets['eventualities'] = tweets['eventualities'].apply(lambda x: split_string(x)) 
tweets['curiosities'] = tweets['curiosities'].apply(lambda x: split_string(x)) 
tweets['non_protests'] = tweets['non_protests'].apply(lambda x: split_string(x)) 
tweets['universities'] = tweets['universities'].apply(lambda x: split_string(x))

In [465]:

tweets['geometry'] = gpd.GeoSeries.from_wkt(tweets['geometry'])
tweets['Date']=pd.to_datetime(tweets['Date'], format='%Y %m %d')
tweets = tweets.drop(['Unnamed: 0','Analysis'],axis=1)

In [466]:
protests = pd.read_csv('DATA/protests.csv')
protests['geometry'] = gpd.GeoSeries.from_wkt(protests['geometry'])
protests['Date']=pd.to_datetime(protests['Date'], format='%Y %m %d')
protests = protests.drop(['Unnamed: 0'],axis=1)
protests.drop_duplicates(['Date','Place0'],keep= 'first',inplace =True)
print(len(protests))

1029


In [467]:
tweets.head(1)

Unnamed: 0,text,Date,geometry,author_followers,retweets,replies,likes,quote_count,Subjectivity,Polarity,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests,universities,Place0,Place1,Place2,Place3
0,"@Cristiano your attitude stinks, if a player m...",2022-10-09,POINT (28.23140 -25.75450),11305,0,0,0,0,0.5,-0.6,,,[Attack],,,,,,,[University of Pretoria],Gauteng,City of Tshwane,City of Tshwane,56


In [468]:
protests.head(1)

Unnamed: 0,Date,notes,geometry,Place0,Place1,Place2,Place3
0,2022-09-28,"On 28 September 2022, hundreds of students blo...",POINT (28.81580 -28.52420),Free State,Thabo Mofutsanyane,Maluti a Phofung,27


# Lets test protest in a chosen place

In [469]:
places = protests['Place0'].unique()
print(places)

['Free State' 'KwaZulu-Natal' 'Gauteng' 'Limpopo' 'Eastern Cape'
 'Western Cape' 'Mpumalanga' 'Nothern Cape' '0' 'North West']


In [470]:
print(tweets['Place0'].unique())

['Gauteng' 'Western Cape' 'Eastern Cape' 'KwaZulu-Natal' 'Free State'
 'Nothern Cape' 'Limpopo' 'Mpumalanga' 'North West' '0']


In [471]:
# sa_training = pd.DataFrame()
# for place in places:
#     p = protests.loc[(protests['Place0'] == place)]
#     t = tweets.loc[(tweets['Place0'] == place)]
#     if p.empty == False:
#         data = get_start(p, t, 8)
#         training = get_training(data, t)
#         sa_training = pd.concat([sa_training, training])
# sa_training.reset_index(drop = True, inplace = True)

In [472]:
outcome = []

for x in range(1,20):
    sa_training = pd.DataFrame()
    for place in places:
        p = protests.loc[(protests['Place0'] == place)]
        t = tweets.loc[(tweets['Place0'] == place)]
        if p.empty == False:
            data = get_start(p, t, x)
            training = get_training(data, t)
            sa_training = pd.concat([sa_training, training])
    sa_training.reset_index(drop = True, inplace = True)

    sa_training.to_csv('DATA/temp.csv')
    sa_training = pd.read_csv('DATA/temp.csv')
    

    tree = decision_tree_classifier(training)
    tree = tree[1]

    forest = random_forest_classifier(training)
    forest = forest[1]

    n = niave(training)
    n = n[1]

    km = k_means(training)
    km = km[1]

    models = [x, tree, forest, n, km]
    outcome.append(models)
    print(x)


In [473]:
# results = pd.DataFrame(outcome)
# results.to_csv('results.csv')

In [474]:
# Expected Time for 20k tweets = seconds
# data = get_start(protests,tweets)
# training = get_training(data,tweets)
# training.head(5)

In [475]:
temp = sa_training

In [476]:
sa_training.to_csv('DATA/super_data.csv')

In [477]:
sa_training = pd.read_csv('DATA/super_data.csv')

In [478]:
sa_training.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Protest,NuTweets,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,average,likes,followers,retweets,replies,sub,pol,place,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests,universities
0,0,0,0,1,47,0.0,0.0,0.25,0.416667,0.166667,0.666667,0.166667,0.916667,1.0,0.333333,4.7,4.617021,8219.595745,8.468085,0.425532,0.300603,0.074874,Mangaung,['Labour related'],['Court hearing'],"['Meeting', 'March']",['Union'],['Tertiary Edu'],[],['Police attack'],['Special Keywords'],[],['Central University of Technology']
1,1,1,1,1,18,0.333333,0.333333,0.666667,1.0,1.0,1.0,0.333333,0.666667,0.666667,0.333333,1.9,3.0,12640.777778,1.111111,0.722222,0.308773,0.163376,Mangaung,['National govt'],"['Working conditions', 'Crime Event']",['Disrupt'],"['Political Party', 'Civic org']",['Court'],['Police weapons'],[],[],['16 Days of activism'],['Central University of Technology']
2,2,2,2,1,19,0.6,0.6,0.4,0.2,1.0,0.6,0.2,0.2,0.6,0.0,2.2,1.052632,2978.947368,0.421053,0.421053,0.345727,0.01014,Mangaung,['Labour related'],['Working conditions'],['Disrupt'],['Political Party'],['Tertiary Edu'],[],[],[],['Other sport'],['Central University of Technology']
3,3,3,3,0,29,1.0,0.285714,0.714286,0.142857,0.857143,0.714286,0.428571,0.142857,0.571429,0.285714,3.6,3.103448,25035.103448,1.310345,1.137931,0.379747,-0.00831,Mangaung,['Values'],['Working conditions'],['March'],"['Political Party', 'Civic org']","['Tertiary Edu', 'School']",[],[],[],"['Other sport', '16 Days of activism']",['Central University of Technology']
4,4,4,4,0,17,0.2,0.4,0.4,0.6,0.4,0.4,0.2,0.0,0.0,1.0,1.8,7.117647,7745.411765,3.882353,0.764706,0.298517,0.057292,Mangaung,"['Healthcare', 'Education']","['Working conditions', 'Arrests', 'Court heari...",['Disrupt'],['Political Party'],['School'],['Police weapons'],[],['Special Keywords'],"['Election campaigns', 'Other sport']",['Central University of Technology']


In [479]:
sa_training.shape

(2054, 33)

# Creating a simple model

In [480]:
forest = random_forest_classifier(sa_training)
forest[1]

0.5985401459854015

In [481]:
tree = decision_tree_classifier(sa_training)
tree[1]

0.5401459854014599

In [482]:
log = logistic_regression(sa_training)
log[1]

0.5526742301458671

In [483]:
log = niave(sa_training)
log[1]

0.5523114355231143

In [484]:
log = linearSVM(sa_training)
log[1]



0.5036496350364964

In [485]:
sa_training.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Protest,NuTweets,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,average,likes,followers,retweets,replies,sub,pol,place,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests,universities
0,0,0,0,1,47,0.0,0.0,0.25,0.416667,0.166667,0.666667,0.166667,0.916667,1.0,0.333333,4.7,4.617021,8219.595745,8.468085,0.425532,0.300603,0.074874,Mangaung,['Labour related'],['Court hearing'],"['Meeting', 'March']",['Union'],['Tertiary Edu'],[],['Police attack'],['Special Keywords'],[],['Central University of Technology']
1,1,1,1,1,18,0.333333,0.333333,0.666667,1.0,1.0,1.0,0.333333,0.666667,0.666667,0.333333,1.9,3.0,12640.777778,1.111111,0.722222,0.308773,0.163376,Mangaung,['National govt'],"['Working conditions', 'Crime Event']",['Disrupt'],"['Political Party', 'Civic org']",['Court'],['Police weapons'],[],[],['16 Days of activism'],['Central University of Technology']
2,2,2,2,1,19,0.6,0.6,0.4,0.2,1.0,0.6,0.2,0.2,0.6,0.0,2.2,1.052632,2978.947368,0.421053,0.421053,0.345727,0.01014,Mangaung,['Labour related'],['Working conditions'],['Disrupt'],['Political Party'],['Tertiary Edu'],[],[],[],['Other sport'],['Central University of Technology']
3,3,3,3,0,29,1.0,0.285714,0.714286,0.142857,0.857143,0.714286,0.428571,0.142857,0.571429,0.285714,3.6,3.103448,25035.103448,1.310345,1.137931,0.379747,-0.00831,Mangaung,['Values'],['Working conditions'],['March'],"['Political Party', 'Civic org']","['Tertiary Edu', 'School']",[],[],[],"['Other sport', '16 Days of activism']",['Central University of Technology']
4,4,4,4,0,17,0.2,0.4,0.4,0.6,0.4,0.4,0.2,0.0,0.0,1.0,1.8,7.117647,7745.411765,3.882353,0.764706,0.298517,0.057292,Mangaung,"['Healthcare', 'Education']","['Working conditions', 'Arrests', 'Court heari...",['Disrupt'],['Political Party'],['School'],['Police weapons'],[],['Special Keywords'],"['Election campaigns', 'Other sport']",['Central University of Technology']


In [486]:
test = pd.read_csv('DATA/super_data.csv')

In [487]:
test.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Protest,NuTweets,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,average,likes,followers,retweets,replies,sub,pol,place,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests,universities
0,0,0,0,1,47,0.0,0.0,0.25,0.416667,0.166667,0.666667,0.166667,0.916667,1.0,0.333333,4.7,4.617021,8219.595745,8.468085,0.425532,0.300603,0.074874,Mangaung,['Labour related'],['Court hearing'],"['Meeting', 'March']",['Union'],['Tertiary Edu'],[],['Police attack'],['Special Keywords'],[],['Central University of Technology']
1,1,1,1,1,18,0.333333,0.333333,0.666667,1.0,1.0,1.0,0.333333,0.666667,0.666667,0.333333,1.9,3.0,12640.777778,1.111111,0.722222,0.308773,0.163376,Mangaung,['National govt'],"['Working conditions', 'Crime Event']",['Disrupt'],"['Political Party', 'Civic org']",['Court'],['Police weapons'],[],[],['16 Days of activism'],['Central University of Technology']
2,2,2,2,1,19,0.6,0.6,0.4,0.2,1.0,0.6,0.2,0.2,0.6,0.0,2.2,1.052632,2978.947368,0.421053,0.421053,0.345727,0.01014,Mangaung,['Labour related'],['Working conditions'],['Disrupt'],['Political Party'],['Tertiary Edu'],[],[],[],['Other sport'],['Central University of Technology']
3,3,3,3,0,29,1.0,0.285714,0.714286,0.142857,0.857143,0.714286,0.428571,0.142857,0.571429,0.285714,3.6,3.103448,25035.103448,1.310345,1.137931,0.379747,-0.00831,Mangaung,['Values'],['Working conditions'],['March'],"['Political Party', 'Civic org']","['Tertiary Edu', 'School']",[],[],[],"['Other sport', '16 Days of activism']",['Central University of Technology']
4,4,4,4,0,17,0.2,0.4,0.4,0.6,0.4,0.4,0.2,0.0,0.0,1.0,1.8,7.117647,7745.411765,3.882353,0.764706,0.298517,0.057292,Mangaung,"['Healthcare', 'Education']","['Working conditions', 'Arrests', 'Court heari...",['Disrupt'],['Political Party'],['School'],['Police weapons'],[],['Special Keywords'],"['Election campaigns', 'Other sport']",['Central University of Technology']
