In [1]:
import os
os.chdir('/home/composersyf/Documents/Political Data Science Project')

In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
import glob
all_at_filenames=glob.glob("TwitterData/* -at,*")
persons=[]
for name in all_at_filenames:
    persons.append(name.split(",")[1].split("-")[0])
persons=np.unique(persons)
persons

array(['BernieSanders', 'HillaryClinton', 'JohnKasich', 'SenRubioPress',
       'SenSanders', 'realDonaldTrump', 'tedcruz'], 
      dtype='<U15')

In [4]:
person_name_dict={"Donald Trump": "realDonaldTrump",
                 "Hillary Clinton": "HillaryClinton",
                 "Bernie Sanders": ["BernieSanders","SenSanders"],
                 "Ted Cruz": "tedcruz",
                 "John Kasich":"JohnKasich",
                 "Marco Rubio":"SenRubioPress"}

## A summary of @realDonaldTrump tweets during on 2016-06-01, as a preliminary exploratory data analysis

In [5]:
trump_filenames=glob.glob("TwitterData/* -at,realDonaldTrump*2016.06.01*")
all_tweets=[]
all_usernames=[]
for f in trump_filenames:
    with open(f) as file:
        file_texts=file.readlines()
        file_texts=file_texts[1:]
    tweets=[None]*len(file_texts)
    usernames=[None]*len(file_texts)
    for i,text in enumerate(file_texts):
        splitted_text=text.split("\",\"")
        tweets[i]=splitted_text[11]
        usernames[i]=splitted_text[6]
    all_tweets.extend(tweets)
    all_usernames.extend(usernames)
tweets_data=np.array(all_usernames+all_tweets)
tweets_data=tweets_data.reshape(2,len(all_tweets))
tweets_data=tweets_data.T

In [6]:
#There are some duplicated tweets (with same tweet texts and same UserName), so they need to be removed
tweets_data_df=pd.DataFrame(tweets_data)
print(np.where(tweets_data_df.duplicated()==True))
tweets_data_df=tweets_data_df.drop_duplicates()

(array([   174,    176,   1273,   1306,   1329,   3332,   4215,   6038,
         7252,  10744,  11036,  11521,  11577,  13052,  13333,  13589,
        14578,  14598,  14621,  15699,  15790,  15893,  15936,  17934,
        18823,  21186,  21438,  21524,  22178,  22233,  24384,  24487,
        24716,  25424,  25690,  27625,  31651,  31707,  32254,  32893,
        35316,  37489,  40347,  41326,  41392,  42668,  45745,  46695,
        47723,  50395,  50865,  52688,  52894,  55673,  56841,  57876,
        58011,  58321,  58335,  58349,  58567,  58606,  58738,  59426,
        60410,  61603,  63691,  64048,  65022,  65991,  66744,  67355,
        69884,  71278,  74045,  75419,  77140,  78172,  78423,  79260,
        79518,  80916,  83035,  87123,  90164,  90276,  91200,  92162,
        92887,  93155,  93221,  93961,  94050,  94593,  94697,  94788,
        95664,  96699,  99029,  99292,  99659, 100244, 103592, 103653,
       104555, 104788, 105134, 105271, 105669, 105753, 106193, 106878,
     

In [7]:
tweets_data_df.shape
#Total number of tweets @realDonaldTrump on 06-01-2016 is 242,882

242882

In [6]:
from nltk.tokenize import TweetTokenizer

#### Analyze the hashtags

In [9]:
tknzr = TweetTokenizer()
hashtag_regex=re.compile("^#")
all_hashtags=[]
for i in range(tweets_data_df.shape[0]):
    tokens=tknzr.tokenize(tweets_data_df.iloc[i,1])
    for token in tokens:
        if hashtag_regex.findall(token)==["#"] and token!="#": #it was found that '#' is not acutally a hashtag, so it should be removed
            all_hashtags.append(token)

#re.search("^\#",tknzr.tokenize(tweets_data_df.iloc[8,1])[5])

In [10]:
print(len(all_hashtags)) #total number of hashtags used by the all tweets @realDonaldTrump on 06-01-2016
print(len(np.unique(all_hashtags))) #total number of unique hashtags

99547
10908


In [7]:
from collections import Counter
hashtag_counts=Counter(all_hashtags)
hashtag_counts.most_common(50) #return the 50 most popular hashtags for @realDonaldTrump on 06-01-2016

In [12]:
#some noticeable hashtags:
#MSM: Mainstream Media
#tcot: "Top Conservatives on Twitter"
#Vets: Veterans
#Hannity: a hashtag closely related to Sean Hannity
#ICYMI: In case you missed it
#fringecandidates: a hashtag closely related to the potential third party presidential candidate Gary Johnson
#dtmag: the abridged hashtag of #TRUMPmagazine
#LDTPoll: ??
#greta: a hashtag closely related to Greta Van Susteren
#pjnet: Patriot Journalist Network
#RollingThunder: Rolling Thunder advocacy group (motorcycle rally)
#DontheCon: Trump is a con man!!!!!
#tlot: Top Libertarians on Twitter
#BestCampaignEver: ??
#2A: Support for all Americans' 2nd Amendment rights to keep and bear arms

In [12]:
#calculate the total percentage of those top 50 hashtags
counts_sum=0
for i in hashtag_counts.most_common(50):
    counts_sum+=i[1]
print(counts_sum/len(all_hashtags))

#calculate the total percentage of those top 100 hashtags
counts_sum=0
for i in hashtag_counts.most_common(100):
    counts_sum+=i[1]
print(counts_sum/len(all_hashtags))

#calculate the total percentage of those top 20 hashtags
counts_sum=0
for i in hashtag_counts.most_common(20):
    counts_sum+=i[1]
print(counts_sum/len(all_hashtags))

0.5252795162084242
0.6131073764151607
0.4179031010477463


#### Analyze the tweets that contain the @ not exclusively for realDonaldTrump

In [17]:
tknzr = TweetTokenizer()
at_regex=re.compile("^@")
all_at={}
for i in range(tweets_data_df.shape[0]):
    tokens=tknzr.tokenize(tweets_data_df.iloc[i,1])
    for token in tokens:
        if at_regex.findall(token)==["@"] and token!="@" and token!="@realDonaldTrump" and token!="@RealDonaldTrump": #it was found that '#' is not acutally a hashtag, so it should be removed
            try:
                all_at[i].append(token)
            except KeyError:
                all_at[i]=[token]

In [21]:
print(len(list(all_at.values())))
print(list(all_at.values())[0:50])
at_opponent=[]
for k in all_at.keys():
    for at in all_at[k]:
        if at.lower() in ['@berniesanders', '@hillaryclinton', '@johnkasich', '@senrubiopress','@sensanders',  '@tedcruz']:
            at_opponent.append(k)
            break

168648
[['@SweetzonWheels', '@chrgdup1973', '@realDrumpf'], ['@HillaryClinton'], ['@AdBell45', '@winegirl73'], ['@realDrumpf'], ['@mmfa'], ['@ChatRevolve'], ['@TomLlamasABC', '@ABC'], ['@BBCWorld', '@SenSanders'], ['@AllyTaft', '@Randyh14', '@friedman_h'], ['@briefermadness'], ['@kanyewest', '@POTUS', '@jack'], ['@HillaryClinton'], ['@FriendsofJimmys', '@FrankyLamouche'], ['@Trump_Videos'], ['@TheLastRefuge2', '@NolteNC'], ['@davidaxelrod'], ['@MuncTomm', '@lee_baier', '@scotthwk62'], ['@scapesrus'], ['@starknightz'], ['@peterdaou'], ['@bigop1'], ['@TrumpsGucciGirl'], ['@DanScavino'], ['@chriskyleband', '@FLTrumpTeam'], ['@GovGaryJohnson'], ['@PimpBillClinton', '@LILBTHEBASEDGOD'], ['@ZivaBranstetter', '@samanthavicent', '@MayorBartlett', '@readfrontier'], ['@Destinbeach22'], ['@FoxNews'], ['@SovernNation'], ['@WayneDupreeShow'], ['@Chairmnoomowmow'], ['@kimguilfoyle', '@OutnumberedFNC'], ['@DiamondandSilk'], ['@Me262A1', '@Mischief2You', '@OhmyGalt', '@CalvinTurnquest', '@PalmBeachGOP

In [33]:
print(len(at_opponent))
len(at_opponent)/tweets_data_df.shape[0]*100
#~5.18% of the tweets have both @realDonaldTrump and @ other candidates

12593


5.1848222593687465

In [30]:
at_opponent_2=[]
for k in all_at.keys():
    for at in all_at[k]:
        if at.lower() == '@hillaryclinton':
            at_opponent_2.append(k)
            break

In [32]:
print(len(at_opponent_2))
len(at_opponent_2)/tweets_data_df.shape[0]*100
#Particularly, ~ 4.41% of the tweets have both @realDonaldTrump and @HillaryClinton

10703


4.406666611770325

## A summary of @realDonaldTrump tweets during 2016-05-26 to 2016-06-01

In [8]:
trump_filenames=glob.glob("TwitterData/* -at,realDonaldTrump*")
trump_filenames_table=[]
for fname in trump_filenames:
    trump_filenames_table.append(fname)
    trump_filenames_table.extend(fname.split("]")[1].split(".")[:3])
trump_filenames_table=np.array(trump_filenames_table)
trump_filenames_table=trump_filenames_table.reshape(len(trump_filenames_table)//4,4)
trump_filenames_table=pd.DataFrame(trump_filenames_table)
trump_filenames_table.columns=["filename","year","month","day"]
trump_filenames_table.day=trump_filenames_table["day"].astype(int)
trump_filenames_table.month=trump_filenames_table["month"].astype(int)
trump_filenames_table.year=trump_filenames_table["year"].astype(int)
trump_filenames_table=trump_filenames_table[(trump_filenames_table.day>=26) | (trump_filenames_table.day==1)]
trump_filenames_table=trump_filenames_table.sort_values(["year","month","day"])
trump_filenames_table

Unnamed: 0,filename,year,month,day
14,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
15,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
53,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
73,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
110,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
111,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
112,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
130,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
137,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26
149,"TwitterData/[ BATCH -at,realDonaldTrump-STATUS...",2016,5,26


In [9]:
days=np.unique(trump_filenames_table.day)
total_tweet_counts={}
hashtag_counts={}
at_opponent_counts_1={}
at_opponent_counts_2={}

for d in days: #loop through the entire week, on a daily basis
    fnames=np.array(trump_filenames_table[trump_filenames_table.day==d].iloc[:,0])
    all_tweets=[]
    all_usernames=[]
    for f in fnames:
        with open(f) as file:
            file_texts=file.readlines()
            file_texts=file_texts[1:]
        tweets=[None]*len(file_texts)
        usernames=[None]*len(file_texts)
        for i,text in enumerate(file_texts):
            splitted_text=text.split("\",\"")
            tweets[i]=splitted_text[11]
            usernames[i]=splitted_text[6]
        all_tweets.extend(tweets)
        all_usernames.extend(usernames)
    tweets_data=np.array(all_usernames+all_tweets)
    tweets_data=tweets_data.reshape(2,len(all_tweets))
    tweets_data=tweets_data.T
    #remove duplicates
    tweets_data_df=pd.DataFrame(tweets_data)
    tweets_data_df=tweets_data_df.drop_duplicates()
    total_tweet_counts[str(d)]=tweets_data_df.shape[0]
    
    #NLP: tokenize; counting number of distinct hashtags
    tknzr = TweetTokenizer()
    hashtag_regex=re.compile("^#")
    all_hashtags=[]
    for i in range(tweets_data_df.shape[0]):
        tokens=tknzr.tokenize(tweets_data_df.iloc[i,1])
        for token in tokens:
            if hashtag_regex.findall(token)==["#"] and token!="#": #it was found that '#' is not acutally a hashtag, so it should be removed
                all_hashtags.append(token)
    hashtag_counts[str(d)]=Counter(all_hashtags[:])
    
    tknzr = TweetTokenizer()
    at_regex=re.compile("^@")
    all_at={}
    #count the number of tweets that contain both @realDonaldTrump and @ other candidates (particularly @HillaryClinton)
    for i in range(tweets_data_df.shape[0]):
        tokens=tknzr.tokenize(tweets_data_df.iloc[i,1])
        for token in tokens:
            if at_regex.findall(token)==["@"] and token!="@" and token!="@realDonaldTrump" and token!="@RealDonaldTrump": #it was found that '#' is not acutally a hashtag, so it should be removed
                try:
                    all_at[i].append(token)
                except KeyError:
                    all_at[i]=[token]
    at_opponent_counts=0
    for k in all_at.keys():
        for at in all_at[k]:
            if at.lower() in ['@berniesanders', '@hillaryclinton','@johnkasich','@senrubiopress','@sensanders','@tedcruz']:
                at_opponent_counts+=1
                break
    at_opponent_counts_1[d]=at_opponent_counts
    at_opponent_counts=0
    for k in all_at.keys():
        for at in all_at[k]:
            if at.lower() == '@hillaryclinton':
                at_opponent_counts+=1
                break
    at_opponent_counts_2[d]=at_opponent_counts

In [10]:
total_tweet_counts #daily total tweet counts

{'1': 242882,
 '26': 251465,
 '27': 300386,
 '28': 221817,
 '29': 183855,
 '30': 183892,
 '31': 185945}

In [11]:
at_opponent_counts_1

{1: 12593, 26: 22713, 27: 48702, 28: 38140, 29: 14886, 30: 13811, 31: 7703}

In [12]:
at_opponent_counts_2

{1: 10703, 26: 15020, 27: 11787, 28: 10061, 29: 6291, 30: 9307, 31: 4962}

In [13]:
sum(at_opponent_counts_1.values())/sum(total_tweet_counts.values())*100
#10.1% of the tweets have both @realDonaldTrump and @ other candiates

10.097042366717996

In [14]:
sum(at_opponent_counts_2.values())/sum(total_tweet_counts.values())*100
#4.3% of the tweets have both @realDonaldTrump and @HillaryClinton

4.338885343787773

In [15]:
total_hashtag_counts=Counter([])
for v in hashtag_counts.values():
    total_hashtag_counts=total_hashtag_counts+v
print(total_hashtag_counts.most_common(50))
sum_of_hashtag_counts=0
for v in total_hashtag_counts.values():
    sum_of_hashtag_counts+=v
print('\n')
print(sum_of_hashtag_counts) #sum of all hashtag counts

#top 50 hashtags
sum_of_hashtag_counts_top=0
for v in total_hashtag_counts.most_common(50):
    sum_of_hashtag_counts_top+=v[1]
print('\n')
print(sum_of_hashtag_counts_top/sum_of_hashtag_counts)

#top 20 hashtags
sum_of_hashtag_counts_top=0
for v in total_hashtag_counts.most_common(20):
    sum_of_hashtag_counts_top+=v[1]
print(sum_of_hashtag_counts_top/sum_of_hashtag_counts)

#top 100 hashtags
sum_of_hashtag_counts_top=0
for v in total_hashtag_counts.most_common(100):
    sum_of_hashtag_counts_top+=v[1]
print(sum_of_hashtag_counts_top/sum_of_hashtag_counts)

[('#Trump2016', 97663), ('#ChickenTrump', 40357), ('#MakeAmericaGreatAgain', 36114), ('#TrumpTrain', 27608), ('#MAGA', 23984), ('#Trump', 13674), ('#BernieTrumpDebate', 11870), ('#NeverTrump', 10644), ('#RollingThunder', 8541), ('#FeelTheBern', 6024), ('#NeverHillary', 5736), ('#TeamTrump', 5538), ('#CrookedHillary', 5486), ('#tcot', 5448), ('#TRUMP', 5287), ('#dtmag', 4672), ('#2A', 4518), ('#veterans', 3649), ('#MeetTheTrumps', 3586), ('#AmericaFirst', 3557), ('#TrumpRally', 3528), ('#GOP', 3437), ('#VoteTrump', 3398), ('#SDTrumpRally', 3289), ('#MemorialDay2016', 3266), ('#MemorialDayWeekend', 3259), ('#NRA', 3104), ('#POTUS', 2666), ('#MemorialDay', 2517), ('#trump', 2481), ('#WhinyLittleBitch', 2445), ('#VoteTrump2016', 2416), ('#WomenForTrump', 2371), ('#MSM', 2250), ('#Trump2', 2175), ('#DonaldTrump', 2137), ('#LatinosForTrump', 2065), ('#Hillary', 2051), ('#AlwaysTrump', 2015), ('#LESM', 1948), ('#HillaryClinton', 1939), ('#nevertrump', 1926), ('#Anaheim', 1918), ('#trump2016',

In [19]:
date_dict={"1":"2016-06-01","26":"2016-05-26","27":"2016-05-27","28":"2016-05-28","29":"2016-05-29",
          "30":"2016-05-30","31":"2016-05-31"}
date_dict_2={"1":1,"26":26,"27":27,"28":28,"29":29,
            "30":30,"31":31}

for d in sorted(hashtag_counts.keys()):
    print(date_dict[d])
    print(hashtag_counts[d].most_common(50))

2016-06-01
[('#Trump2016', 11753), ('#MakeAmericaGreatAgain', 4274), ('#MAGA', 3839), ('#TrumpTrain', 2625), ('#Trump', 2124), ('#veterans', 2087), ('#NeverTrump', 1983), ('#MSM', 1706), ('#tcot', 1501), ('#TrumpPressConference', 1430), ('#Vets', 1325), ('#Hannity', 1273), ('#CrookedHillary', 864), ('#ICYMI', 860), ('#ChickenTrump', 793), ('#Veterans', 742), ('#NeverHillary', 653), ('#fringecandidates', 645), ('#trump', 571), ('#TRUMP', 553), ('#dtmag', 504), ('#LDTPoll', 482), ('#greta', 473), ('#VoteTrump', 443), ('#pjnet', 435), ('#GOP', 419), ('#trump2016', 413), ('#RollingThunder', 402), ('#EndCommonCore', 382), ('#nevertrump', 378), ('#VoteTrump2016', 377), ('#DonaldTrump', 371), ('#DonTheCon', 353), ('#DumpTrump', 350), ('#Election2016', 348), ('#TrumpUniversity', 339), ('#NewMexico', 324), ('#Massachusetts', 319), ('#2ndAmendment', 318), ('#gayrights', 313), ('#CAforTrump', 311), ('#tlot', 310), ('#CNN', 307), ('#Concerned', 304), ('#AlwaysTrump', 296), ('#BestCampaignEver', 28

## A summary of @HillaryClinton tweets during 2016-05-26 to 2016-06-01

In [20]:
clinton_filenames=glob.glob("TwitterData/* -at,HillaryClinton*")
clinton_filenames_table=[]
for fname in clinton_filenames:
    clinton_filenames_table.append(fname)
    clinton_filenames_table.extend(fname.split("]")[1].split(".")[:3])
clinton_filenames_table=np.array(clinton_filenames_table)
clinton_filenames_table=clinton_filenames_table.reshape(len(clinton_filenames_table)//4,4)
clinton_filenames_table=pd.DataFrame(clinton_filenames_table)
clinton_filenames_table.columns=["filename","year","month","day"]
clinton_filenames_table.day=clinton_filenames_table["day"].astype(int)
clinton_filenames_table.month=clinton_filenames_table["month"].astype(int)
clinton_filenames_table.year=clinton_filenames_table["year"].astype(int)
clinton_filenames_table=clinton_filenames_table[(clinton_filenames_table.day>=26) | (clinton_filenames_table.day==1)]
clinton_filenames_table=clinton_filenames_table.sort_values(["year","month","day"])
clinton_filenames_table

Unnamed: 0,filename,year,month,day
34,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,26
61,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,26
102,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,26
106,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,26
22,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,27
39,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,27
55,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,27
73,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,27
74,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,27
82,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,27


In [21]:
days=np.unique(clinton_filenames_table.day)
total_tweet_counts_2={}
hashtag_counts_2={}
clinton_at_opponent_counts_1={}
clinton_at_opponent_counts_2={}
for d in days:
    fnames=np.array(clinton_filenames_table[clinton_filenames_table.day==d].iloc[:,0])
    all_tweets=[]
    all_usernames=[]
    for f in fnames:
        with open(f) as file:
            file_texts=file.readlines()
            file_texts=file_texts[1:]
        tweets=[None]*len(file_texts)
        usernames=[None]*len(file_texts)
        for i,text in enumerate(file_texts):
            splitted_text=text.split("\",\"")
            tweets[i]=splitted_text[11]
            usernames[i]=splitted_text[6]
        all_tweets.extend(tweets)
        all_usernames.extend(usernames)
    tweets_data=np.array(all_usernames+all_tweets)
    tweets_data=tweets_data.reshape(2,len(all_tweets))
    tweets_data=tweets_data.T
    #remove duplicates
    tweets_data_df=pd.DataFrame(tweets_data)
    tweets_data_df=tweets_data_df.drop_duplicates()
    total_tweet_counts_2[str(d)]=tweets_data_df.shape[0]
    
    #NLP: tokenize; counting number of distinct hashtags
    tknzr = TweetTokenizer()
    hashtag_regex=re.compile("^#")
    all_hashtags=[]
    for i in range(tweets_data_df.shape[0]):
        tokens=tknzr.tokenize(tweets_data_df.iloc[i,1])
        for token in tokens:
            if hashtag_regex.findall(token)==["#"] and token!="#": #it was found that '#' is not acutally a hashtag, so it should be removed
                all_hashtags.append(token)
    hashtag_counts_2[str(d)]=Counter(all_hashtags[:])
    
    tknzr = TweetTokenizer()
    at_regex=re.compile("^@")
    all_at={}
    #count the number of tweets that contain both @realDonaldTrump and @ other candidates (particularly @HillaryClinton)
    for i in range(tweets_data_df.shape[0]):
        tokens=tknzr.tokenize(tweets_data_df.iloc[i,1])
        for token in tokens:
            if at_regex.findall(token)==["@"] and token!="@" and token.lower()!="@hillaryclinton": #it was found that '#' is not acutally a hashtag, so it should be removed
                try:
                    all_at[i].append(token)
                except KeyError:
                    all_at[i]=[token]
    at_opponent_counts=0
    for k in all_at.keys():
        for at in all_at[k]:
            if at.lower() in ['@berniesanders', '@realdonaldtrump','@johnkasich','@senrubiopress','@sensanders','@tedcruz']:
                at_opponent_counts+=1
                break
    clinton_at_opponent_counts_1[d]=at_opponent_counts
    at_opponent_counts=0
    for k in all_at.keys():
        for at in all_at[k]:
            if at.lower() == '@realdonaldtrump':
                at_opponent_counts+=1
                break
    clinton_at_opponent_counts_2[d]=at_opponent_counts

In [50]:
total_tweet_counts_2 #daily total tweet counts

{'1': 75397,
 '26': 43815,
 '27': 91239,
 '28': 67673,
 '29': 51439,
 '30': 52673,
 '31': 53127}

In [51]:
clinton_at_opponent_counts_1

{1: 15415, 26: 10498, 27: 16273, 28: 14071, 29: 9476, 30: 12735, 31: 8497}

In [52]:
clinton_at_opponent_counts_2

{1: 10530, 26: 7294, 27: 11653, 28: 10013, 29: 6328, 30: 9264, 31: 4923}

In [54]:
sum(clinton_at_opponent_counts_1.values())/sum(total_tweet_counts_2.values())*100
#20.0% of the tweets have both @HillaryClinton and @ other candiates

19.975284992064093

In [55]:
sum(clinton_at_opponent_counts_2.values())/sum(total_tweet_counts_2.values())*100
#13.8% of the tweets have both @HillaryClinton and @realDonaldTrump

13.782751405149266

In [56]:
total_hashtag_counts_2=Counter([])
for v in hashtag_counts_2.values():
    total_hashtag_counts_2=total_hashtag_counts_2+v
print(total_hashtag_counts_2.most_common(50))
sum_of_hashtag_counts_2=0
for v in total_hashtag_counts_2.values():
    sum_of_hashtag_counts_2+=v
print(sum_of_hashtag_counts_2) #sum of all hashtag counts

#top 50 hashtags
sum_of_hashtag_counts_top_2=0
for v in total_hashtag_counts_2.most_common(50):
    sum_of_hashtag_counts_top_2+=v[1]
print(sum_of_hashtag_counts_top_2/sum_of_hashtag_counts_2)

#top 20 hashtags
sum_of_hashtag_counts_top_2=0
for v in total_hashtag_counts_2.most_common(20):
    sum_of_hashtag_counts_top_2+=v[1]
print(sum_of_hashtag_counts_top_2/sum_of_hashtag_counts_2)

#top 100 hashtags
sum_of_hashtag_counts_top_2=0
for v in total_hashtag_counts_2.most_common(100):
    sum_of_hashtag_counts_top_2+=v[1]
print(sum_of_hashtag_counts_top_2/sum_of_hashtag_counts_2)

[('#ImWithHer', 24261), ('#FeelTheBern', 7348), ('#Trump2016', 6248), ('#NeverHillary', 6153), ('#HillaryClinton', 4674), ('#DropOutBernie', 4491), ('#MAGA', 4207), ('#CrookedHillary', 3966), ('#Hillary2016', 3927), ('#DropOutHillary', 3915), ('#BernieTrumpDebate', 3880), ('#ShesWithUs', 3757), ('#CAPrimary', 3480), ('#BernieSanders', 3194), ('#BernieOrBust', 3061), ('#HRCIsOurNominee', 2898), ('#NEVERTRUMP', 2737), ('#LOVETRUMPSHATE', 2714), ('#SallyRideDay', 2664), ('#Trump', 2540), ('#2A', 2480), ('#GOP', 2443), ('#CaliforniaPrimary', 2440), ('#ChickenTrump', 2431), ('#Benghazi', 2317), ('#GOPdebate', 2317), ('#TBT', 2273), ('#Dems', 2170), ('#Republicans', 2161), ('#Lovetrumpshate', 2118), ('#MOMS', 2117), ('#GUNS', 1663), ('#NRA', 1655), ('#moneygrubber', 1461), ('#SelfServingSa', 1392), ('#tcot', 1302), ('#imwithher', 1213), ('#NeverTrump', 1202), ('#MakeAmericaGreatAgain', 1104), ('#Hillary', 1031), ('#HillaryForPrison2016', 1030), ('#ChickenHillary', 949), ('#FeeltheBern', 881)

In [15]:
clinton_filenames=glob.glob("TwitterData/* -at,HillaryClinton*")
clinton_filenames_table=[]
for fname in clinton_filenames:
    clinton_filenames_table.append(fname)
    clinton_filenames_table.extend(fname.split("]")[1].split(".")[:3])
clinton_filenames_table=np.array(clinton_filenames_table)
clinton_filenames_table=clinton_filenames_table.reshape(len(clinton_filenames_table)//4,4)
clinton_filenames_table=pd.DataFrame(clinton_filenames_table)
clinton_filenames_table.columns=["filename","year","month","day"]
clinton_filenames_table.day=clinton_filenames_table["day"].astype(int)
clinton_filenames_table.month=clinton_filenames_table["month"].astype(int)
clinton_filenames_table.year=clinton_filenames_table["year"].astype(int)
clinton_filenames_table=clinton_filenames_table.sort_values(["year","month","day"])
clinton_filenames_table

Unnamed: 0,filename,year,month,day
13,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,19
60,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,19
69,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,19
92,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,19
2,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,20
3,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,20
8,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,20
10,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,20
15,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,20
16,"TwitterData/[ BATCH -at,HillaryClinton-STATUS]...",2016,5,20


In [16]:
np.unique(clinton_filenames_table.day)

array([ 1, 19, 20, 24, 25, 26, 27, 28, 29, 30, 31])

In [17]:
sanders_filenames_1=glob.glob("TwitterData/* -at,BernieSanders*")
sanders_filenames_1=glob.glob("TwitterData/* -at,SenSanders*")
sanders_filenames_table=[]
for fname in sanders_filenames:
    sanders_filenames_table.append(fname)
    sanders_filenames_table.extend(fname.split("]")[1].split(".")[:3])
sanders_filenames_table=np.array(sanders_filenames_table)
sanders_filenames_table=sanders_filenames_table.reshape(len(sanders_filenames_table)//4,4)
sanders_filenames_table=pd.DataFrame(sanders_filenames_table)
sanders_filenames_table.columns=["filename","year","month","day"]
sanders_filenames_table.day=sanders_filenames_table["day"].astype(int)
sanders_filenames_table.month=sanders_filenames_table["month"].astype(int)
sanders_filenames_table.year=sanders_filenames_table["year"].astype(int)
sanders_filenames_table=sanders_filenames_table.sort_values(["year","month","day"])
sanders_filenames_table

Unnamed: 0,filename,year,month,day
47,"TwitterData/[ BATCH -at,BernieSanders-STATUS]2...",2016,5,19
53,"TwitterData/[ BATCH -at,BernieSanders-STATUS]2...",2016,5,19
67,"TwitterData/[ BATCH -at,BernieSanders-STATUS]2...",2016,5,19
69,"TwitterData/[ BATCH -at,BernieSanders-STATUS]2...",2016,5,19
91,"TwitterData/[ BATCH -at,BernieSanders-STATUS]2...",2016,5,19
96,"TwitterData/[ BATCH -at,BernieSanders-STATUS]2...",2016,5,19
98,"TwitterData/[ BATCH -at,BernieSanders-STATUS]2...",2016,5,19
113,"TwitterData/[ BATCH -at,SenSanders-STATUS]2016...",2016,5,19
140,"TwitterData/[ BATCH -at,SenSanders-STATUS]2016...",2016,5,19
146,"TwitterData/[ BATCH -at,SenSanders-STATUS]2016...",2016,5,19


In [18]:
np.unique(sanders_filenames_table.day)

array([ 1, 19, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31])