In [21]:
# Importing data wrangling libraries
import pandas as pd
import data_cleaning
# Widen the size of each cell
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [51]:
# Loading the mined data
moi_1 = pd.read_csv('20200215_152805_moi_tweets.csv')
moi_2 = pd.read_csv('20200214_160524_moi_tweets.csv')
moi_3 = pd.read_csv('20200214_214556_moi_tweets.csv')
moi_4 = pd.read_csv('20200215_004925_moi_tweets.csv')
moi_5 = pd.read_csv('20200215_110135_moi_tweets.csv')

# Concatenating the scraped tweets into one big dataset
moi = pd.concat([moi_1, moi_2, moi_3, moi_4, moi_5], axis=0)
moi.head()

Unnamed: 0,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags
0,sheinnerstands,mute art,water,72,566,3826,2019-03-23 21:06:03,2020-02-06 23:59:32,0,"Some will psychoanalyse, call on deities and s...",[]
1,konicsid,#바비 : 𝐥𝐨𝐥 𝐝𝐚𝐭𝐬 𝐦𝐲 𝐛𝐞𝐥𝐨𝐯𝐞𝐝 𝐛𝐫𝐨𝐭𝐡𝐞𝐫,bobby love bot,332,858,58539,2018-08-01 10:36:31,2020-02-06 23:59:16,0,happy birthday moi lov 💕💞💓💗💝💘💖 @331adore https...,[]
2,cliffblack15,,,0,1,3357,2019-10-31 19:32:29,2020-02-06 23:59:16,0,MY UNDERSTN IS THAT TIM AND ARIA WRE BOTH IN O...,[]
3,Maurice_ireri,"Rockstar engineer,inventor,innovator &entrepre...",,674,1173,3716,2012-02-01 17:24:43,2020-02-06 23:59:15,40,Moi's unexpected turn up for the murdered Robe...,[]
4,cest_moi_kayle,big cowboy DUMMY!!!,,163,68,2337,2016-11-26 02:42:21,2020-02-06 23:58:50,4,listening to the love club by lorde on repeat ...,[]


In [53]:
print('Size of the dataset is ', moi.shape[0], 'observations')

Size of the dataset is  90000 observations


In [54]:
# Initial EDA check of the dataframe to check for percentage of observations that are null.
data_cleaning.initial_eda_checks(moi)

Total and percentage NaN
           Total   Percent
location  25280  0.280889
acctdesc  12166  0.135178


> Location and Account description have null values as some twitter users chose not to disclose that information.

In [55]:
# Dataframe info
moi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90000 entries, 0 to 14999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   username        90000 non-null  object
 1   acctdesc        77834 non-null  object
 2   location        64720 non-null  object
 3   following       90000 non-null  int64 
 4   followers       90000 non-null  int64 
 5   totaltweets     90000 non-null  int64 
 6   usercreatedts   90000 non-null  object
 7   tweetcreatedts  90000 non-null  object
 8   retweetcount    90000 non-null  int64 
 9   text            90000 non-null  object
 10  hashtags        90000 non-null  object
dtypes: int64(4), object(7)
memory usage: 8.2+ MB


In [56]:
# Converting some of the columns into their relevant data types for ease of data wrangling
moi['usercreatedts'] = pd.to_datetime(moi['usercreatedts'])
moi['tweetcreatedts'] = pd.to_datetime(moi['tweetcreatedts'])
moi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90000 entries, 0 to 14999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   username        90000 non-null  object        
 1   acctdesc        77834 non-null  object        
 2   location        64720 non-null  object        
 3   following       90000 non-null  int64         
 4   followers       90000 non-null  int64         
 5   totaltweets     90000 non-null  int64         
 6   usercreatedts   90000 non-null  datetime64[ns]
 7   tweetcreatedts  90000 non-null  datetime64[ns]
 8   retweetcount    90000 non-null  int64         
 9   text            90000 non-null  object        
 10  hashtags        90000 non-null  object        
dtypes: datetime64[ns](2), int64(4), object(5)
memory usage: 8.2+ MB


In [57]:
# Finding the range of the time tweets were tweeted
range = moi.tweetcreatedts.max() - moi.tweetcreatedts.min()
range

Timedelta('7 days 16:13:08')

In [58]:
moi.tweetcreatedts.min() 

Timestamp('2020-02-06 20:31:24')

In [59]:
moi.tweetcreatedts.max()

Timestamp('2020-02-14 12:44:32')

> The tweets collected were from the 6th to the 14th of February

## Checking for duplicated entries

Since we performed scraping close to each other, it's possible the same tweets were scaped more than once.

In [61]:
# Checking for number of unique users in the dataset.
moi.username.value_counts()
# Some users are more proactive than others


Moi_Diddy         750
YBTunz_MOI        658
as_moi            461
moi_ho            406
cyprianongeri1    319
                 ... 
sadiq_Qureshi_      1
Halima_lami         1
OlajumokeOom        1
ManLikeAkoh         1
Asamoh_             1
Name: username, Length: 8782, dtype: int64

In [62]:
# It's normal to have a repeated username as a user can tweet multiple times throughout the period.
moi[moi['username']=='Moi_Diddy']

Unnamed: 0,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags
2211,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-06 20:41:27,0,Yo this how ppl was running into the pole down...,[]
2341,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-06 20:37:42,0,@BRWN_EYEGURL Really be b*tchin &amp; question...,[]
2382,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-06 20:36:22,0,@AintShit_CoolJ It’s where u said earlier?,[]
2400,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-06 20:35:40,0,@AintShit_CoolJ I can def see this happening🤣🤣...,[]
4710,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-06 20:41:27,0,Yo this how ppl was running into the pole down...,[]
...,...,...,...,...,...,...,...,...,...,...,...
12542,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-07 23:47:02,0,@jubaby2011 @Tre_Colion He must be only showin...,[]
12601,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-07 23:23:01,0,@Hfrinks He going fck u up watch🤣🤣...,[]
13961,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-07 19:04:37,0,@ThiqChic It really is tho...,[]
14518,Moi_Diddy,Check out {Slick 2} a short film im in starrin...,,735,2687,179705,2009-06-03 05:52:43,2020-02-07 18:16:38,0,@Humble_CG See nah...,[]


In [63]:
# Dropping the duplicated rows
print('Initial size of the dataset before dropping duplicated rows is:', moi.shape)
moi.drop_duplicates(keep=False, inplace=True)
print('Current size of dataset after dropping duplicated rows, if any, is: ', moi.shape)

Initial size of the dataset before dropping duplicated rows is: (90000, 11)
Current size of dataset after dropping duplicated rows, if any, is:  (23151, 11)


In [64]:
# Doing the EDA checks again
data_cleaning.initial_eda_checks(moi)

Total and percentage NaN
           Total   Percent
location   5331  0.230271
acctdesc   2253  0.097318


In [67]:
moi.location.value_counts()

Nairobi, Kenya        3550
Kenya                 1608
Nairobi                748
Africa.                268
Mombasa, Kenya         263
                      ... 
Kakamega Kenya           1
nairobi, Kenya           1
some place higher        1
72h/Y00RIM_              1
Aberdeen, Scotland       1
Name: location, Length: 1738, dtype: int64

## Data Cleaning

In [71]:
# Dropping tweets whose user didn't disclose their location
moi = moi.dropna(axis=0, subset=['location'])
data_cleaning.initial_eda_checks(moi)

Total and percentage NaN
           Total   Percent
acctdesc    924  0.051852


## Exploratory Data Analysis

In [72]:
moi.head()

Unnamed: 0,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags
1,konicsid,#바비 : 𝐥𝐨𝐥 𝐝𝐚𝐭𝐬 𝐦𝐲 𝐛𝐞𝐥𝐨𝐯𝐞𝐝 𝐛𝐫𝐨𝐭𝐡𝐞𝐫,bobby love bot,332,858,58539,2018-08-01 10:36:31,2020-02-06 23:59:16,0,happy birthday moi lov 💕💞💓💗💝💘💖 @331adore https...,[]
11,EcoInternetDrGB,#Environment #Climate 24/7 by Dr. Glen Barry w...,"New York, NY",26718,34422,928057,2013-01-11 14:43:30,2020-02-06 23:58:03,0,Obama chez moi! The invasion of metropolitan F...,[]
12,HJankhel,‏‏‏لر او بر یو افغان,"Paris, France",191,105,3012,2018-03-11 11:56:29,2020-02-06 23:57:49,5,Released Geelaman Wazir #DontDeportGeelaman @U...,"[{'text': 'DontDeportGeelaman', 'indices': [42..."
15,Husmo,An ordinary mwananchi who is lucky to have the...,Nairobi,632,1261,60217,2009-10-11 22:14:54,2020-02-06 23:56:56,91,There's something the reactions to Moi's death...,[]
18,Acjuice_,Dj Juice🍊 || Don’t get lost in the sauce,961 🇱🇧| 514 | Nawf,1205,1668,14454,2018-06-14 11:11:40,2020-02-06 23:55:20,1,Entk man moi ma date ma rancer so hope yall ha...,[]


In [73]:
moi.tail()

Unnamed: 0,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags
14985,OchiengOpinya,"A staunch Catholic Ragen Chapel, Dad, Husband ...","Mombasa, Kenya",3561,1355,6491,2017-11-02 07:40:35,2020-02-07 17:33:47,0,@2Carolzuena @ledamalekina @omwanza We are sti...,[]
14988,ishaqearly,Humantarian || Single || Tweets are mine || RT...,Motherland,3759,2367,7550,2019-05-03 10:49:43,2020-02-07 17:33:22,0,Thread 👇\n\nDid u know that Jomo Kenyatta rule...,[]
14993,Tuko_co_ke,Follow TUKO for reliable breaking news on Poli...,"Nairobi, Kenya",2764,83161,63846,2015-04-02 13:49:55,2020-02-07 17:32:55,0,Tim died days apart with the demise of former ...,"[{'text': 'tukonews', 'indices': [219, 228]}, ..."
14994,MKhanDawar4,Ptm beest,"Jazan, Kingdom of Saudi Arabia",4625,5416,84472,2019-05-07 13:00:24,2020-02-07 17:32:51,47,Dear @moi_bahrain just see the love &amp; resp...,"[{'text': 'DontDeportGeelaman', 'indices': [12..."
14996,Newsmongerng,Nigeria's Top News Website,"Lagos, Nigeria",242,153,32355,2012-03-24 20:57:10,2020-02-07 17:32:40,0,Kenya to hold state funeral for ‘iron fist’ ex...,[]


In [74]:
# Location of where the tweets are from
moi.location.value_counts()

Nairobi, Kenya                  3550
Kenya                           1608
Nairobi                          748
Africa.                          268
Mombasa, Kenya                   263
                                ... 
72h/Y00RIM_                        1
South West, England                1
c'est yalent btw hein              1
Bangtan | Stray Kids | Ateez       1
134340                             1
Name: location, Length: 1738, dtype: int64

In [84]:
# From the EDA, an observation was made that some of the tweets were from locations not in Kenya
# Selecting a slice of dataframe from locations that appears more than 5 times in the dataframe.
value_counts = moi['location'].value_counts()
to_remove = value_counts[value_counts <= 5].index
moi_remove = moi[moi.location.isin(to_remove)]
moi_remove.head()

Unnamed: 0,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags
1,konicsid,#바비 : 𝐥𝐨𝐥 𝐝𝐚𝐭𝐬 𝐦𝐲 𝐛𝐞𝐥𝐨𝐯𝐞𝐝 𝐛𝐫𝐨𝐭𝐡𝐞𝐫,bobby love bot,332,858,58539,2018-08-01 10:36:31,2020-02-06 23:59:16,0,happy birthday moi lov 💕💞💓💗💝💘💖 @331adore https...,[]
26,Genn_Scog,Coming together is a beginning; keeping togeth...,⚡🍁⚡🍁⚡Canada⚡🍁⚡🍁⚡,30249,39829,120033,2014-02-18 17:37:28,2020-02-06 23:53:08,29,Strongly condemn the arrest of #HumanRightsAct...,"[{'text': 'HumanRightsActivists', 'indices': [..."
32,QaziZada0093,‏‏‏‏‏🇦🇫\nلر او بر یو افغانPTM,"Brussels, Belgium",754,142,62,2019-09-03 09:54:22,2020-02-06 23:52:13,11,Bahrain government is directly responsible for...,"[{'text': 'DontDeportGelaaman', 'indices': [20..."
41,Azazel_Gadriel,Témoins du mal | Attends patiemment la nécrose...,La cour des miracles,442,1889,319611,2010-11-13 03:37:10,2020-02-06 23:50:05,3,Popsmoke: Meet The Woo 2 \n\nMoi: https://t.co...,[]
47,GURAND0,ㅤㅤㅤㅤㅤㅤㅤ\n ʺ One Kingdom's fall is another King...,#SHINDO : I'm quite ( Grand ),123,60,9292,2019-09-02 11:56:23,2020-02-06 23:49:33,0,"@ZUKCHlNl enjoy suffering, moi friend",[]


> Due to the nature of the scraping search word used, we seem to have collected tweets with the word 'moi' in them as shown in the above slices

In [85]:
# Dropping the rows of data whose locations occured less than 2 times
moi = moi[~moi.location.isin(to_remove)]

In [86]:
moi.shape

(15260, 11)

In [88]:
moi.head()

Unnamed: 0,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags
11,EcoInternetDrGB,#Environment #Climate 24/7 by Dr. Glen Barry w...,"New York, NY",26718,34422,928057,2013-01-11 14:43:30,2020-02-06 23:58:03,0,Obama chez moi! The invasion of metropolitan F...,[]
12,HJankhel,‏‏‏لر او بر یو افغان,"Paris, France",191,105,3012,2018-03-11 11:56:29,2020-02-06 23:57:49,5,Released Geelaman Wazir #DontDeportGeelaman @U...,"[{'text': 'DontDeportGeelaman', 'indices': [42..."
15,Husmo,An ordinary mwananchi who is lucky to have the...,Nairobi,632,1261,60217,2009-10-11 22:14:54,2020-02-06 23:56:56,91,There's something the reactions to Moi's death...,[]
34,MrLewisVuitton,🇰🇪 #ArsenalFCObserver RT ≠ Endorsement......If...,✈️,614,2561,393091,2011-02-06 19:44:11,2020-02-06 23:51:29,358,"Now for Deja Vu to text us \n""Join us for our ...",[]
74,RedXRISTOS,Loyalty To Country ALWAYS.\nLoyalty To Governm...,Kenya~Vatican~Anfield,1966,2025,49242,2009-02-25 11:11:47,2020-02-06 23:43:04,0,"@DonaldBKipkorir ""Raila Odinga &amp; his famil...",[]
