### Import Data

In [1]:
!pip install swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swifter
  Downloading swifter-1.3.4.tar.gz (830 kB)
[K     |████████████████████████████████| 830 kB 6.7 MB/s 
Collecting psutil>=5.6.6
  Downloading psutil-5.9.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 27.3 MB/s 
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 14.1 MB/s 
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.3.4-py3-none-any.whl size=16322 sha256=6b33cec9a1265a43ad83c76c6073e0d606f6915ea1bef0a658f4fad66843e6fb
  Stored in directory: /root/.cache/pip/wheels/29/a7/0e/3a8f17ac69d759e1e93647114bc9bdc95957e5b0cbfd405205
Successfully built swifter
Installing collected 

In [1]:
import pandas as pd 
import re
from collections import Counter
from pathlib import Path
import swifter

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder (under the Drive mount point) that holds the data files
basepath='/content/drive/MyDrive/datasets/data'

In [4]:
# Wrap the base path in a pathlib.Path so child paths can be built with `/`
data_folder = Path(basepath)

In [5]:
# Print the data folder to confirm it points at the expected location
print(data_folder)

/content/drive/MyDrive/datasets/data


In [6]:
# Create the path to the dataset.
# The `/` operator on a Path joins a folder with a file or subfolder name.
file = data_folder/"trump_tweets.csv"

In [7]:
# Load the tweets CSV into a DataFrame using pd.read_csv
df = pd.read_csv(file)

In [8]:
# Check the top five rows of the dataset (DataFrame.head shows 5 rows by default)
df.head()

Unnamed: 0,text,Username,Timestamp
0,RT @CaslerNoel: Trump didn’t order all those f...,MwhalenCy,Sun Jul 11 21:57:37 +0000 2021
1,RT @bellausa17: @POTUS Biden pandering again a...,java1836,Sun Jul 11 21:57:37 +0000 2021
2,"RT @realLizUSA: ""There are now two sets of law...",EricDrevon,Sun Jul 11 21:57:37 +0000 2021
3,RT @Blklivesmatter: Biden is currently sending...,kacekochel,Sun Jul 11 21:57:38 +0000 2021
4,💯 true!,frank_venezia,Sun Jul 11 21:57:38 +0000 2021


# Q1 Extracting hashtags from the tweets

In [9]:
# Eyeball a few raw tweets: positions 140 through 144 (the slice end is exclusive)
for tweet in df['text'].iloc[140:145]:
    print(tweet)

#TRUMP 47*
#AMERICA 1ST #MAGA #CPAC* THERES NOTHING LIKE IT_SO #FREEDOM LOVING AMERICANS_CAN GET AWAY FROM THE BULL… https://t.co/NCxINeoqvd
RT @TeaPainUSA: Trump will continue to divide the GOP until it's only him and Don Jr. left. 

https://t.co/ONPWBrtUbi
RT @kelly2277: 🔥Trump’s Incompetent Team Waited For Wisconsin Election Updates And Blamed A “Delay” On A Conspiracy Theory BUT The Idiots H…
RT @RSBNetwork: President Trump roasting Hunter Biden!!! https://t.co/291BoMKDXo
RT @ShutUpAmanda: They already chose Trump. https://t.co/dXtMQR0EpG


In [14]:
# Regular expression that extracts hashtags.
# A hashtag is one or more '#' followed by word characters. Apostrophes and
# hyphens are allowed inside the tag, but the tag must start and end on a word
# character. The trailing group is optional so that one-character tags such as
# "#a" are matched too (requiring two separate \w+ runs silently dropped them).
search_hashtags = re.compile(r"#+\w+(?:[\w'\-]*\w)?")

In [15]:
# Build the 'hashtags' column: each tweet's text is scanned with the compiled
# pattern and the list of all matches (possibly empty) is stored for that row.
df['hashtags'] = df['text'].swifter.apply(lambda tweet: search_hashtags.findall(tweet))

Pandas Apply:   0%|          | 0/200 [00:00<?, ?it/s]

In [16]:
# Show the extracted hashtags for the tweets inspected earlier (positions 140 through 144)
df['hashtags'].iloc[140:145]

140    [#TRUMP, #AMERICA, #MAGA, #CPAC, #FREEDOM]
141                                            []
142                                            []
143                                            []
144                                            []
Name: hashtags, dtype: object

In [17]:
# Drop the Username and Timestamp columns.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Username', 'Timestamp'], errors='ignore')

In [18]:
# Inspect the row labelled 140 to confirm the remaining columns and its hashtags
df.loc[140]

text        #TRUMP 47*\n#AMERICA 1ST #MAGA #CPAC* THERES N...
hashtags           [#TRUMP, #AMERICA, #MAGA, #CPAC, #FREEDOM]
Name: 140, dtype: object

# Q2: Removing URLs from tweets

An individual tweet's `text` may contain multiple URLs. Remove all URLs from the tweets.


In [20]:
# Create new column clean_text: the tweet text with URLs removed.
# The pattern matches "http:" or "https:" at a word boundary and then consumes
# everything up to the next whitespace or ellipsis character (U+2026, which
# marks truncated tweets). Compared with a fixed character whitelist this also
# removes query strings and fragments (?, =, &, %, #), and requiring the colon
# means ordinary words that merely start with "http" are left alone.
# A raw string is used so the regex escapes are not treated as (invalid)
# Python string escapes.
df['clean_text'] = df['text'].swifter.apply(lambda x: re.sub(r'\bhttps?:[^\s\u2026]*', '', x))

Pandas Apply:   0%|          | 0/200 [00:00<?, ?it/s]

In [22]:
# Spot-check the cleaned text at positions 140 through 149 to verify the URLs are gone
for cleaned in df['clean_text'].iloc[140:150]:
    print(cleaned)

#TRUMP 47*
#AMERICA 1ST #MAGA #CPAC* THERES NOTHING LIKE IT_SO #FREEDOM LOVING AMERICANS_CAN GET AWAY FROM THE BULL… 
RT @TeaPainUSA: Trump will continue to divide the GOP until it's only him and Don Jr. left. 


RT @kelly2277: 🔥Trump’s Incompetent Team Waited For Wisconsin Election Updates And Blamed A “Delay” On A Conspiracy Theory BUT The Idiots H…
RT @RSBNetwork: President Trump roasting Hunter Biden!!! 
RT @ShutUpAmanda: They already chose Trump. 
@ReadingJudith He’s another dangerous grifter and agitator allowed to flourish thanks to Trump &amp; the GOP.  He’s rotten snot.
@snarkiekimmie @ananavarro They hate communists and the dem party is overflowing with em. Duhhh. Yeah Trump was suc… 
RT @CNN: Trump doesn't have a strong case against Big Tech for deplatforming him. Private companies aren't required to provide him a platfo…
RT @kelly2277: 🔥Trump’s Incompetent Team Waited For Wisconsin Election Updates And Blamed A “Delay” On A Conspiracy Theory BUT The Idiots H…
RT @prchovanec

# Q3 Extract Top 10 Mentions and add mentions as new column

Many of the tweets mention people in the form *@username*. For example, see the following tweet:

RT @kelly2277: 🔥Trump’s Incompetent Team Waited For Wisconsin Election Updates

Here @kelly2277 is a mention. You need to extract the mentions from all the tweets and find the 10 most frequent ones.

In [23]:
# Create column 'mentions' holding the @mentions found in each tweet.
# Each row is the list of handles (without the leading '@'), or None when the
# tweet mentions nobody.
# The pattern is compiled once and written as a raw string so \w and \- are
# regex escapes rather than invalid Python string escapes. A trailing optional
# ':' is not needed: it sits outside the capture group and does not affect
# what findall returns.
mention_pattern = re.compile(r'@([\w\-]+)')
# findall returns [] when nothing matches; `or None` maps that empty list to
# None with a single regex pass per tweet.
df['mentions'] = df['text'].swifter.apply(lambda x: mention_pattern.findall(x) or None)

Pandas Apply:   0%|          | 0/200 [00:00<?, ?it/s]

In [24]:
# Display the new column to check the extracted handles
df['mentions']

0                                           [CaslerNoel]
1                                    [bellausa17, POTUS]
2                                           [realLizUSA]
3                                       [Blklivesmatter]
4                                                   None
                             ...                        
195    [Vesemirr, C_Stroop, Mr_JamesLandis, mattsheff...
196                       [JennaEllisEsq, GOPChairwoman]
197                                        [ElectionWiz]
198                        [chsbulldogs92, ericswalwell]
199                                     [RonFilipkowski]
Name: mentions, Length: 200, dtype: object

In [25]:
# Combine the per-tweet mention lists into a single flat list.
mentions = df['mentions']
mentions_combined = [
    handle
    for tweet_mentions in mentions
    # rows without any mention hold None; use an identity check rather than `!=`
    if tweet_mentions is not None
    for handle in tweet_mentions
]

In [26]:
# Tally how often each handle occurs, then keep the ten most frequent ones
mention_counts = Counter(mentions_combined)
top_mentions = mention_counts.most_common(10)

In [27]:
# Show the top mentions as (handle, count) pairs, most frequent first
top_mentions

[('glennkirschner2', 7),
 ('atrupar', 7),
 ('CaslerNoel', 6),
 ('realLizUSA', 6),
 ('TeaPainUSA', 6),
 ('SwainForSenate', 5),
 ('CPAC', 4),
 ('RSBNetwork', 4),
 ('Out5p0ken', 4),
 ('JennaEllisEsq', 4)]

# Q4 Count Words
Count the number of times the word 'trump' or 'Trump' appears in each tweet. Add this count as an additional feature to the data set.

In [28]:
# Count how many times the word 'trump' or 'Trump' occurs in each tweet.
# The \b anchors restrict matches to whole words, so longer words that merely
# contain those letters (e.g. "trumpet", "Trumpism") are not counted, while
# forms such as "#Trump" or "Trump's" still are.
df['n_trumps'] = df['text'].swifter.apply(lambda x: len(re.findall(r'\b[Tt]rump\b', x)))

Pandas Apply:   0%|          | 0/200 [00:00<?, ?it/s]

In [29]:
# Display the per-tweet counts
df['n_trumps'] 

0      1
1      0
2      0
3      1
4      0
      ..
195    0
196    0
197    0
198    1
199    1
Name: n_trumps, Length: 200, dtype: int64

In [30]:
# Sum the per-tweet counts to get the total across all tweets in the DataFrame
total_trump_mentions = df['n_trumps'].sum()
total_trump_mentions

127