# Table of Contents
 1. About the Dataset
 2. Regex for Cleaning Text Data
 3. Regex for Text Data Extraction
 4. Regex Challenge


## 1. About the Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/Courses/GettingStartedWithNPLcourseFromAnalysticsVidhya/Section4TextProcessingHandlingtextdata/Regular_Expressions_in_Python.zip

Archive:  /content/drive/MyDrive/Courses/GettingStartedWithNPLcourseFromAnalysticsVidhya/Section4TextProcessingHandlingtextdata/Regular_Expressions_in_Python.zip
   creating: Regular_Expressions_in_Python/
  inflating: Regular_Expressions_in_Python/45RegularExpressionsinPythonPart2.ipynb  
   creating: Regular_Expressions_in_Python/RegEx_on_real_word_dataset/
  inflating: Regular_Expressions_in_Python/RegEx_on_real_word_dataset/4.6 Using RegEx on Real-World Dataset.ipynb  
  inflating: Regular_Expressions_in_Python/RegEx_on_real_word_dataset/tweets.csv  


In [3]:
import pandas as pd

#Loading the dataset
df = pd.read_csv("Regular_Expressions_in_Python/RegEx_on_real_word_dataset/tweets.csv", encoding = "ISO-8859-1")

# Printing first 5 rows
df.head()

Unnamed: 0.1,Unnamed: 0,X,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,1,1,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False
1,2,2,RT @Hemant_80: Did you vote on #Demonetization...,False,0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66,True,False
2,3,3,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False
3,4,4,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338,True,False
4,5,5,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False


In [4]:
# Looking at some Tweets
for index, tweet in enumerate(df["text"][10:15]):
    print(index+1,".",tweet)

1 . Many opposition leaders are with @narendramodi on the #Demonetization 
And respect their decision,but support opposition just b'coz of party
2 . RT @Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r
3 . @Jaggesh2 Bharat band on 28??<ed><U+00A0><U+00BD><ed><U+00B8><U+0082>Those who  are protesting #demonetization  are all different party leaders.
4 . RT @Atheist_Krishna: The effect of #Demonetization !!
. https://t.co/A8of7zh2f5
5 . RT @sona2905: When I explained #Demonetization to myself and tried to put it down in my words which are not laced with any heavy technical


## 2. Regex for Cleaning Text Data

In [5]:
import re

### a. Removing `RT`

In [6]:
# Removing RT from a single Tweet
text = "RT @Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r"
clean_text = re.sub('RT ','', text)

print("Text before:\n", text)
print("Text after:\n", clean_text)

Text before:
 RT @Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r
Text after:
 @Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r


In [7]:
# Tweets before removal
df['text'].head()

0    RT @rssurjewala: Critical question: Was PayTM ...
1    RT @Hemant_80: Did you vote on #Demonetization...
2    RT @roshankar: Former FinSec, RBI Dy Governor,...
3    RT @ANI_news: Gurugram (Haryana): Post office ...
4    RT @satishacharya: Reddy Wedding! @mail_today ...
Name: text, dtype: object

In [8]:
# Removing RT from all the tweets
df['text']=df['text'].apply(lambda x: re.sub('RT ','',x))

In [9]:
# Tweets after removal
df['text'].head()

0    @rssurjewala: Critical question: Was PayTM inf...
1    @Hemant_80: Did you vote on #Demonetization on...
2    @roshankar: Former FinSec, RBI Dy Governor, CB...
3    @ANI_news: Gurugram (Haryana): Post office emp...
4    @satishacharya: Reddy Wedding! @mail_today car...
Name: text, dtype: object

### b. Removing `<U+..>` like symbols

In [10]:
# Removing <U+..> like symbols from a single tweet
text = "@Jaggesh2 Bharat band on 28??<ed><U+00A0><U+00BD><ed><U+00B8><U+0082>Those who  are protesting #demonetization  are all different party leaders"
clean_text = re.sub('<U\+[A-Z0-9]+>','', text)

print("Text before:\n", text)
print("Text after:\n", clean_text)

Text before:
 @Jaggesh2 Bharat band on 28??<ed><U+00A0><U+00BD><ed><U+00B8><U+0082>Those who  are protesting #demonetization  are all different party leaders
Text after:
 @Jaggesh2 Bharat band on 28??<ed><ed>Those who  are protesting #demonetization  are all different party leaders


**Note** that although we have gotten rid of majority of symbols, `<ed>` is still present. I leave this as an exercise for you to try out.

In [11]:
# Removing <U+..> like symbols from all the tweets
df['text']=df['text'].apply(lambda x: re.sub('<U\+[A-Z0-9]+>', '', x))

### c. Fixing the `&` and `&amp;`

In [12]:
# Replacing &amp with & in a single tweet
text = "RT @harshkkapoor: #DeMonetization survey results after 24 hours 5Lacs opinions Amazing response &amp; Commitment in fight against Blackmoney"
clean_text = re.sub('&amp;','&', text)

print("Text before:\n", text)
print("Text after:\n", clean_text)

Text before:
 RT @harshkkapoor: #DeMonetization survey results after 24 hours 5Lacs opinions Amazing response &amp; Commitment in fight against Blackmoney
Text after:
 RT @harshkkapoor: #DeMonetization survey results after 24 hours 5Lacs opinions Amazing response & Commitment in fight against Blackmoney


In [13]:
# Replacing &amp with & in all the tweets
df['text']=df['text'].apply(lambda x: re.sub('&amp', '&', x))

## 3. Regex for Text Data Extraction
### a. Extracting platform type of tweets

In [14]:
# Getting number of tweets per platform type
platform_count = df["statusSource"].value_counts()

In [15]:
platform_count

statusSource
<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>      7642
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                        2548
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>        2093
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>        492
<a href="https://mobile.twitter.com" rel="nofollow">Twitter Lite</a>                       263
                                                                                          ... 
<a href="http://pnllg.com" rel="nofollow">PNLLG </a>                                         1
<a href="http://www.toi.in" rel="nofollow">cmssocialservice</a>                              1
<a href="http://sites.google.com/site/yorufukurou/" rel="nofollow">YoruFukurou</a>           1
<a href="https://panel.socialpilot.co/" rel="nofollow">SocialPilot.co</a>                    1
<a href="https://twitter.com/download

In [16]:
#List platforms that have more than 100 tweets
top_platforms = platform_count.loc[platform_count>100]
top_platforms

statusSource
<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>    7642
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                      2548
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>      2093
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      492
<a href="https://mobile.twitter.com" rel="nofollow">Twitter Lite</a>                     263
<a href="https://mobile.twitter.com" rel="nofollow">Mobile Web (M5)</a>                  178
<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>                    167
<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>        165
<a href="http://www.twitter.com" rel="nofollow">Twitter for Windows Phone</a>            139
<a href="http://onlywire.com/" rel="nofollow">OnlyWire / Official App</a>                136
<a href="http://www.twitter.com" rel="nofollow">Twitter f

In [17]:
def platform_type(x):
    ser = re.search( r"android|iphone|web|windows|mobile|google|facebook|ipad|tweetdeck|onlywire", x, re.IGNORECASE)
    if ser:
        return ser.group()
    else:
        return None

#reset index of the series
top_platforms = top_platforms.reset_index()["index"]

#extract platform types
top_platforms.apply(lambda x: platform_type(x))

KeyError: 'index'

### b. Extracting hashtags from the tweets

In [18]:
# Extract first hashtag from a tweet
text = "RT @Atheist_Krishna: The effect of #Demonetization !!\r\n. https://t.co/A8of7zh2f5"
hashtag = re.search('#\w+', text)

print("Tweet:\n", text)
print("Hashtag:\n", hashtag.group())

Tweet:
 RT @Atheist_Krishna: The effect of #Demonetization !!
. https://t.co/A8of7zh2f5
Hashtag:
 #Demonetization


In [19]:
# Extract multiple hastags from a tweet
text = """RT @kapil_kausik: #Doltiwal I mean #JaiChandKejriwal is "hurt" by #Demonetization as the same has rendered USELESS <ed><U+00A0><U+00BD><ed><U+00B1><U+0089> "acquired funds" No wo"""
hashtags = re.findall('#\w+', text)

print("Tweet:\n", text)
print("Hashtag:\n", hashtags)

Tweet:
 RT @kapil_kausik: #Doltiwal I mean #JaiChandKejriwal is "hurt" by #Demonetization as the same has rendered USELESS <ed><U+00A0><U+00BD><ed><U+00B1><U+0089> "acquired funds" No wo
Hashtag:
 ['#Doltiwal', '#JaiChandKejriwal', '#Demonetization']


In [20]:
df['hashtags']=df['text'].apply(lambda x: re.findall('#\w+', x))

In [21]:
df[['text','hashtags']].head()

Unnamed: 0,text,hashtags
0,@rssurjewala: Critical question: Was PayTM inf...,[#Demonetization]
1,@Hemant_80: Did you vote on #Demonetization on...,[#Demonetization]
2,"@roshankar: Former FinSec, RBI Dy Governor, CB...",[#Demonetization]
3,@ANI_news: Gurugram (Haryana): Post office emp...,[#demonetization]
4,@satishacharya: Reddy Wedding! @mail_today car...,"[#demonetization, #ReddyWedding]"


## 4. Regex Challenge

Now that you have learned all the concepts regarding regex and have also seen it in action, it's time for you to utilize that to solve a challenge all by yourself. Here are some of the tasks that you have to do -

# Assigment 1

**Difficulty - Easy**

There are multiple URLs present in individual tweet's `text` and they don't neccessarily provide useful information so we can get rid of them. For example -  

*@Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r*


We can very well remove the URL as it isn't providing much useful information.


In [44]:
# on index 3 (row 4) there is a link ......
df['text'][3]

'@ANI_news: Gurugram (Haryana): Post office employees provide cash exchange to patients in hospitals #demonetization \x85'

In [35]:
# removing the link in a single tweet
text = "@Atheist_Krishna: BEFORE and AFTER Gandhi ji heard they are standing there against #Demonetization\r\n. https://t.co/9NheK63TPg"
# regex formula to get the link.
pattern = 'https:\/\/[\w.\/]+'
# delete the link
clean_text = re.sub(pattern,'', text)

print("Text before:\n", text)
print("Text after:\n", clean_text)

Text before:
 @Atheist_Krishna: BEFORE and AFTER Gandhi ji heard they are standing there against #Demonetization
. https://t.co/9NheK63TPg
Text after:
 @Atheist_Krishna: BEFORE and AFTER Gandhi ji heard they are standing there against #Demonetization
. 


### making a new column:
 links: move all the links one each row to that column

In [36]:
# make a new column, and its value is only the links on each row.
df['links']=df['text'].apply(lambda x: re.findall('https:\/\/[\w.\/]+', x))

In [38]:
# index 3 there is a link
df['text'][3]

'@ANI_news: Gurugram (Haryana): Post office employees provide cash exchange to patients in hospitals #demonetization https://t.co/uGMxUP9\x85'

as we can see on index 3 (row4) there is a link

In [37]:
# the same link you ca find on links columns, index 3
df['links'].head()

0                           []
1                           []
2                           []
3       [https://t.co/uGMxUP9]
4    [https://t.co/u7gLNrq31F]
Name: links, dtype: object

### Removing the links on text column

In [39]:
# Removing the links in all the tweets
df['text']=df['text'].apply(lambda x: re.sub('https:\/\/[\w.\/]+', '', x))

In [40]:
# we removed the link on the index 3,
df['text'][3]

'@ANI_news: Gurugram (Haryana): Post office employees provide cash exchange to patients in hospitals #demonetization \x85'

as you can see, on index 3 (row 4) there is no link, that mean we removed the links on the text columns, but we can find the link on link column

In [43]:
df[['text', 'links']]

Unnamed: 0,text,links
0,@rssurjewala: Critical question: Was PayTM inf...,[]
1,@Hemant_80: Did you vote on #Demonetization on...,[]
2,"@roshankar: Former FinSec, RBI Dy Governor, CB...",[]
3,@ANI_news: Gurugram (Haryana): Post office emp...,[https://t.co/uGMxUP9]
4,@satishacharya: Reddy Wedding! @mail_today car...,[https://t.co/u7gLNrq31F]
...,...,...
14935,@saxenavishakha: Ghost of demonetization retur...,[https://t.co/kQsBesTIUs]
14936,N d modi fans-d true nationalists of the count...,[https://t.co/9mgMEFu2sl]
14937,@bharat_builder: Lol. Demonetization has fixed...,[]
14938,@Stupidosaur: @Vidyut B team of BJP. CIA baby....,[]


# Assigment 2:  Extract Top 100 mentions

**Difficulty - Medium**

Many of the tweets have mentions of people in the form *@username*, for example see the following tweet -

*@Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r*

Here *@Joydas* is a mention. You need to extract mentions from all the tweets and find which are the top 100 usernames.

In [90]:
# Function for extracting mentions from the tweet
def mention(x):
    found=re.findall(r'@\w+',x)
    if found:
        return found
    return None



In [91]:

# Extract mentions from all the tweets
arr=df['text'].apply(lambda x : mention(x))



In [92]:

# Combining all the mentions into a list
mentions_arr=[]


In [93]:

for x in arr:
    if x != None:
        mentions_arr.extend(x)



In [94]:
arr



0                       [@rssurjewala]
1                         [@Hemant_80]
2                         [@roshankar]
3                          [@ANI_news]
4        [@satishacharya, @mail_today]
                     ...              
14935                [@saxenavishakha]
14936                             None
14937                [@bharat_builder]
14938          [@Stupidosaur, @Vidyut]
14939                        [@Vidyut]
Name: text, Length: 14940, dtype: object

In [95]:

mentions_arr[:10]



['@rssurjewala',
 '@Hemant_80',
 '@roshankar',
 '@ANI_news',
 '@satishacharya',
 '@mail_today',
 '@DerekScissors1',
 '@ambazaarmag',
 '@gauravcsawant',
 '@Joydeep_911']

In [96]:

# Getting top 100 mentions
mentions_count=pd.Series(mentions_arr).value_counts().head(100)




In [97]:
mentions_count

@evanspiegel        1311
@URautelaForever    1273
@narendramodi       1138
@gauravcsawant       541
@ModiBharosa         540
                    ... 
@rupasubramanya       30
@hi_paresh            30
@sanjayuv             30
@upma23               29
@MinhazMerchant       29
Name: count, Length: 100, dtype: int64

#  Assignment 3: Display how many users are mentioned on each row, and aslo display the top 100 row that have the most users mentioned on a single tweet

In [49]:
df['text'][5]

'@DerekScissors1: India\x92s #demonetization: #Blackmoney a symptom, not the disease  via @ambazaarmag'

In [51]:
# firstly we have to display the users that are mentioned on each tweet or row.
text = "@DerekScissors1: India\x92s #demonetization: #Blackmoney a symptom, not the disease  via @ambazaarmag"
# regex formula to get the mentioned users..
pattern = '@[\w]+'
# find the mentioned users.
clean_text = re.findall(pattern, text)

print("Text before:\n", text)
print("Text after:\n", clean_text)
print(len(clean_text))

Text before:
 @DerekScissors1: Indias #demonetization: #Blackmoney a symptom, not the disease  via @ambazaarmag
Text after:
 ['@DerekScissors1', '@ambazaarmag']
2


### Creating a new columns for mentioned users

In [52]:
# make a new column, and its value is only the mentioned users on each row.
df['mentioned_users']=df['text'].apply(lambda x: re.findall('@[\w]+', x))

In [55]:
# we already now on index 5 row(6) there is 2 users mentioned lets check...
df['mentioned_users'][5]

['@DerekScissors1', '@ambazaarmag']

In [56]:
# counting how many users mentioned are there on index 5
len(df['mentioned_users'][5])

2

### a new column for counting the users mentioned

In [65]:
# make a new column, and its value is only the mentioned users on each row.
# df['count_mentioned_users']=df['mentioned_users'].apply(lambda x: len(x) , x)
df['count_mentioned_users'] = df['mentioned_users'].apply(lambda x: len(x))

In [66]:
df['count_mentioned_users']

0        1
1        1
2        1
3        1
4        2
        ..
14935    1
14936    0
14937    1
14938    2
14939    1
Name: count_mentioned_users, Length: 14940, dtype: int64

In [74]:
df['mentioned_users'][14936]

[]

we checked 14936 row and we get 0 , so we did it, i mean we created a new column that displays only how many users are mentioned on each tweet

In [75]:
df.head()

Unnamed: 0.1,Unnamed: 0,X,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,hashtags,links,mentioned_users,count_mentioned_users
0,1,1,@rssurjewala: Critical question: Was PayTM inf...,False,0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False,[#Demonetization],[],[@rssurjewala],1
1,2,2,@Hemant_80: Did you vote on #Demonetization on...,False,0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66,True,False,[#Demonetization],[],[@Hemant_80],1
2,3,3,"@roshankar: Former FinSec, RBI Dy Governor, CB...",False,0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False,[#Demonetization],[],[@roshankar],1
3,4,4,@ANI_news: Gurugram (Haryana): Post office emp...,False,0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338,True,False,[#demonetization],[https://t.co/uGMxUP9],[@ANI_news],1
4,5,5,@satishacharya: Reddy Wedding! @mail_today car...,False,0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False,"[#demonetization, #ReddyWedding]",[https://t.co/u7gLNrq31F],"[@satishacharya, @mail_today]",2


##: displaying the top 100 users mentioned on the tweet

In [84]:
sorted_df = df.sort_values('count_mentioned_users', ascending=False)
top_100 = sorted_df.head(100)

In [86]:
top_100[['text','mentioned_users', 'count_mentioned_users']].head()

Unnamed: 0,text,mentioned_users,count_mentioned_users
13857,@dewindia11: @VPra52 @RaajnathSing @hsr315 @UJ...,"[@dewindia11, @VPra52, @RaajnathSing, @hsr315,...",13
14010,@dewindia11: @VPra52 @RaajnathSing @hsr315 @UJ...,"[@dewindia11, @VPra52, @RaajnathSing, @hsr315,...",13
14009,@dewindia11: @VPra52 @RaajnathSing @hsr315 @UJ...,"[@dewindia11, @VPra52, @RaajnathSing, @hsr315,...",13
14002,@dewindia11: @VPra52 @RaajnathSing @hsr315 @UJ...,"[@dewindia11, @VPra52, @RaajnathSing, @hsr315,...",13
13999,@dewindia11: @VPra52 @RaajnathSing @hsr315 @UJ...,"[@dewindia11, @VPra52, @RaajnathSing, @hsr315,...",13


In [87]:
top_100['count_mentioned_users'].value_counts().sum()

100

In [88]:
top_100.shape

(100, 20)

In [89]:
df['text'][13857]

'@dewindia11: @VPra52 @RaajnathSing @hsr315 @UJ1701 @AAPlogical @JTIn71 @Arv_Ga @Amit_Talwar2 @goeloverseas @rks17353 @1975Raman @khanfah\x85'

### Solution - 2