In [1]:
import re
import string
import nltk
import numpy as np
import pandas as pd
# nltk.download('stopwords')
import plotly.graph_objects as go
from mysutils.text import remove_urls
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
# from nltk.


In [2]:
df=pd.read_csv('Elon_musk.csv',encoding='cp1252',usecols=['Text'])
df.head()

Unnamed: 0,Text
0,@kunalb11 I’m an alien
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,@joerogan @Spotify Great interview!
3,@gtera27 Doge is underestimated
4,@teslacn Congratulations Tesla China for amazi...


In [3]:
df.iloc[1,]

Text    @ID_AA_Carmack Ray tracing on Cyberpunk with H...
Name: 1, dtype: object

In [4]:
    # creating new_df with all lower case
    new_df=pd.DataFrame(df['Text'].apply(lambda x : x.strip().lower()))

In [5]:
new_df.head()

Unnamed: 0,Text
0,@kunalb11 i’m an alien
1,@id_aa_carmack ray tracing on cyberpunk with h...
2,@joerogan @spotify great interview!
3,@gtera27 doge is underestimated
4,@teslacn congratulations tesla china for amazi...


In [6]:
new_df.shape

(1999, 1)

In [7]:
# Lets see all unique first letters
print(set([x[0] for x in new_df['Text']]))

{'v', 'l', 'b', 'a', 'r', '<', 'x', '1', 'h', '2', 'e', 'j', '9', 'w', 'i', 'd', 'o', 'u', '5', 'n', 'ð', 's', '“', 'c', '4', 'p', '.', 't', 'f', 'g', 'y', 'q', '@', 'm', '0'}


In [8]:
# Lets see how many of them starts with @
len(new_df[new_df['Text'].str.contains('@')])

1678

In [9]:
fig=go.Figure(go.Pie(labels=['contains @','Others'],values=[1678,1999-1678],pull=[0,0.1]))
fig.update_layout(autosize=False,title='Tweets with user tags')
fig.show()

In [10]:
# How many of them contains links 
count=len(new_df[new_df['Text'].str.contains('https://')])
fig=go.Figure(go.Pie(labels=['Contains URLs','Others'],values=[count,1999-count],pull=[0,0.1]))
fig.update_layout(autosize=False,title='Tweets with URLs')
fig.show()

In [11]:
# We dont need @user_names so removing them 
new_df['removed user']=new_df['Text'].apply(lambda x: ' '.join(TweetTokenizer(strip_handles=True).tokenize(x)))
new_df.head()

Unnamed: 0,Text,removed user
0,@kunalb11 i’m an alien,i ’ m an alien
1,@id_aa_carmack ray tracing on cyberpunk with h...,ray tracing on cyberpunk with hdr is next-leve...
2,@joerogan @spotify great interview!,great interview !
3,@gtera27 doge is underestimated,doge is underestimated
4,@teslacn congratulations tesla china for amazi...,congratulations tesla china for amazing execut...


In [12]:
def row_text(text):
    # Script to remove emojis from text
    regrex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    text= regrex_pattern.sub(r'',text)

    # converting all text to lowercase letters
    text=text.lower()

    # removing urls
    text=remove_urls(text)

    # remove words inside < >
    text = re.sub('<.*?>+', '', text)
    
    # removing punctuation
    text=re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # removing '\n' which is a new line
    text = re.sub('\n', '', text)

    text = re.sub('\w*\d\w*', '', text)
    
    return text


    


In [13]:
# Applying the text column
new_df['cleaned tweets']=new_df['removed user'].apply(lambda x: row_text(x))

In [14]:
new_df

Unnamed: 0,Text,removed user,cleaned tweets
0,@kunalb11 i’m an alien,i ’ m an alien,i ’ m an alien
1,@id_aa_carmack ray tracing on cyberpunk with h...,ray tracing on cyberpunk with hdr is next-leve...,ray tracing on cyberpunk with hdr is nextlevel...
2,@joerogan @spotify great interview!,great interview !,great interview
3,@gtera27 doge is underestimated,doge is underestimated,doge is underestimated
4,@teslacn congratulations tesla china for amazi...,congratulations tesla china for amazing execut...,congratulations tesla china for amazing execut...
...,...,...,...
1994,"@flcnhvy true, it sounds so surreal, but the n...","true , it sounds so surreal , but the negative...",true it sounds so surreal but the negative p...
1995,@ppathole make sure to read ur terms &amp; con...,make sure to read ur terms & conditions before...,make sure to read ur terms conditions before ...
1996,@teslagong @ppathole samwise gamgee,samwise gamgee,samwise gamgee
1997,@ppathole altho dumb and dumber is <u+0001f525...,altho dumb and dumber is <u+0001f525> <u+0001f...,altho dumb and dumber is


In [15]:
# Splitting them to remove stopp word
new_df['removed stop words']=new_df['cleaned tweets'].apply(lambda y: [y for y in y.split() if y not in set(stopwords.words('english'))])

In [16]:
new_df

Unnamed: 0,Text,removed user,cleaned tweets,removed stop words
0,@kunalb11 i’m an alien,i ’ m an alien,i ’ m an alien,"[’, alien]"
1,@id_aa_carmack ray tracing on cyberpunk with h...,ray tracing on cyberpunk with hdr is next-leve...,ray tracing on cyberpunk with hdr is nextlevel...,"[ray, tracing, cyberpunk, hdr, nextlevel, tried]"
2,@joerogan @spotify great interview!,great interview !,great interview,"[great, interview]"
3,@gtera27 doge is underestimated,doge is underestimated,doge is underestimated,"[doge, underestimated]"
4,@teslacn congratulations tesla china for amazi...,congratulations tesla china for amazing execut...,congratulations tesla china for amazing execut...,"[congratulations, tesla, china, amazing, execu..."
...,...,...,...,...
1994,"@flcnhvy true, it sounds so surreal, but the n...","true , it sounds so surreal , but the negative...",true it sounds so surreal but the negative p...,"[true, sounds, surreal, negative, propaganda, ..."
1995,@ppathole make sure to read ur terms &amp; con...,make sure to read ur terms & conditions before...,make sure to read ur terms conditions before ...,"[make, sure, read, ur, terms, conditions, clic..."
1996,@teslagong @ppathole samwise gamgee,samwise gamgee,samwise gamgee,"[samwise, gamgee]"
1997,@ppathole altho dumb and dumber is <u+0001f525...,altho dumb and dumber is <u+0001f525> <u+0001f...,altho dumb and dumber is,"[altho, dumb, dumber]"


In [17]:
new_df.loc[275]

Text                  @valaafshar much more than this &amp; increasi...
removed user                   much more than this & increasing rapidly
cleaned tweets                  much more than this  increasing rapidly
removed stop words                          [much, increasing, rapidly]
Name: 275, dtype: object

In [26]:
words=set()
for i in new_df['removed stop words']:
    words=words.union(set(i))

In [29]:
words=set()
for i in new_df['removed stop words']:
    words=words.union(set(i))
words=sorted(list(words))
words

['aber',
 'able',
 'abo',
 'aboard',
 'abort',
 'absence',
 'absolute',
 'absolutely',
 'absorb',
 'absorption',
 'absurd',
 'absurdly',
 'ac',
 'academia',
 'accel',
 'accelera',
 'accelerate',
 'accelerated',
 'accelerating',
 'acceleration',
 'accelerator',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'accessible',
 'accidental',
 'accidents',
 'accommodating',
 'account',
 'accura',
 'accuracy',
 'accurate',
 'ace',
 'achieve',
 'achieved',
 'achievement',
 'achieving',
 'achy',
 'acquisition',
 'across',
 'action',
 'active',
 'activity',
 'actual',
 'actuall',
 'actually',
 'actuaries',
 'adagio',
 'add',
 'added',
 'adding',
 'additive',
 'addressed',
 'administered',
 'adults',
 'advanc',
 'advance',
 'advanced',
 'advantage',
 'adventure',
 'advertising',
 'advice',
 'advised',
 'aero',
 'afb',
 'affair',
 'affects',
 'affordable',
 'africa',
 'afternoon',
 'age',
 'ages',
 'ago',
 'agony',
 'agree',
 'agreed',
 'ah',
 'ahead',
 'ahem',
 'ai',
 'aim',
 'aiming',
 'air',
