In [82]:
import numpy as np
import pandas as pd

# **Load & Lower**

In [83]:
df = pd.read_csv('/kaggle/input/tweets/sample.csv')
df['text_lower'] = df['text'].str.lower()
tweets = df[['text','text_lower']]
tweets.head()

Unnamed: 0,text,text_lower
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...


# **Remove Punctuation**

In [84]:
import string

# Function ----------------------
def remove_punctuation(text):
    translator = str.maketrans('','',string.punctuation)
    text_without_punctuation = text.translate(translator)
    return text_without_punctuation

#Execution ----------------------
for index, row in tweets.iterrows():
    tweets.loc[index, 'text_no_punctuation'] = remove_punctuation(row['text_lower'])
tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets.loc[index, 'text_no_punctuation'] = remove_punctuation(row['text_lower'])


Unnamed: 0,text,text_lower,text_no_punctuation
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...


# **Remove Stopwords**

In [85]:
from nltk.corpus import stopwords

# Function ----------------------
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords.words('english')])

#Execution ----------------------
tweets['text_no_stopwords'] = tweets.text_no_punctuation.apply(lambda x : remove_stopwords(x))
tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['text_no_stopwords'] = tweets.text_no_punctuation.apply(lambda x : remove_stopwords(x))


Unnamed: 0,text,text_lower,text_no_punctuation,text_no_stopwords
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu8...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...


# **Remove urls**

In [86]:
import re

# Function ----------------------
def remove_url(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'',text)

#Execution ----------------------
tweets['text_no_urls'] = tweets.text_no_stopwords.apply(lambda x : remove_url(x))
tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['text_no_urls'] = tweets.text_no_stopwords.apply(lambda x : remove_url(x))


Unnamed: 0,text,text_lower,text_no_punctuation,text_no_stopwords,text_no_urls
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...


# **Remove HTML**

In [87]:
# Function ----------------------
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

#Execution ----------------------
tweets['text_no_html'] = tweets.text_no_urls.apply(lambda x : remove_html(x))
tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['text_no_html'] = tweets.text_no_urls.apply(lambda x : remove_html(x))


Unnamed: 0,text,text_lower,text_no_punctuation,text_no_stopwords,text_no_urls,text_no_html
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...


# **Remove Emojis**

In [88]:
# Function ----------------------
def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        u"\U0001F700-\U0001F77F"  # Alchemical Symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Extended-A
        u"\U0001FA70-\U0001FAFF"  # Extended-B
        u"\U0001F004-\U0001F0CF"  # Miscellaneous Symbols and Pictographs
        u"\U0001F170-\U0001F251"  # Enclosed Ideographic Supplement
        u"\U00002702-\U000027B0"  # Dingbats
                           "]+", flags=re.UNICODE)
    
    text_without_emojis = emoji_pattern.sub(r'', text)
    
    return text_without_emojis

#Execution ----------------------
tweets['text_no_emojis'] = tweets.text_no_html.apply(lambda x : remove_emojis(x))
tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['text_no_emojis'] = tweets.text_no_html.apply(lambda x : remove_emojis(x))


Unnamed: 0,text,text_lower,text_no_punctuation,text_no_stopwords,text_no_urls,text_no_html,text_no_emojis
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...


# **Test**

In [89]:
import random
index = random.randint(0,10)
print(tweets.size)
print(index)

print("text------------------------------------------")
print(tweets['text'][index])
print("text_lower------------------------------------------")
print(tweets['text_lower'][index])
print("text_no_punctuation------------------------------------------")
print(tweets['text_no_punctuation'][index])
print("text_no_stopwords------------------------------------------")
print(tweets['text_no_stopwords'][index])
print("text_no_urls------------------------------------------")
print(tweets['text_no_urls'][index])
print("text_no_html------------------------------------------")
print(tweets['text_no_html'][index])
print("text_no_emojis------------------------------------------")
print(tweets['text_no_emojis'][index])

651
5
text------------------------------------------
@105836 Have you tried from another device, Miriam ^MM
text_lower------------------------------------------
@105836 have you tried from another device, miriam ^mm
text_no_punctuation------------------------------------------
105836 have you tried from another device miriam mm
text_no_stopwords------------------------------------------
105836 tried another device miriam mm
text_no_urls------------------------------------------
105836 tried another device miriam mm
text_no_html------------------------------------------
105836 tried another device miriam mm
text_no_emojis------------------------------------------
105836 tried another device miriam mm


# **Stemming**

In [90]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Fucntion ------------------
def stemming(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

# Execution ------------------
tweets['text_stemmed'] = tweets.text_no_emojis.apply(lambda x : stemming(x))
tweets.head()

Unnamed: 0,text,text_lower,text_no_punctuation,text_no_stopwords,text_no_urls,text_no_html,text_no_emojis,text_stemmed
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport caus repli disregard tap notif ke...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 busi mean lot us pleas dm name zip code...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 realli hope chang im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat onlin moment httpstcosy94vtu8k...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrain see attach error messag ive tri le...


# **Lemmatization**

In [91]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Fucntion ------------------
def lemmatizing(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Execution ------------------
tweets['text_lemmatized'] = tweets.text_no_emojis.apply(lambda x : lemmatizing(x))
tweets.head()

Unnamed: 0,text,text_lower,text_no_punctuation,text_no_stopwords,text_no_urls,text_no_html,text_no_emojis,text_stemmed,text_lemmatized
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport causing reply disregarded tapped ...,applesupport caus repli disregard tap notif ke...,applesupport cause reply disregard tap notific...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 business means lot us please dm name zi...,105835 busi mean lot us pleas dm name zip code...,105835 business mean lot we please dm name zip...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 realli hope chang im sure wont dont,76328 really hope change I m sure will not do not
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat online moment httpstcosy94vtu8...,105836 livechat onlin moment httpstcosy94vtu8k...,105836 livechat online moment httpstcosy94vtu8...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrain see attach error messag ive tri le...,virgintrain see attach error message I ve try ...


# **Test**

In [92]:
import random
index = random.randint(0, 10)
print(tweets.size)
print(index)

print("text_no_emojis------------------------------------------")
print(tweets['text_no_emojis'][index])
print("text_stemmed------------------------------------------")
print(tweets['text_stemmed'][index])
print("text_lemmatized------------------------------------------")
print(tweets['text_lemmatized'][index])

837
5
text_no_emojis------------------------------------------
105836 tried another device miriam mm
text_stemmed------------------------------------------
105836 tri anoth devic miriam mm
text_lemmatized------------------------------------------
105836 try another device miriam mm
