### import packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [37]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.tokenize import ToktokTokenizer
import string as str

In [47]:
import spacy
import unidecode
from word2number import w2n
import contractions
from gensim.models import Word2Vec 

### data

In [3]:
text = "<body> The parts of a Written or spoken Statement that precede or Follow a specific Word or passage, ^_^ usually <br/> influencing it's meaning <br/> or effect #meaning: You have misinterpreted #passage my Remark because you took it out of context. minarahimi@yahoo.com The set of circumstances or facts that surround a particular event,< situation, etc. ali-hamidi@gmail.com </body>. like three butterflies in the garden"
text

"<body> The parts of a Written or spoken Statement that precede or Follow a specific Word or passage, ^_^ usually <br/> influencing it's meaning <br/> or effect #meaning: You have misinterpreted #passage my Remark because you took it out of context. minarahimi@yahoo.com The set of circumstances or facts that surround a particular event,< situation, etc. ali-hamidi@gmail.com </body>. like three butterflies in the garden"

### remove html tags

In [4]:
def del_html_tag(df):
    """Return the text content of an HTML string with the markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    text nodes are concatenated using an empty separator, so nothing is
    inserted where a tag used to be.
    """
    return BeautifulSoup(df, "html.parser").get_text(separator="")

In [5]:
# Stage 1 of the cleaning chain: strip the HTML markup from the raw sample `text`.
first_clean_data = del_html_tag(df = text)

In [6]:
first_clean_data

" The parts of a Written or spoken Statement that precede or Follow a specific Word or passage, ^_^ usually  influencing it's meaning  or effect #meaning: You have misinterpreted #passage my Remark because you took it out of context. minarahimi@yahoo.com The set of circumstances or facts that surround a particular event,< situation, etc. ali-hamidi@gmail.com . like three butterflies in the garden"

### remove emails

In [7]:
def remove_email(df):
    """Remove e-mail addresses from a string.

    Parameters
    ----------
    df : the text to clean (a single string).

    Returns
    -------
    The text with every matched address replaced by the empty string.
    Surrounding whitespace and punctuation are left untouched.
    """
    # Local part: word characters plus ".", "+" and "-" (e.g. first.last+tag).
    # Domain: labels made of word characters or "-", joined by dots. A dot is
    # only consumed when another label follows it, so the period that ends a
    # sentence right after an address is preserved.
    match = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)*'
    return re.sub(match, '', df)

In [8]:
# Stage 2: drop e-mail addresses from the HTML-free text.
second_clean_data = remove_email(df = first_clean_data)

In [9]:
second_clean_data

" The parts of a Written or spoken Statement that precede or Follow a specific Word or passage, ^_^ usually  influencing it's meaning  or effect #meaning: You have misinterpreted #passage my Remark because you took it out of context.  The set of circumstances or facts that surround a particular event,< situation, etc.  . like three butterflies in the garden"

### find and remove hashtags

In [10]:
def hashtag_list(df):
    """Return the hashtag words found in ``df``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word characters;
    the returned strings do not include the leading ``#``.
    """
    return [found.group(1) for found in re.finditer(r'#(\w+)', df)]

In [11]:
# Collect the hashtag words from the e-mail-free text before they are removed.
hashtag = hashtag_list(df = second_clean_data)
hashtag

['meaning', 'passage']

In [12]:
def remove_hashtag(df):
    """Delete every hashtag (``#`` plus the word characters after it) from ``df``.

    A ``#`` that is not followed by a word character is left in place.
    """
    tag_pattern = re.compile(r'#(\w+)')
    return tag_pattern.sub('', df)

In [13]:
# Stage 3: remove the hashtags from the e-mail-free text.
third_clean_data = remove_hashtag(df = second_clean_data)
third_clean_data

" The parts of a Written or spoken Statement that precede or Follow a specific Word or passage, ^_^ usually  influencing it's meaning  or effect : You have misinterpreted  my Remark because you took it out of context.  The set of circumstances or facts that surround a particular event,< situation, etc.  . like three butterflies in the garden"

### remove special characters

In [14]:
def remove_special_characters(df):
    """Strip every character that is not on the allow-list from ``df``.

    Kept: ASCII letters, ASCII digits, whitespace and the punctuation marks
    ``. , ! ? / : ; " '``. Everything else (for example ``^``, ``_``, ``<``,
    ``-``) is deleted without a replacement.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9.,!?/:;\"\'\s]')
    return disallowed.sub('', df)

In [15]:
# Stage 4: delete characters outside the allow-list from the hashtag-free text.
fourth_clean_data = remove_special_characters(df = third_clean_data)
fourth_clean_data

" The parts of a Written or spoken Statement that precede or Follow a specific Word or passage,  usually  influencing it's meaning  or effect : You have misinterpreted  my Remark because you took it out of context.  The set of circumstances or facts that surround a particular event, situation, etc.  . like three butterflies in the garden"

### remove punctuation

In [21]:
def remove_punctuations(df):
    """Return ``df`` with every ASCII punctuation character removed.

    Parameters
    ----------
    df : the text to clean (a single string).

    Returns
    -------
    The text without any character listed in ``string.punctuation``.
    """
    # Imported locally under its real name: the notebook's import cell binds
    # the name `str` to the `string` module, which hides the builtin. This
    # function must not depend on that alias, nor call anything via `str.`.
    import string

    # Mapping a code point to None makes translate() delete that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return df.translate(drop_table)

In [22]:
# Stage 5: strip the remaining punctuation marks.
fifth_clean_data = remove_punctuations(df = fourth_clean_data)
fifth_clean_data

' The parts of a Written or spoken Statement that precede or Follow a specific Word or passage  usually  influencing its meaning  or effect  You have misinterpreted  my Remark because you took it out of context  The set of circumstances or facts that surround a particular event situation etc   like three butterflies in the garden'

### stemming

In [23]:
def get_stem(df):
    """Stem every whitespace-separated word of ``df`` with the Porter stemmer.

    Parameters
    ----------
    df : the text to stem (a single string).

    Returns
    -------
    The stemmed words joined by single spaces. Splitting on whitespace also
    collapses runs of spaces and drops leading/trailing whitespace.
    """
    # Use the PorterStemmer class the notebook already imports instead of
    # reaching it through the `nltk.porter` attribute.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in df.split())

In [24]:
# Stage 6: reduce every word of the punctuation-free text to its stem.
sixth_clean_data = get_stem(df = fifth_clean_data)
sixth_clean_data

'the part of a written or spoken statement that preced or follow a specif word or passag usual influenc it mean or effect you have misinterpret my remark becaus you took it out of context the set of circumst or fact that surround a particular event situat etc like three butterfli in the garden'

### stop words

In [38]:
# Tokenizer instance shared by remove_stopwords().
tokenizer = ToktokTokenizer()
# English stop words from the NLTK corpus; rebuilt on every run of this cell,
# so the in-place removal below is safe to re-execute.
stopword_list = nltk.corpus.stopwords.words('english')
# Take 'not' out of the list so remove_stopwords() keeps it in the text
# (presumably to preserve negation — confirm).
stopword_list.remove('not')

In [42]:
def remove_stopwords(df):
    """Remove stop words from ``df``.

    The text is split with the notebook-level ``tokenizer``; each token is
    stripped of surrounding whitespace and dropped when its lower-cased form
    appears in the notebook-level ``stopword_list``.

    Parameters
    ----------
    df : the text to filter (a single string).

    Returns
    -------
    The surviving tokens joined by single spaces.
    """
    # Build a set once per call: membership tests on the list are a linear
    # scan, repeated for every token of the input.
    stop_lookup = set(stopword_list)
    kept = [
        token
        for token in (raw.strip() for raw in tokenizer.tokenize(df))
        if token.lower() not in stop_lookup
    ]
    return ' '.join(kept)

In [43]:
# Stage 7: drop stop words from the stemmed text.
# NOTE(review): stemming runs before this step, so a stop word whose stem
# differs from its dictionary form is not filtered (see 'becaus' in the output
# below) — consider removing stop words before stemming.
seventh_clean_data = remove_stopwords(df = sixth_clean_data)
seventh_clean_data

'part written spoken statement preced follow specif word passag usual influenc mean effect misinterpret remark becaus took context set circumst fact surround particular event situat etc like three butterfli garden'

### tokenizer

In [44]:
def tokenize(df):
    """Split ``df`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(df)

In [45]:
# Stage 8: split the stop-word-free text into a list of word tokens.
eightth_clean_data = tokenize(seventh_clean_data)
eightth_clean_data

['part',
 'written',
 'spoken',
 'statement',
 'preced',
 'follow',
 'specif',
 'word',
 'passag',
 'usual',
 'influenc',
 'mean',
 'effect',
 'misinterpret',
 'remark',
 'becaus',
 'took',
 'context',
 'set',
 'circumst',
 'fact',
 'surround',
 'particular',
 'event',
 'situat',
 'etc',
 'like',
 'three',
 'butterfli',
 'garden']

## comment

### lower case

In [60]:
def lower_case(df):
    """Lower-case and whitespace-split every string in ``df``.

    ``df`` is iterated item by item; each item is split on whitespace and its
    pieces are lower-cased. The result is therefore a list of lists, one inner
    list per item. Passing a list of single words yields one-element inner
    lists, and passing a bare string iterates it character by character.
    """
    lowered = []
    for entry in df:
        lowered.append([piece.lower() for piece in entry.split()])
    return lowered

In [62]:
# Stage 9: lower-case the tokens.
# NOTE(review): lower_case() splits every item of its input, so feeding it the
# flat token list wraps each token in its own one-element list (see the output
# below) — confirm that this nested shape is what is wanted.
nineth_clean_data = lower_case(df = eightth_clean_data)
nineth_clean_data

[['part'],
 ['written'],
 ['spoken'],
 ['statement'],
 ['preced'],
 ['follow'],
 ['specif'],
 ['word'],
 ['passag'],
 ['usual'],
 ['influenc'],
 ['mean'],
 ['effect'],
 ['misinterpret'],
 ['remark'],
 ['becaus'],
 ['took'],
 ['context'],
 ['set'],
 ['circumst'],
 ['fact'],
 ['surround'],
 ['particular'],
 ['event'],
 ['situat'],
 ['etc'],
 ['like'],
 ['three'],
 ['butterfli'],
 ['garden']]

### expand contractions

In [43]:
# expand contractions, e.g. "it's" -> "it is" (not the possessive "its")
def expand(df):
    """Expand English contractions in ``df`` (e.g. "you're" -> "you are").

    Thin wrapper around ``contractions.fix``; returns the expanded string.
    """
    return contractions.fix(df)

In [44]:
# NOTE(review): this runs on the raw sample `text`, not on a stage of the
# cleaning chain, and the result is only displayed, not stored.
expand(df = text)

### word to number

In [47]:
# Word to number: "three" -> 3.
# `text` is a plain string, so it has to be parsed by a spaCy pipeline first;
# iterating the raw string yields single characters, which have no `.pos_`.
# Requires the small English model: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")


def _token_to_number(token):
    """Return the numeric value of a number token, or its text if unparsable."""
    try:
        return w2n.word_to_num(token.text)
    except ValueError:
        # word2number raises ValueError for strings it cannot interpret;
        # keep the original text instead of aborting the whole cell.
        return token.text


# Every element is either an int/float (converted number) or a plain string,
# never a spaCy Token object.
tokens = [
    _token_to_number(token) if token.pos_ == 'NUM' else token.text
    for token in nlp(text)
]

print(tokens)

In [50]:
def stopword(df, words=None):
    """Clear the stop-word flag of vocabulary entries.

    Parameters
    ----------
    df : object exposing a ``vocab`` mapping whose entries have an ``is_stop``
        attribute — presumably a loaded spaCy pipeline; confirm with the caller.
    words : iterable of str, optional
        Keys whose ``is_stop`` flag is set to ``False``. When omitted, the
        function falls back to iterating the notebook-level ``text`` variable,
        which is what the original implementation always did.

    Returns
    -------
    None. The entries of ``df.vocab`` are modified in place.
    """
    if words is None:
        # Backward-compatible default. NOTE(review): `text` is a plain string
        # in this notebook, so this path walks it character by character —
        # pass an explicit list of words instead.
        words = text
    for word in words:
        df.vocab[word].is_stop = False

In [90]:
# NOTE(review): `text` is a plain string and has no `.vocab` attribute, so this
# call fails as written; it presumably needs a loaded spaCy pipeline — confirm.
stopword(df = text)

### delete numbers

In [34]:
def remove_numbers(df):
    """Delete every ASCII digit (``0``-``9``) from ``df`` and return the rest."""
    digit = re.compile(r'[0-9]')
    return digit.sub('', df)

In [36]:
# NOTE(review): this runs on the raw sample `text`, not on a stage of the
# cleaning chain, and the result is only displayed, not stored.
remove_numbers(df = text)