In [24]:
import numpy as np, pandas as pd
import regex as re

In [25]:
doc_0 = ''' Peter<b>, Piper picked a peck of pickled peppers;
A peck of pickled peppers Peter Piper picked;
If Peter Piper picked a peck of pickled peppers,
Where’s the peck of pickled peppers Peter Piper picked.'''

In [26]:
doc_1='''1234567How much wood would a woodchuck chuck
if a woodchuck could chuck wood?
He would chuck, he would, as much as he could,
and chuck as much wood as a woodchuck would
if a woodchuck could chuck wood'''

In [27]:
doc_2='''She sells seashells on the seashore.
The shells she sells are seashells, I’m sure.
And if she sells seashells on the seashore,
Then I’m sure she sells seashore shells.'''

In [28]:
doc_3='''Birdie birdie in the sky laid a turdie in my eye.
If cows could fly I’d have a cow pie in my eye.'''

In [29]:
doc_4='''Yellow butter, purple jelly, red jam, black bread.
Spread it thick, say it quick!
Yellow butter, purple jelly, red jam, black bread.
Spread it thicker, say it quicker!
Yellow butter, purple jelly, red jam, black bread.
Don’t eat with your mouth full!'''

In [30]:
df=pd.DataFrame({'docs':[doc_0, doc_1, doc_2, doc_3, doc_4]})

In [31]:
df.copy()

Unnamed: 0,docs
0,"Peter<b>, Piper picked a peck of pickled pepp..."
1,1234567How much wood would a woodchuck chuck\n...
2,She sells seashells on the seashore.\nThe shel...
3,Birdie birdie in the sky laid a turdie in my e...
4,"Yellow butter, purple jelly, red jam, black br..."


# Converting to uniform case

In [32]:
df['docs'].str.lower()

0     peter<b>, piper picked a peck of pickled pepp...
1    1234567how much wood would a woodchuck chuck\n...
2    she sells seashells on the seashore.\nthe shel...
3    birdie birdie in the sky laid a turdie in my e...
4    yellow butter, purple jelly, red jam, black br...
Name: docs, dtype: object

- or 

In [33]:
def lowerCaseConversion(x):
    return x.str.lower()

In [34]:
df['docs']=lowerCaseConversion(df['docs'])

In [35]:
df

Unnamed: 0,docs
0,"peter<b>, piper picked a peck of pickled pepp..."
1,1234567how much wood would a woodchuck chuck\n...
2,she sells seashells on the seashore.\nthe shel...
3,birdie birdie in the sky laid a turdie in my e...
4,"yellow butter, purple jelly, red jam, black br..."


### handling html tags

In [36]:
x ='<b>Hello</b>'
re.sub(r'<.*?>','',x)

'Hello'

In [37]:
def removeTags(x):
    return re.sub(r'<.*?>','',x)

In [38]:
df['docs']= df['docs'].apply(removeTags)

In [39]:
x='crossing carefully. https://www'

In [40]:
df['docs'][2][:-5:-1]

'.sll'

In [41]:
df['docs']

0     peter, piper picked a peck of pickled peppers...
1    1234567how much wood would a woodchuck chuck\n...
2    she sells seashells on the seashore.\nthe shel...
3    birdie birdie in the sky laid a turdie in my e...
4    yellow butter, purple jelly, red jam, black br...
Name: docs, dtype: object

In [42]:
df['docs'][0]

' peter, piper picked a peck of pickled peppers;\na peck of pickled peppers peter piper picked;\nif peter piper picked a peck of pickled peppers,\nwhere’s the peck of pickled peppers peter piper picked.'

In [43]:
def handlingNewlineChar(x):
    return re.sub(r'\n', ' ', x)

In [44]:
df['docs']=df['docs'].apply(handlingNewlineChar)
df['docs'][0]

' peter, piper picked a peck of pickled peppers; a peck of pickled peppers peter piper picked; if peter piper picked a peck of pickled peppers, where’s the peck of pickled peppers peter piper picked.'

## Handling Spechars and Numbers

In [45]:
x = '12@Heloo_**9.9Hi   '
re.sub(r'^[a-zA-Z]', ' ', x)

'12@Heloo_**9.9Hi   '

In [46]:
x = '12@Heloo_**9Hi   '
re.sub(r'^[a-zA-Z0-9]', ' ', x)

' 2@Heloo_**9Hi   '

In [47]:
9e14

900000000000000.0

In [48]:
x = '12@Heloo_**9Hi   '
re.sub(r'^[a-zA-Z0 - 9]', ' ', x)

'12@Heloo_**9Hi   '

In [49]:
x = '12@Heloo_**9Hi   '
re.sub(r'^[a-zA-Z0-9]', ' ', x)

' 2@Heloo_**9Hi   '

### I want numbers negative numbers float values while removing special characters

In [50]:
x = '12@Heloo_**9Hi   '
re.sub(r'^[a-zA-Z0-9]', ' ', x)

def removingSpeCharsAndNumbers(x):
    return re.sub(r'[^a-zA-A-Z]', ' ', x)
    

In [51]:
df['docs']=df['docs'].apply(removingSpeCharsAndNumbers)

In [52]:
df['docs'][0]

' peter  piper picked a peck of pickled peppers  a peck of pickled peppers peter piper picked  if peter piper picked a peck of pickled peppers  where s the peck of pickled peppers peter piper picked '

In [53]:
9e-14

9e-14

In [54]:
'hi hello'

'hi hello'

# Removing Stopwords

In [55]:
#!pip install nltk

In [56]:
from nltk.corpus import stopwords
nltk.download('stopwords')

NameError: name 'nltk' is not defined

In [None]:
print(dir(stopwords))

In [None]:
stopwords.words('English')

In [None]:
stopwords.words('French')

In [None]:
stopwords.words('Spanish')

In [None]:
print(len(stopwords.words('english')), stopwords.words('english'))

In [None]:
help(stopwords.words)

In [None]:
x='Myself Arnold, You might have heard my name'

In [None]:
x

In [None]:
x=x.split()

In [None]:
x

In [None]:
print(stopwords.words('english'))

In [None]:
z= []
for ele in x:
    if ele in stopwords.words('english'):
        z.append(ele)

In [None]:
' '.join(z)

In [None]:
def removeStopWords(x):
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords

In [None]:
z=[]

for ele in x.split():
    if ele not in stopwords.words('english'):
        z.append(ele)
return ' '.join(z)

In [None]:
def stopword(x):
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    z=[]
    for ele in x.split():
        if ele not in stopwords.words('english'):
            z.append(ele)
    return ' '.join(z)

In [None]:
df['docs']=df['docs'].apply(stopword)

In [None]:
df['docs']

#### Stemming is the process of getting different morphological variations given a root word. The root word is also called the stem and hence the name stemming. For example, for the word ‘like’, we can have different forms such as ‘likes’, ‘likely’, ‘liking’, etc. And we can also have the words such as ‘lik’ which is not a technical English word. This feature helps us search for words in search engines and other applications easier. The programs are written for the process of stemming using the algorithms called stemming algorithms or stemmers. Most of these are based on rules applying to suffix-stripping. One of them which is the most common is the Porter-Stemmer12. To implement stemming using Python, we use the nltk module1. We can import this module by writing the below statement:



In [57]:
import nltk


In [58]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer() #creating an instance of the class
# creating a list of some words to be stemmed
words = ['run','ran','running']
for x in words:
    print(x, " : ", ps.stem(x))


run  :  run
ran  :  ran
running  :  run


In [59]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

In [60]:
print(dir(stemmer))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', 'languages', 'stem', 'stemmer', 'stopwords']


In [61]:
stemmer.stem('unplayable')

'unplay'

### Lemmantiziation

In [62]:
from nltk.stem import WordNetLemmatizer

In [63]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [64]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [65]:
lem=WordNetLemmatizer()

In [66]:
print(dir(lem))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'lemmatize']


In [67]:
lem.lemmatize('unplayable')

'unplayable'

# todays task write a regex function which handles numbers convert everything into function which we have done today

In [68]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

def stem_and_lemmatize(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    
    # Stem the words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    
    return lemmatized_words


In [69]:
import nltk
from nltk.stem import PorterStemmer

def stem_words(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Stem the words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
    
    return stemmed_words


In [70]:
def stem(x):
    z=[]
    for ele in x.split():
        z.append(lem.lemmatize(ele))
    return ' '.join(z)

In [71]:
def lemmatize(x):
    z=[]
    for ele in x.split():
        z.append(lem.lemmatize(ele))
    return ' '.join(z)

In [72]:
def lemmatizer(x):
    z=[]
    for ele in x.split():
        z.append(lem.lemmatize(ele))
    return ' '.join(z)
    

In [73]:
df['lemDocs']=df['docs'].apply(lemmatize)
df

Unnamed: 0,docs,lemDocs
0,peter piper picked a peck of pickled peppers...,peter piper picked a peck of pickled pepper a ...
1,how much wood would a woodchuck chuck i...,how much wood would a woodchuck chuck if a woo...
2,she sells seashells on the seashore the shell...,she sell seashell on the seashore the shell sh...
3,birdie birdie in the sky laid a turdie in my e...,birdie birdie in the sky laid a turdie in my e...
4,yellow butter purple jelly red jam black br...,yellow butter purple jelly red jam black bread...


In [76]:
import re

def is_valid_number(input_str):
    # Integer pattern: ^-?\d+$
    integer_pattern = r'^-?\d+$'
    
    # Floating point pattern: ^-?\d+(\.\d+)?$
    float_pattern = r'^-?\d+(\.\d+)?$'
    
    if re.match(integer_pattern, input_str) or re.match(float_pattern, input_str):
        return True
    else:
        return False

# Test cases
print(is_valid_number("123"))      # True
print(is_valid_number("-45"))      # True
print(is_valid_number("3.14"))     # True
print(is_valid_number("-0.5"))     # True
print(is_valid_number("abc"))      # False
print(is_valid_number("12.34.56")) # False


True
True
True
True
False
False


### Regex Function that handles Numbers

In [78]:
import re

def contains_numbers(input_str):
    # Define a regex pattern to match any digit
    pattern = r'\d'
    
    # Use re.search to find the first occurrence of a digit in the string
    match = re.search(pattern, input_str)
    
    # If a match is found, return True; otherwise, return False
    return bool(match)

# Test cases
print(contains_numbers("Hello, world!"))  # False
print(contains_numbers("The price is $42.50"))  # True
print(contains_numbers("NoNumbersHere"))  # False
print(contains_numbers("12345"))  # True


False
True
False
True


In [3]:
import re

def contains_numbers(input_str):
    # Define a regex pattern to match any digit
    pattern = r'\d'
    
    # Use re.search to find the first occurrence of a digit in the string
    match = re.search(pattern, input_str)
    
    # If a match is found, return True; otherwise, return False
    return bool(match)

# Test the function
def test_contains_numbers():
    assert contains_numbers("Hello, world!") == False
    assert contains_numbers("The price is $42.50") == True
    assert contains_numbers("NoNumbersHere") == False
    assert contains_numbers("12345") == True

# Run the test cases
test_contains_numbers()
