In [16]:
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None

full_df = pd.read_csv("./bugtest.csv", nrows=30)
full_df.head()
df = full_df[["software"]]
df["software"] = df["software"].astype(str)
df.head()


Unnamed: 0,software
0,Firefox
1,Firefox
2,Core
3,Toolkit
4,addons.mozilla.o


In [17]:
df["lowertext"] = df["software"].str.lower()
df["lowertext"].head()

0             firefox
1             firefox
2                core
3             toolkit
4    addons.mozilla.o
Name: lowertext, dtype: object

In [18]:
PUNCT = string.punctuation
def remove_punctuation(text):
 return text.translate(str.maketrans('', '', PUNCT))
df["nopuntext"] = df["software"].apply(lambda text: remove_punctuation(text))
df["nopuntext"].head()

0           Firefox
1           Firefox
2              Core
3           Toolkit
4    addonsmozillao
Name: nopuntext, dtype: object

In [19]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPW = set(stopwords.words('english'))
def remove_stopwords(text):
 return " ".join([word for word in str(text).split() if word not in STOPW])
df["stopwtext"] = df["nopuntext"].apply(lambda text: remove_stopwords(text))
df["stopwtext"].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


0           Firefox
1           Firefox
2              Core
3           Toolkit
4    addonsmozillao
Name: stopwtext, dtype: object

In [29]:
from collections import Counter
cnt = Counter()
for text in df["stopwtext"].values:
    for word in text.split():
        cnt[word] += 1
        cnt.most_common(10)
        FREQWS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWS])


In [33]:
df["stopfretext"] = df["nopuntext"].apply(lambda text: remove_freqwords(text))
df["stopfretext"].head()

0           Firefox
1           Firefox
2              Core
3           Toolkit
4    addonsmozillao
Name: stopfretext, dtype: object

In [34]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
 return " ".join([stemmer.stem(word) for word in text.split()])
df["stemtext"] = df["stopfretext"].apply(lambda text: stem_words(text))
df["stemtext"].head()

0           firefox
1           firefox
2              core
3           toolkit
4    addonsmozillao
Name: stemtext, dtype: object

In [41]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
 return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [42]:
df["text_lemmatized"] = df["stemtext"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,software,lowertext,nopuntext,stopwtext,remove_freqwords,stemtext,stopftext,stopfretext,text_lemmatized
0,Firefox,firefox,Firefox,,,firefox,Firefox,Firefox,firefox
1,Firefox,firefox,Firefox,,,firefox,Firefox,Firefox,firefox
2,Core,core,Core,,,core,Core,Core,core
3,Toolkit,toolkit,Toolkit,,,toolkit,Toolkit,Toolkit,toolkit
4,addons.mozilla.o,addons.mozilla.o,addonsmozillao,,,addonsmozillao,addonsmozillao,addonsmozillao,addonsmozillao


In [43]:
import re
def remove_urls(text):
 url_pattern = re.compile(r'https?://\S+|www\.\S+')
 return url_pattern.sub(r'', text)

In [44]:
df["remove_url"] = df["stemtext"].apply(lambda text: remove_urls(text))
df.head()

Unnamed: 0,software,lowertext,nopuntext,stopwtext,remove_freqwords,stemtext,stopftext,stopfretext,text_lemmatized,remove_url
0,Firefox,firefox,Firefox,,,firefox,Firefox,Firefox,firefox,firefox
1,Firefox,firefox,Firefox,,,firefox,Firefox,Firefox,firefox,firefox
2,Core,core,Core,,,core,Core,Core,core,core
3,Toolkit,toolkit,Toolkit,,,toolkit,Toolkit,Toolkit,toolkit,toolkit
4,addons.mozilla.o,addons.mozilla.o,addonsmozillao,,,addonsmozillao,addonsmozillao,addonsmozillao,addonsmozillao,addonsmozillao


In [48]:
def remove_html(text):
 html_pattern = re.compile('<.*?>')
 return html_pattern.sub(r'', text)

In [49]:
df["remove_html"] = df["stemtext"].apply(lambda text: remove_html(text))
df.head()

Unnamed: 0,software,lowertext,nopuntext,stopwtext,remove_freqwords,stemtext,stopftext,stopfretext,text_lemmatized,remove_url,remove_html
0,Firefox,firefox,Firefox,,,firefox,Firefox,Firefox,firefox,firefox,firefox
1,Firefox,firefox,Firefox,,,firefox,Firefox,Firefox,firefox,firefox,firefox
2,Core,core,Core,,,core,Core,Core,core,core,core
3,Toolkit,toolkit,Toolkit,,,toolkit,Toolkit,Toolkit,toolkit,toolkit,toolkit
4,addons.mozilla.o,addons.mozilla.o,addonsmozillao,,,addonsmozillao,addonsmozillao,addonsmozillao,addonsmozillao,addonsmozillao,addonsmozillao


In [50]:
from spellchecker import SpellChecker
spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
           corrected_text.append(spell.correction(word))
        else:
           corrected_text.append(word)
    return " ".join(corrected_text)

text = "speling hihi haha"
correct_spellings(text)

'spelling mihi haha'