# Pre-processing functions

In [1]:
import pandas as pd
import string
import re
# import NLP lib and its stopwords module
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
# Module-level NLP helpers shared by stemming() and lemmatizer() below.
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# The stopwords corpus is downloaded first because the next line reads it.
nltk.download('stopwords')
# English stopword list; remove_stopwords() filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')
# Uncomment to peek at a sample of the list: stopwords[10:20]
# Remaining NLTK resources. Presumably 'wordnet'/'omw-1.4' back the
# WordNetLemmatizer and 'punkt' backs word_tokenize -- confirm against the
# NLTK docs. The last call is a bare expression, so its return value is what
# the cell displays.
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Veera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def remove_hyperlink(text):
  """Strip http(s) URLs from ``text`` and collapse its whitespace.

  Every run of non-whitespace characters that starts with ``http://`` or
  ``https://`` is deleted. The remainder is split on whitespace and re-joined
  with single spaces, so newlines, repeated spaces and leading/trailing
  whitespace are all normalised away.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string (possibly empty).
  """
  # A single substitution pass removes every match; running the same pattern
  # a second time could never find anything new.
  without_urls = re.sub(r'https?://\S+', "", text)
  return " ".join(without_urls.split())

def remove_tags(text):
  """Replace every ``<...>`` span in ``text`` with a single space.

  The match is non-greedy, so each tag is replaced on its own. ``.`` does not
  match a newline here, so a ``<`` whose closing ``>`` is on a later line is
  left untouched.

  Args:
    text: The string to clean.

  Returns:
    ``text`` with each matched span replaced by one space.
  """
  tag_pattern = re.compile(r"<.*?>")
  return tag_pattern.sub(" ", text)

def tokenization(text):
  """Lower-case ``text``, blank out ASCII punctuation, and tokenize it.

  Each character in ``string.punctuation`` is replaced by a space, whitespace
  runs (including newlines) are collapsed to single spaces, and the result is
  handed to ``nltk.word_tokenize``.

  Args:
    text: The string to tokenize.

  Returns:
    Whatever ``nltk.word_tokenize`` returns for the normalised string.
  """
  punctuation_class = f"[{re.escape(string.punctuation)}]"
  spaced = re.sub(punctuation_class, " ", text.lower())
  # split() with no arguments discards leading/trailing whitespace as well as
  # repeated spaces and newlines, so no separate strip() is needed.
  normalized = " ".join(spaced.split())
  return nltk.word_tokenize(normalized)

def remove_stopwords(tokens):
  """Drop every token that appears in the module-level ``stopwords`` list.

  Args:
    tokens: Iterable of string tokens.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  # Build the lookup set once per call: testing membership against the list
  # directly is a linear scan for every token.
  stopword_set = set(stopwords)
  return [token for token in tokens if token not in stopword_set]
  
def stemming(tokens):
  """Apply the shared ``porter_stemmer`` to every token.

  Args:
    tokens: Iterable of string tokens.

  Returns:
    A list holding ``porter_stemmer.stem(token)`` for each token, in order.
  """
  return list(map(porter_stemmer.stem, tokens))

def lemmatizer(tokens):
  """Apply the shared ``wordnet_lemmatizer`` to every token.

  ``lemmatize`` is called with the token only, so no part-of-speech argument
  is supplied.

  Args:
    tokens: Iterable of string tokens.

  Returns:
    A list holding the lemmatized form of each token, in order.
  """
  lemmas = []
  for word in tokens:
    lemmas.append(wordnet_lemmatizer.lemmatize(word))
  return lemmas

def full_preprocess(text, verbose=False):
  """Run the complete cleaning pipeline on one markdown string.

  Stages, in order: remove hyperlinks, remove ``<...>`` tags, tokenize,
  remove stopwords, stem, lemmatize.

  Args:
    text: Raw markdown text. Anything that is not a ``str`` (for example the
      float NaN pandas yields for an empty CSV cell, or ``None``) is treated
      as missing.
    verbose: When true, echo the raw input before processing. Off by default
      so that mapping over a whole column does not flood the output.

  Returns:
    The list of processed tokens; an empty list when ``text`` is missing.
  """
  if verbose:
    print(text)
  # The regex-based stages raise TypeError on non-string input, so missing
  # values are short-circuited to "no tokens" instead of crashing the run.
  if not isinstance(text, str):
    return []
  without_links = remove_hyperlink(text)
  without_tags = remove_tags(without_links)
  tokens = tokenization(without_tags)
  tokens = remove_stopwords(tokens)
  tokens = stemming(tokens)
  # NOTE(review): lemmatization runs on already-stemmed tokens; confirm that
  # this ordering is intended.
  tokens = lemmatizer(tokens)
  return tokens


In [None]:
# Load the markdown index and run every cell's text through the pipeline.
df = pd.read_csv('markdown-index.csv')
raw_markdown = df['markdown_content']
# Some rows have no markdown and are read as NaN (a float); the string-based
# pipeline cannot handle that, so feed it an empty string for those rows.
# raw_markdown itself is left untouched so the missing rows stay inspectable.
preprocessed_markdown = pd.Series([full_preprocess(md) for md in raw_markdown.fillna('')])
preprocessed_markdown

In [17]:
# Display the rows whose markdown_content is missing.
df.loc[df['markdown_content'].isna()]

Unnamed: 0,markdown_content,index_in_notebook,author_name,notebook_title,markdown_key
2708,,33,ashukr,exploration-of-220-columns-1,ashukr_exploration-of-220-columns-1_m33
2987,,0,frednavruzov,auto-feature-generation-featuretools-example,frednavruzov_auto-feature-generation-featureto...
3007,,4,ishaan45,thank-you,ishaan45_thank-you_m4
3098,,0,osciiart,homecreditrisk-extensive-eda-baseline-model-jp,osciiart_homecreditrisk-extensive-eda-baseline...
3151,,0,scirpus,pure-gp-with-logloss,scirpus_pure-gp-with-logloss_m0
3152,,0,scirpus,pure-gp-with-mean-squared-error,scirpus_pure-gp-with-mean-squared-error_m0
3390,,0,sz8416,simple-bayesian-optimization-for-lightgbm,sz8416_simple-bayesian-optimization-for-lightg...


In [None]:
# Debug trace: print each markdown entry with a running number, then run the
# pipeline on it so a failing row can be identified from the last line printed.
# Numbering starts at 2 -- presumably to line up with the CSV's row numbers
# (header on row 1); confirm.
for i, md in enumerate(df['markdown_content'], start=2):
    print(f'[{i}] : {md}')
    full_preprocess(md)

