## 1. Libraries and Dataset

In [None]:
! pip install bs4 
# http://omz-software.com/pythonista/docs/ios/beautifulsoup_guide.html

! pip install contractions
# https://pypi.org/project/contractions/

! pip install autocorrect 
# https://pypi.org/project/autocorrect/

In [None]:
# Generic libraries
import time as time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc 
# Garbage Collector interface. https://docs.python.org/3/library/gc.html

In [None]:
# Text libraries
import re
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tag.util import untag

import contractions 
# Expands contractions such as `you're` to `you are`. https://pypi.org/project/contractions/

from autocorrect import Speller 

## 2. Read dataset

In [None]:
# Data types. https://numpy.org/devdocs/user/basics.types.html

# Explicit, compact dtypes for the columns loaded from Questions.csv.
dtypes_questions = {
    'Id': 'int32',
    'Score': 'int16',
    'Title': 'str',
    'Body': 'str',
}

In [None]:
# Load the questions file, keeping only the four columns used below.
# nrows=100 restricts the load to the first 100 rows.
df_questions = pd.read_csv(
    '../datasets/Questions.csv',
    encoding="ISO-8859-1",
    usecols=['Id', 'Score', 'Title', 'Body'],
    dtype=dtypes_questions,
    nrows=100,
)

In [None]:
# Re-encode every value of the two text columns: stringify, encode as UTF-8,
# then decode the resulting bytes as ISO-8859-1.
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated in recent pandas releases; the per-element result is the same.
# NOTE(review): the file was already read as ISO-8859-1; confirm this
# encode/decode direction is intended. Recovering UTF-8 text that was mis-read
# as ISO-8859-1 would normally be .encode("ISO-8859-1").decode("utf-8").
# NOTE(review): str(x) also turns missing values into the literal string 'nan'.
for _col in ('Title', 'Body'):
    df_questions[_col] = df_questions[_col].map(
        lambda x: str(x).encode("utf-8", errors='surrogatepass').decode("ISO-8859-1", errors='surrogatepass')
    )

In [None]:
# Keep only the questions whose score is zero or positive.
# .copy() makes the result an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
# The original row labels are kept on purpose (no reset_index), so label-based
# look-ups such as df_questions['Body'][11] still refer to the same question.
df_questions = df_questions[df_questions["Score"] >= 0].copy()

In [None]:
# Shared objects used by the cleaning functions defined further down.
token = ToktokTokenizer()          # word tokenizer
spell = Speller()                  # spelling corrector from autocorrect
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Punctuation characters followed by the ten digits.
charac = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0123456789'

# English stop words from nltk, stored as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# POS tags to drop: JJ/JJR/JJS are the adjective tags; RBR/RBS are the
# comparative/superlative adverb tags (Penn Treebank tag names).
adjective_tag_list = {'JJ', 'JJR', 'JJS', 'RBR', 'RBS'}

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df_questions.info()

## 3. Noise removal
(removing anything that can interfere with the text analysis)

### 3.1 Removing html

In [None]:
%%time

# Parse question and title then return only the text
df_questions['Body'] = df_questions['Body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
df_questions['Title'] = df_questions['Title'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

In [None]:
def clean_text(text):
    """Collapse every run of whitespace in `text` into one space and trim the ends.

    Newlines, tabs and non-breaking spaces all count as whitespace.

    Parameters
    ----------
    text : str
        The text to normalise.

    Returns
    -------
    str
        `text` with whitespace runs replaced by a single space and without
        leading or trailing spaces.
    """
    # A single raw-string pattern is enough: for str patterns \s already
    # matches "\n" and "\xa0", so separate passes for them are not needed.
    # The former first pass replaced an apostrophe with an apostrophe (a no-op)
    # and was dropped.
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip(" ")

In [None]:
%%time

df_questions['Title'] = df_questions['Title'].apply(lambda x: clean_text(x)) 
df_questions['Body'] = df_questions['Body'].apply(lambda x: clean_text(x))

In [None]:
# Display the body of the question labelled 11 as a sample of the result.
df_questions.loc[11, 'Body']

### 3.2 Remove contractions
(expand shortened words)

In [None]:
def expand_contractions(text):
    """Return `text` with shortened forms expanded, e.g. "don't" -> "do not".

    The work is delegated to `contractions.fix`.
    """
    return contractions.fix(text)

In [None]:
%%time

df_questions['Title'] = df_questions['Title'].apply(lambda x: expand_contractions(x)) 
df_questions['Body'] = df_questions['Body'].apply(lambda x: expand_contractions(x))

In [None]:
# Display the body of the question labelled 11 after expanding contractions.
df_questions.loc[11, 'Body']

### 3.3 Spelling correction(Optional)

In [22]:
def autocorrect(text):
    """Spell-correct `text` word by word and return it as one string.

    The text is split with the shared `token` tokenizer, each token is passed
    through the shared `spell` corrector, and the results are joined back
    with single spaces.
    """
    corrected = []
    for word in token.tokenize(text):
        corrected.append(str(spell(word)))
    return ' '.join(corrected)

In [23]:
# %%time

# df_questions['Title'] = df_questions['Title'].apply(lambda x: autocorrect(x)) 
# df_questions['Body'] = df_questions['Body'].apply(lambda x: autocorrect(x)) 

### 3.4 Lowering the text
(Lowercasing the text is a classic and useful step of noise removal or text normalization, since it reduces the vocabulary, normalizes the text and costs almost nothing.)

In [25]:
%%time

df_questions['Title'] = df_questions['Title'].str.lower()
df_questions['Body'] = df_questions['Body'].str.lower()

CPU times: user 1.11 ms, sys: 134 µs, total: 1.25 ms
Wall time: 1.18 ms


In [26]:
# Display the body of the question labelled 11 after lowercasing.
df_questions.loc[11, 'Body']

'i have got a menu in python. that part was easy. i am using raw_input() to get the selection from the user. the problem is that raw_input (and input) require the user to press enter after they make a selection. is there any way to make the program act immediately upon a keystroke? here is what i have got so far: import sys print """menu 1) say foo 2) say bar""" answer = raw_input("make a selection> ") if "1" in answer: print "foo" elif "2" in answer: print "bar" it would be great to have something like print menu while lastkey = "": lastkey = check_for_recent_keystrokes() if "1" in lastkey: #do stuff...'

## 4. Removing character
How does Punctuation Affect Neural Models in Natural Language
Inference. https://aclanthology.org/2020.pam-1.15.pdf 

### 4.1 Removing all non-alphabetical character(Optional)
(This step removes ALL non-alphabetical characters, including punctuation, numbers and special characters.)

In [27]:
def remove_punctuation_and_number(text):
    """Return `text` without any of the characters listed in `charac`.

    `charac` is the module-level string of punctuation marks and digits.
    """
    # The first two arguments map a space to itself (no change); the third
    # argument lists the characters to delete.
    deletion_table = str.maketrans(" ", " ", charac)
    return text.translate(deletion_table)



def remove_non_alphabetical_character(text):
    """Replace every run of characters outside a-z with a single space.

    Only lowercase ASCII letters are kept, so uppercase letters are removed
    as well; lowercase the text before calling this function.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The filtered text. Leading/trailing spaces produced by the
        replacement are not stripped.
    """
    # One pass is enough: each maximal run of non-letters (whitespace included)
    # becomes exactly one space, so no repeated whitespace can remain and a
    # second whitespace-collapsing pass would be a no-op.
    return re.sub(r"[^a-z]+", " ", text)

In [None]:
# %%time

# df_questions['Title'] = df_questions['Title'].apply(lambda x: remove_non_alphabetical_character(x)) 
# df_questions['Body'] = df_questions['Body'].apply(lambda x: remove_non_alphabetical_character(x)) 

### 4.2 Removing single character (optional)
I am also not sure whether all single characters should be removed: a single character rarely carries any meaning, e.g. the `a` in `a = "123"`.

In [28]:
def remove_single_letter(text):
    """Remove every stand-alone single word character from `text`.

    A "word character" is whatever the regex class for word characters
    matches, so isolated digits and underscores are removed along with
    isolated letters.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The text without one-character words, with whitespace runs collapsed
        to a single space and surrounding spaces stripped.
    """
    text = re.sub(r"\b\w\b", "", text)   # drop one-character words
    text = re.sub(r"\s+", " ", text)     # collapse the gaps left behind
    return text.strip(" ")

In [29]:
# %%time

# df_questions['Title'] = df_questions['Title'].apply(lambda x: remove_single_letter(x)) 
# df_questions['Body'] = df_questions['Body'].apply(lambda x: remove_single_letter(x)) 

## 5. Removing stopwords

### 5.1 Removing most frequent words

In [30]:
def remove_stopwords(text):
    """Drop every token of `text` that appears in the shared `stop_words` set.

    The text is split with the shared `token` tokenizer and the remaining
    tokens are joined back with single spaces.
    """
    kept = []
    for word in token.tokenize(text):
        if word in stop_words:
            continue
        kept.append(str(word))
    return ' '.join(kept)

In [31]:
# %%time

# df_questions['Title'] = df_questions['Title'].apply(lambda x: remove_stopwords(x))
# df_questions['Body'] = df_questions['Body'].apply(lambda x: remove_stopwords(x)) 

### 5.2 Removing adjectives (optional)
Maybe all adjectives should be removed, since they are unlikely to add useful information when every question is about Python programming?

In [32]:
def remove_by_tag(text, undesired_tag):
    """Drop the words of `text` whose nltk part-of-speech tag is unwanted.

    `undesired_tag` is a collection of tag names (adjectives, verbs, etc.).
    The text is tokenized, tagged with `nltk.pos_tag`, filtered, and joined
    back with single spaces.
    """
    tokens = token.tokenize(text)
    # pos_tag returns (word, tag) pairs, e.g. ("have", "VB").
    tagged = nltk.pos_tag(tokens=tokens, tagset=None, lang='eng')
    kept = (str(word) for word, tag in tagged if tag not in undesired_tag)
    return ' '.join(kept)

In [34]:
import nltk
# Download the part-of-speech tagger resource; presumably this is what
# nltk.pos_tag (called in remove_by_tag) needs, so run it before tagging.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zhi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [35]:
%%time
df_questions['Title'] = df_questions['Title'].apply(lambda x: remove_by_tag(x, adjective_tag_list))
df_questions['Body'] = df_questions['Body'].apply(lambda x: remove_by_tag(x, adjective_tag_list))

CPU times: user 496 ms, sys: 2.3 ms, total: 498 ms
Wall time: 497 ms
