In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import os
from os.path import join
import re
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from functools import reduce
from operator import add
from nltk.corpus import stopwords
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class TextNormalize(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Replacing the new line symbols wiht space.
        X = X.replace('\n', ' ')
        
        X = X.replace('\r', ' ')
        
        # Bringing the text to the lower case.
        X = X.lower()
        
        # Extracting only the words from the text.
        X = ' '.join(re.findall('[a-z]+', X))
        
        return X
            

In [3]:
text = 'The smooth muscle ﬁbres taper at both ends (fusiform) and do not\nshow striations (Figure 7.7b). Cell junctions hold them together and they\nare bundled together in a connective tissue sheath. The wall of internal\norgans such as the blood vessels, stomach and intestine contains this type\nof muscle tissue. Smooth muscles are ‘involuntary’ as their functioning\ncannot be directly controlled. We usually are not able to make it contract\nmerely by thinking about it as we can do with skeletal muscles.\n\nCardiac muscle tissue is a contractile tissue present only in the heart.\nCell junctions fuse the plasma membranes of cardiac muscle cells and\nmake them stick together (Figure 7.70). Communication junctions\n(intercalated discs) at some fusion points allow the cells to contract as a\nunit, i.e., when one cell receives a signal to contract, its neighbours are\nalso stimulated to contract.'
text

'The smooth muscle ﬁbres taper at both ends (fusiform) and do not\nshow striations (Figure 7.7b). Cell junctions hold them together and they\nare bundled together in a connective tissue sheath. The wall of internal\norgans such as the blood vessels, stomach and intestine contains this type\nof muscle tissue. Smooth muscles are ‘involuntary’ as their functioning\ncannot be directly controlled. We usually are not able to make it contract\nmerely by thinking about it as we can do with skeletal muscles.\n\nCardiac muscle tissue is a contractile tissue present only in the heart.\nCell junctions fuse the plasma membranes of cardiac muscle cells and\nmake them stick together (Figure 7.70). Communication junctions\n(intercalated discs) at some fusion points allow the cells to contract as a\nunit, i.e., when one cell receives a signal to contract, its neighbours are\nalso stimulated to contract.'

In [4]:
obj = TextNormalize()
obj.transform(text)

'the smooth muscle bres taper at both ends fusiform and do not show striations figure b cell junctions hold them together and they are bundled together in a connective tissue sheath the wall of internal organs such as the blood vessels stomach and intestine contains this type of muscle tissue smooth muscles are involuntary as their functioning cannot be directly controlled we usually are not able to make it contract merely by thinking about it as we can do with skeletal muscles cardiac muscle tissue is a contractile tissue present only in the heart cell junctions fuse the plasma membranes of cardiac muscle cells and make them stick together figure communication junctions intercalated discs at some fusion points allow the cells to contract as a unit i e when one cell receives a signal to contract its neighbours are also stimulated to contract'

In [5]:
class WordExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words = None):
        self.__stop_words = stop_words
    def fit(self, X, y=None):
        self.__hapaxes = []
        
        fdist = FreqDist()
        
        self.__hapaxes = fdist.hapaxes()
        return self
    def transform(self, X, y=None):
        
        self.fit(X)
        self.common_hapaxes = list(reduce(add, [self.__hapaxes]))
        X = ' '.join([word for word in word_tokenize(X)
                                        if word not in self.common_hapaxes])
        
        if self.__stop_words is not None:
            X = ' '.join([word for word in word_tokenize(X)
                            if word not in self.__stop_words])
        return X

In [6]:
obj2 = WordExtractor(stopwords.words('english'))
obj2.transform(text)

'The smooth muscle ﬁbres taper ends ( fusiform ) show striations ( Figure 7.7b ) . Cell junctions hold together bundled together connective tissue sheath . The wall internal organs blood vessels , stomach intestine contains type muscle tissue . Smooth muscles ‘ involuntary ’ functioning directly controlled . We usually able make contract merely thinking skeletal muscles . Cardiac muscle tissue contractile tissue present heart . Cell junctions fuse plasma membranes cardiac muscle cells make stick together ( Figure 7.70 ) . Communication junctions ( intercalated discs ) fusion points allow cells contract unit , i.e . , one cell receives signal contract , neighbours also stimulated contract .'

In [7]:
class ApplyStemmer(BaseEstimator, TransformerMixin):
    def __init__(self, stemmer = None):
        self.stemmer = stemmer
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X =  ' '.join([self.stemmer.stem(word) for word in word_tokenize(X)])
        return X

In [8]:
obj3 = ApplyStemmer(PorterStemmer())
obj3.transform(text)

'the smooth muscl ﬁbre taper at both end ( fusiform ) and do not show striation ( figur 7.7b ) . cell junction hold them togeth and they are bundl togeth in a connect tissu sheath . the wall of intern organ such as the blood vessel , stomach and intestin contain thi type of muscl tissu . smooth muscl are ‘ involuntari ’ as their function can not be directli control . We usual are not abl to make it contract mere by think about it as we can do with skelet muscl . cardiac muscl tissu is a contractil tissu present onli in the heart . cell junction fuse the plasma membran of cardiac muscl cell and make them stick togeth ( figur 7.70 ) . commun junction ( intercal disc ) at some fusion point allow the cell to contract as a unit , i.e. , when one cell receiv a signal to contract , it neighbour are also stimul to contract .'