In [105]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
from nltk.probability import FreqDist
import warnings

In [106]:
warnings.filterwarnings("ignore")

* *TransformerMixin* : assure we have fit_transform and we can write
 
* *BaseEstimator*    : assure that we have get_params and set_params for free

In [261]:
# custom Transformer, extracts column passed as arg to constructor
class FeatureSelector(BaseEstimator, TransformerMixin):

    # constructor
    def __init__(self, feature_names):
        self.feature_names = feature_names

    # return just self 
    def fit(self, X, y=None):
        return self

    # describe what we need this transformer to do 
    def transform(self, X, y=None):
        return X[self.feature_names].values.tolist()

In [260]:
# preprocessing of the data 
class Preprocess_text(BaseEstimator, TransformerMixin):

    # constructor
    def __init__(self, raw_data, ps):
        self.raw_data = raw_data
        self.ps = PortStemmer()
 
    # tokenize the data 
    def tokenize(self):
        self.tokenized_data = [word_tokenize(item) for item in raw_data]
        self.tokenized_data = [item for sublist in tokenized_data for item in sublist]
        return self.tokenized_data

    # get only alphabetic and numerical elements 
    def text_data(self):
        self.raw_data = [item.lower() for item in raw_data if item.isalnum()]
        return self.raw_data

    # remove stop words 
    def remove_sw(self):
        self.tokenized_data = [item for item in tokenized_data if item not in set(stopwords.words('english'))]

    # stemming
    def stemming(self):
        self.stemmed_data = [ps.stem(item) for item in self.tokenized_data]
        return stemmed_data 

In [255]:
class mohamed_transformer(BaseEstimator, TransformerMixin):

    # Constructor
    def __init__(self, raw_data):
        self.raw_data = [item.lower() for item in raw_data]
        self.tokens = [item.lower() for item in word_tokenize(" ".join(raw_data))]
        self.lenn = len(raw_data) # total number of documents 
        self.df = pd.DataFrame(columns=[f'doc {i}' for i in range(len(self.raw_data))], index=list(set([token.lower() for token in self.tokens])))
        self.norm_df = pd.DataFrame(columns=[f'doc {i}' for i in range(len(self.raw_data))], index=['max norm'])

    # get the dictionary
    def make_dictionary(self):
        self.dictionary = {text: i for i,text in enumerate(self.raw_data)}
        return self.dictionary

    def inverse_indexing(self):
        self.indexing = {key.lower(): [] for i, key in enumerate(self.tokens)}
        for _ in range(len(self.tokens)):
            # if token exists in doc
            token = self.tokens[_]
            for i in range(len(self.raw_data)):

                if token in self.raw_data[i]:
                    if self.dictionary[self.raw_data[i]] not in self.indexing[token]:
                        self.indexing[token].append(self.dictionary[self.raw_data[i]])
        return self.indexing

    def log_tf(self):
        self.df[:] = 0
        for token in self.tokens:

            for i, text in enumerate(self.raw_data):
                freq_dis = FreqDist(text)

                if token in freq_dis.keys():
                    self.df[f"doc {i}"][token] += freq_dis[token]
                else:
                    self.df[f"doc {i}"][token] = 1
        for col in self.df:
            self.df[col] = self.df[col].apply(lambda x: 1 + math.log(x))
            
        self.df = self.df.apply(pd.to_numeric, errors='coerce')
        return self.df

    def prob_idf(self):
        self.dictionary_idf = {key: max(0, math.log((self.lenn-len(self.indexing[key])/len(self.indexing[key])))) for i, key in enumerate(self.tokens)}
        return self.dictionary_idf

    def get_norm(self):
        for col in self.df.columns:
            value = max(self.df[col])
            self.norm_df[col]['max norm'] = value

        return self.norm_df

    # tf * idf / norm for each token
    def result(self): 
        self.res = []
        for text in self.raw_data:
            tokenized = word_tokenize(text)
            x = []
            for i in range(len(tokenized)):
                tf_i = self.df[f'doc {self.dictionary[text]}'][tokenized[i]] 
                idf_i = self.dictionary_idf[tokenized[i]]
                norm_i = self.norm_df[f"doc {self.dictionary[text]}"]['max norm']
                x.append(round(tf_i * idf_i / norm_i,2))
            self.res.append(x)
        return self.res  

In [258]:
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "To be or not to be, that is the question.",
    "Actions speak louder than words."
]


In [259]:
instance = mohamed_transformer(sentences)
instance.make_dictionary()
instance.inverse_indexing()
instance.log_tf()
instance.prob_idf()
instance.get_norm()
instance.result()

[[0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.69],
 [0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.69],
 [0.33, 0.33, 0.33, 0.33, 0.33, 0.69]]