In [1]:
import re
import numpy
from os import listdir
from os.path import join, abspath
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from modules.TextPreProcessor import removeShortDocs
from modules.TextPreProcessor import removeStopWords
from modules.TextPreProcessor import stemSentences
import pandas as pd
from collections import Counter

In [2]:
# Location of the pre-processed corpus features (absolute, machine-specific path).
PROCESSED_FEATURES_PATH = '/home/ifte/alechat_core/corpus/processed/processed_features.pkl'

# Load the pickled feature table; the next cell reads its `lines` column.
# NOTE(review): unpickling can execute arbitrary code, so only load trusted files.
df = pd.read_pickle(PROCESSED_FEATURES_PATH)

In [3]:
# Copy each row's sequence of text lines out of the `lines` column.
# The column values are iterated directly instead of being looked up with
# `df.lines[i]`: that lookup is label-based, so pairing it with
# `range(len(df))` only works while the index is exactly 0..n-1 and raises
# KeyError (or visits rows in label order) for any other index.
length = len(df)
collector = [list(row_lines) for row_lines in df['lines']]

In [4]:
# Flatten the per-row line lists into a single flat list of sentences.
all_sentences = []
for document_lines in collector:
    all_sentences.extend(document_lines)
def page_text_split(page_text, word_limit):
    """Split ``page_text`` into chunks of at most ``word_limit`` words.

    The text is tokenised on whitespace; each chunk is its words re-joined
    with single spaces. The last chunk may hold fewer than ``word_limit``
    words. Text without any words gives an empty list.
    """
    words = page_text.split()
    chunks = []
    for start in range(0, len(words), word_limit):
        chunks.append(' '.join(words[start:start + word_limit]))
    return chunks
# Rebuild `collector` as a flat list in which every sentence longer than
# 200 characters is replaced by its chunks of at most 15 words; shorter
# sentences are kept unchanged.
collector = []
for sentence in all_sentences:
    pieces = page_text_split(sentence, 15) if len(sentence) > 200 else [sentence]
    collector.extend(pieces)

In [5]:
# English stop-word list from NLTK, de-duplicated into a set.
nltk_stop_words = set(stopwords.words('english'))
# Porter stemmer instance for the (optional) stemming step.
ps = PorterStemmer()

In [6]:
# Keep only sentences longer than 25 characters, on the assumption that
# shorter ones carry little meaning.
sentences = []
for candidate in collector:
    if len(candidate) > 25:
        sentences.append(candidate)

# Strip the NLTK stop words from every remaining sentence.
processedSentences = removeStopWords(sentences, nltk_stop_words)

# Stemming of the sentence tokens is currently disabled:
# processedSentences = stemSentences(sentences, ps)

In [None]:
# Echo the stop-word-free sentences for inspection.
# NOTE(review): this displays the entire collection; consider showing only a
# small sample to keep the notebook output readable.
processedSentences

In [7]:
from spacy import load as spacy_model_load
spacy_model = spacy_model_load("en_core_web_sm")
# Part-of-speech tags kept as clustering features: proper nouns and nouns.
RECOMMEND_FEATURES = ['PROPN', 'NOUN']

# Reduce every sentence to the lemmas of its non-stop-word nouns.
# - `pipe` streams the texts through the model in batches and yields one Doc
#   per input, in input order, instead of invoking the pipeline once per
#   sentence.
# - The lemmas are gathered under their own name rather than the shared
#   `collector`, so this cell no longer overwrites the list that the
#   sentence-filtering cell reads and both cells stay safe to re-run.
processed = []
for doc in spacy_model.pipe(processedSentences):
    noun_lemmas = [
        str(token.lemma_)
        for token in doc
        if not token.is_stop and token.pos_ in RECOMMEND_FEATURES
    ]
    processed.append(' '.join(noun_lemmas))

In [8]:
# Aim for roughly 25 sentences per cluster, but never request fewer than one
# cluster: with under 25 sentences the plain division gives 0, which KMeans
# rejects as an invalid `n_clusters`.
cluster_count = max(1, len(processed) // 25)

In [9]:
# Build the TF-IDF matrix from the noun-lemma strings.
vector = TfidfVectorizer()
tfidf_matrix = vector.fit_transform(processed)

# Cluster the sentences into `cluster_count` groups. The seed is fixed because
# KMeans starts from randomly chosen centroids; without it the cluster ids
# change from run to run and the tables shown below cannot be reproduced.
kMeansCluster = KMeans(n_clusters=cluster_count, random_state=42)
kMeansCluster.fit(tfidf_matrix)
clusters = kMeansCluster.labels_.tolist()

In [27]:
# One row per sentence: its cluster id, the filtered sentence, and the
# stop-word-free version of that sentence.
rows = zip(clusters, sentences, processedSentences)
df = pd.DataFrame(rows, columns=['id', 'raw', 'processed'])

In [52]:
# Preview the first rows of the per-sentence cluster table.
df.head()

Unnamed: 0,id,raw,processed
0,17,inveitco inveitco blog custom pages clients bl...,inveitco inveitco blog custom pages clients bl...
1,2,us transitions left right animation fade up do...,us transitions left right animation fade anima...
2,15,text under image without space home about us s...,text image without space home us services solu...
3,9,us inveitco inveitco blog page template page ...,us inveitco inveitco blog page template page t...
4,4,ajax fade page not loaded qode theme ver,ajax fade page loaded qode theme ver


In [54]:
# Collect the sentences of each cluster into one list per cluster.
# A single groupby pass replaces re-filtering the whole frame once per cluster
# id. `sort=False` keeps the clusters in order of first appearance, which is
# the order `df.id.unique()` produced, and rows keep their original order
# inside each cluster, so the resulting lists are unchanged.
clusters_by_id = df.groupby('id', sort=False)
raw = [group.tolist() for _, group in clusters_by_id['raw']]
processed = [group.tolist() for _, group in clusters_by_id['processed']]

In [55]:
# One row per cluster; each cell holds the list of that cluster's sentences.
cluster_rows = zip(raw, processed)
df = pd.DataFrame(cluster_rows, columns=['raw', 'processed'])

In [56]:
# Preview the per-cluster table (each cell is a list of sentences).
df.head()

Unnamed: 0,raw,processed
0,[inveitco inveitco blog custom pages clients b...,[inveitco inveitco blog custom pages clients b...
1,[us transitions left right animation fade up d...,[us transitions left right animation fade anim...
2,[text under image without space home about us ...,[text image without space home us services sol...
3,[us inveitco inveitco blog page template page...,[us inveitco inveitco blog page template page ...
4,"[ajax fade page not loaded qode theme ver , ...","[ajax fade page loaded qode theme ver, page pa..."


In [57]:
# Pick a "header" for every cluster: the longest noun chunk spaCy finds in
# the cluster's sentences joined into a single text.
collector = []
length = len(df)
# Iterate the column values directly instead of the label lookup `df.raw[i]`.
for cluster_sentences in df['raw']:
    doc = spacy_model(' '.join(cluster_sentences))
    noun_chunks = [str(chunk) for chunk in doc.noun_chunks]
    # max() returns the first of the longest chunks, the same element the
    # previous stable descending sort put first. `default` gives an empty
    # header instead of an IndexError when the text has no noun chunk at all.
    collector.append(max(noun_chunks, key=len, default=''))

In [59]:
# Store the values gathered in `collector` as a new `header` column.
df['header'] = collector

In [60]:
# Preview the first rows of the table, now including the `header` column.
df.head()

Unnamed: 0,raw,processed,header
0,[inveitco inveitco blog custom pages clients b...,[inveitco inveitco blog custom pages clients b...,clients blog contact inveitco orca australia c...
1,[us transitions left right animation fade up d...,[us transitions left right animation fade anim...,animation portfolio gallery style gallery styl...
2,[text under image without space home about us ...,[text image without space home us services sol...,us services solutions portfolio blog contact text
3,[us inveitco inveitco blog page template page...,[us inveitco inveitco blog page template page ...,page template page template full width page te...
4,"[ajax fade page not loaded qode theme ver , ...","[ajax fade page loaded qode theme ver, page pa...",page parent ajax fade page


In [67]:
# Cut each header into consecutive 3-word chunks and keep only the chunks
# that hold more than 2 words, i.e. drop a shorter trailing chunk.
generator = [
    [chunk for chunk in page_text_split(header, 3) if len(chunk.split()) > 2]
    for header in df['header'].values
]

df['generator'] = generator


In [88]:
# Tokenise every distinct sentence found in the `raw` column.
# Duplicates are removed with `dict.fromkeys`, which keeps first-seen order.
# The previous `set` round trip iterated strings in hash order, which varies
# between interpreter runs, so the order of `data` (and everything generated
# from it) was not reproducible across kernel restarts.
unique_sentences = dict.fromkeys(
    sentence for cluster_sentences in df['raw'] for sentence in cluster_sentences
)
data = [sentence.split() for sentence in unique_sentences]

In [None]:
# Show only the first few token lists; echoing all of `data` floods the output.
data[:5]

In [90]:
    # Number of consecutive words that form one lookup key in the sequence table.
    TOKEN_SIZE = 3

    def sequence_frequency_dict(data):
        """Map every TOKEN_SIZE-word sequence in ``data`` to the words following it.

        ``data`` is an iterable of token lists. All lists are concatenated into
        one token stream first, so a sequence may span the end of one list and
        the start of the next.

        Returns a dict whose keys are the space-joined sequences and whose
        values list every word seen directly after that sequence, in order of
        occurrence (repeats are kept so they can be counted).
        """
        words_tokens = [s for sublist in data for s in sublist]
        sequence_dict = {}
        # Stop TOKEN_SIZE short of the end so every sequence has a next word.
        for c in range(len(words_tokens) - TOKEN_SIZE):
            seq = ' '.join(words_tokens[c:c + TOKEN_SIZE])
            sequence_dict.setdefault(seq, []).append(words_tokens[c + TOKEN_SIZE])
        return sequence_dict

    def reply_generate(query, data, sequence_dict=None, max_words=100):
        """Extend ``query`` word by word using the most frequent continuation.

        Parameters:
            query: space-joined seed text; it only produces output beyond
                itself when it equals a key of the sequence table.
            data: token lists used to build the sequence table when
                ``sequence_dict`` is not supplied.
            sequence_dict: optional table previously built by
                ``sequence_frequency_dict``; passing it avoids rebuilding the
                table on every call.
            max_words: upper bound on the number of appended words, which also
                bounds generation when the continuations form a cycle.

        Returns the query followed by the generated words, space separated.
        """
        if sequence_dict is None:
            sequence_dict = sequence_frequency_dict(data)
        curr_sequence = query
        result = curr_sequence
        for _ in range(max_words):
            possible_words = sequence_dict.get(curr_sequence)
            if not possible_words:
                break
            # Greedy choice: the continuation observed most often.
            result += ' ' + Counter(possible_words).most_common(1)[0][0]
            seq_words = result.split()
            # The last TOKEN_SIZE words become the next lookup key.
            curr_sequence = ' '.join(seq_words[len(seq_words) - TOKEN_SIZE:])
        return result

    def driver(queries, data):
        """Generate one reply per query and return them in query order.

        The sequence table is built once and shared by all queries; it used to
        be rebuilt from the full ``data`` for every single query.
        """
        queries = list(queries)
        if not queries:
            return []
        sequence_dict = sequence_frequency_dict(data)
        return [reply_generate(query, data, sequence_dict) for query in queries]

In [114]:
# Run the reply generator over the query chunks of every row.
collect = [driver(items, data) for items in df['generator'].values]

df['reply'] = collect

In [117]:
# Preview the first rows of the table, now including the `reply` column.
df.head()

Unnamed: 0,raw,processed,header,generator,reply
0,[inveitco inveitco blog custom pages clients b...,[inveitco inveitco blog custom pages clients b...,clients blog contact inveitco orca australia c...,"[clients blog contact, inveitco orca australia]",[clients blog contact us inveitco about us pag...
1,[us transitions left right animation fade up d...,[us transitions left right animation fade anim...,animation portfolio gallery style gallery styl...,"[animation portfolio gallery, style gallery st...",[animation portfolio gallery style gallery sty...
2,[text under image without space home about us ...,[text image without space home us services sol...,us services solutions portfolio blog contact text,"[us services solutions, portfolio blog contact]",[us services solutions portfolio blog contact ...
3,[us inveitco inveitco blog page template page...,[us inveitco inveitco blog page template page ...,page template page template full width page te...,"[page template page, template full width, page...",[page template page template full width page t...
4,"[ajax fade page not loaded qode theme ver , ...","[ajax fade page loaded qode theme ver, page pa...",page parent ajax fade page,[page parent ajax],[page parent ajax fade page not loaded qode th...
