In [1]:
import os
import sys
import math
import pandas as pd
import nltk as nl
import numpy as np
import json
import gensim.models.keyedvectors as word2vec
import re
import skfuzzy as fuzz
from jgtextrank import keywords_extraction

import pyswarms as ps
from pyswarms.utils.functions import single_obj as fx

In [2]:
# from nltk.stem.porter import *
from functools import singledispatch
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from string import punctuation

from collections import Counter

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from pyeasyga import pyeasyga
# from geneticalgorithm import geneticalgorithm as ga

from lexrank import STOPWORDS, LexRank

2021-04-26 22:17:32,050 - filelock - INFO - Lock 1714673706224 acquired on C:\ProgramData\Anaconda3\lib\site-packages\urlextract\data\tlds-alpha-by-domain.txt.lock
2021-04-26 22:17:32,078 - filelock - INFO - Lock 1714673706224 released on C:\ProgramData\Anaconda3\lib\site-packages\urlextract\data\tlds-alpha-by-domain.txt.lock


In [3]:
# from pprint import pprint
from fuzzy_logic.terms import Term
from fuzzy_logic.variables import FuzzyVariable
from fuzzy_logic.mamdani_fs import MamdaniFuzzySystem
from fuzzy_logic.mf import TriangularMF

In [4]:
@singledispatch
def to_serializable(val):
    """Used by default."""
    return str(val)

@to_serializable.register(np.float32)
def ts_float32(val):
    """Used if *val* is an instance of numpy.float32."""
    return np.float64(val)

# Text Pre-Processing

In [5]:
# stemmer = PorterStemmer()
stemmer = SnowballStemmer(language='english')

In [6]:
def remove_stop_words(words):
    """
    Will remove stop words from a list
    :param words: input list
    :return: output list of words without stop words.
    """
    custom_stop_words = set(stopwords.words('english') + list(punctuation))
    return [stemmer.stem(word) for word in words if word not in custom_stop_words]

In [7]:
def tokenize(text, stop_words):
    """
    Tokenizes a given text and also removes stop words.
    :param text:
    :param stop_words:
    :return:
    """
    words = nl.word_tokenize(text)
    words = [w.lower() for w in words]
    return [w for w in words if w not in stop_words and not w.isdigit()]

In [8]:
def text_cleaner(in_text):
    """

    :param in_text:
    :return:
    """
    cleaned_text = re.sub(r'([a-zA-Z])\\1{2,}', r'$1', in_text)
    cleaned_text = re.sub("\S*\d\S*", "", cleaned_text).strip()
    return re.sub(r'[^a-zA-Z0-9\s,.]', '', cleaned_text)

In [9]:
def get_cleaned_text(text):
    """

    :param text:
    :return:
    """

    in_text = text.lower()
    cleaned_text = text_cleaner(in_text)
    tokens = nl.word_tokenize(cleaned_text)

    # TODO : Use setemimng here.
    return remove_stop_words(tokens)

# Util Functions

In [10]:
def get_sen_length(sen):
    return len(nl.word_tokenize(sen))

In [11]:
# def get_sen_pos(postion, total_sens):
#     postion = 1 if postion == 0 else postion
#     return (postion - 1) / total_sens

def get_sen_pos(postion, total_sens):
    postion += 1
    postion = 1 if postion == total_sens else postion
    postion = 2 if postion == (total_sens - 1) else postion
    return (total_sens - postion) / total_sens

In [12]:
def get_proper_nouns(sentence):
    nnp = [word for word, pos in nl.pos_tag(nl.word_tokenize(str(sentence))) if pos == 'NNP']
    return len(nnp)

In [13]:
def extract_np(psent):
    for subtree in psent.subtrees():
        if subtree.label() == 'NP':
            yield ' '.join(word for word, tag in subtree.leaves())

In [14]:
def get_np_vps(sen):
    # Todo check grammar for verb phrases
    grammar = r"""
          NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
          {<NNP>+}                # chunk sequences of proper nouns
          {<NN>+}                 # chunk consecutive nouns
          VP: {<VB.*><NP|PP|CLAUSE>+$}
          """
    cp = nl.RegexpParser(grammar)

    tagged_sent = nl.pos_tag(sen.split())
    parsed_sent = cp.parse(tagged_sent)
    nps = []
    for npstr in extract_np(parsed_sent):
        nps.append(npstr)
    return nps

In [15]:
def vectorize_sent(sent, model):
    # print(pre_process(sent))
    return np.mean([model[w] for w in get_cleaned_text(sent) if w in model]
                   or [np.zeros(300)], axis=0)

In [16]:
def get_sen_vec_list(sens, model):
    index_vec_list = []
    for i, sen in enumerate(sens):
        vec = vectorize_sent(sen, model)
        index_vec_list.append(vec)
    return index_vec_list

In [17]:
def calculate_tf_idf_sum(sen, D, df_map):
    tokens = get_cleaned_text(sen)
    tf_map = dict(Counter(tokens))
    tf_idf_map = {k: (v * math.log(D / df_map.get(k, 1))) for k, v in tf_map.items()}
    return sum(tf_idf_map.values())

In [18]:
def get_sen_cohesiveness(sens, model):
    index_vec_list = get_sen_vec_list(sens, model)
    sen_score_map = {}
    # print(index_vec_list)
    c_s_array = (cosine_similarity(index_vec_list, index_vec_list))
    cohesiveness = (c_s_array.sum(axis=0)) - 1
    return {i: v for i, v in enumerate(cohesiveness)}

In [19]:
def get_sentences(text):
    return nl.sent_tokenize(text)

In [20]:
def get_val(val, min_val, max_val):
    min_val = 1 if min_val <= 0 else min_val
    return (val - min_val) / (max_val - min_val)

In [21]:
def load_model(model_path):
    return word2vec.KeyedVectors.load_word2vec_format(model_path, binary=True)

# Base Summrizer

In [22]:
# nltk.download('reuters')
from nltk.corpus import reuters, stopwords

In [23]:
def normalize_values(res_arr):
    # print([x['tf_idf'] for x in res_arr])
    min_max_map = {}
    for k in ['tf_idf', 'sen_length', 'proper_nouns', 'np_vps', 'c_score', 'sen_pos']:
        min_max_map.setdefault(k, {})['max'] = max([x[k] for x in res_arr])
        min_max_map[k]['min'] = min([x[k] for x in res_arr])
    result = []
    # print(min_max_map)
    for r in res_arr:
        tf_idf = get_val(r['tf_idf'], min_max_map['tf_idf']['min'], min_max_map['tf_idf']['max'])
        sen_length = get_val(r['sen_length'], min_max_map['sen_length']['min'], min_max_map['sen_length']['max'])
        proper_nouns = get_val(r['proper_nouns'], min_max_map['proper_nouns']['min'],
                               min_max_map['proper_nouns']['max'])
        np_vps = get_val(r['np_vps'], min_max_map['np_vps']['min'], min_max_map['np_vps']['max'])
        c_score = get_val(r['c_score'], min_max_map['c_score']['min'], min_max_map['c_score']['max'])
        sen_pos = r['sen_pos']
        total_score = 0.7 * tf_idf + 0.05 * sen_length + 0.025 * proper_nouns + \
                      0.025 * np_vps + 0.15 * c_score + 0.15 * sen_pos
        obj = {'tf_idf': 0.7 * tf_idf, 'sen_length': 0.05 * sen_length, 'proper_nouns': 0.025 * proper_nouns,
               'np_vps': 0.025 * np_vps,
               'c_score': 0.15 * c_score, 'sen_pos': 0.15 * sen_pos, 'total_score': total_score, 'index': r['index']}
        result.append(obj)
    return result

In [24]:
def get_final_sen_ranks(text, D, df_map, model):
    curpath = os.path.abspath(os.curdir)
    sens = get_sentences(text)
    t = len(sens)
    coh_map = get_sen_cohesiveness(sens, model)
    result_array = []

    for i, s in enumerate(sens):
        # print(s)
        tf_idf = calculate_tf_idf_sum(s, D, df_map)
        sen_length = get_sen_length(s)
        proper_nouns = get_proper_nouns(s)
        np_vps = len(get_np_vps(s))
        c_score = coh_map[i]
        pos = get_sen_pos(i, t)
        rank_obj = {'tf_idf': tf_idf, 'sen_length': sen_length, 'proper_nouns': proper_nouns,
                    'np_vps': np_vps, 'c_score': c_score, 'sen_pos': pos, 'index': i}
        # print("   ", rank_obj)
        result_array.append(rank_obj)
    result = normalize_values(result_array)
    wr = open(os.path.join(curpath, '/tmp/score.json'), "w")
    for i, r in enumerate(result):
        r["sentence"] = sens[i]
        wr.write(json.dumps(r) + "\n")
    return result, sens

In [25]:
def return_postive_value(val):
    if(val < 0):
        val = -val
    return val

In [26]:
def fuzzy_model(res_array):
    min_max_map = {}    
    result = []
    
    t1 = Term('L', TriangularMF(0, 0.1, 0.2))
    t2 = Term('H', TriangularMF(0.2, 0.5, 1.0))

    tf_idf: FuzzyVariable = FuzzyVariable('tf_idf', 0.0, 1.0, t1, t2)
    sen_length: FuzzyVariable = FuzzyVariable('sen_length', 0.0, 1.0, t1, t2)
    proper_nouns: FuzzyVariable = FuzzyVariable('proper_nouns', 0.0, 1.0, t1, t2)
    np_vps: FuzzyVariable = FuzzyVariable('np_vps', 0.0, 1.0, t1, t2)
    c_score: FuzzyVariable = FuzzyVariable('c_score', 0.0, 1.0, t1, t2)
    sen_pos: FuzzyVariable = FuzzyVariable('sen_pos', 0.0, 1.0, t1, t2)
        
    output = FuzzyVariable(
        'output', 0.0, 1.0,
        Term('unimportant', TriangularMF(0.0, 0.1, 0.2)),
        Term('average', TriangularMF(0.2, 0.3, 0.4)),
        Term('good', TriangularMF(0.4, 0.5, 0.7)),
        Term('important', TriangularMF(0.7, 0.8, 1)),
    )
    
    mf: MamdaniFuzzySystem = MamdaniFuzzySystem([tf_idf, sen_length, proper_nouns, np_vps, c_score, sen_pos], [output])

    fuzzy_rules = pd.read_csv(r'C:\Users\Mohsin\Desktop\Desktop\Fuzzy Logics Rules (Thesis)\rules_v3.csv')
    for item in fuzzy_rules.values:
        mf.rules.append(mf.parse_rule("if (tf_idf is " + item [0] + ") and (sen_length is " + item [1] + ") and (proper_nouns is " + item [2] + ") and (np_vps is " + item [3] + ") and (c_score is " + item [4] + ") and (sen_pos is " + item [5] + ") then (output is " + item [6] + ")"))
        
        
    for k in ['tf_idf', 'sen_length', 'proper_nouns', 'np_vps', 'c_score', 'sen_pos']:
        min_max_map.setdefault(k, {})['max'] = max([x[k] for x in res_array])
        min_max_map[k]['min'] = min([x[k] for x in res_array])

        
    for r in res_array:
        
        tfIdf = get_val(r['tf_idf'], min_max_map['tf_idf']['min'], min_max_map['tf_idf']['max'])
        senLength = get_val(r['sen_length'], min_max_map['sen_length']['min'], min_max_map['sen_length']['max'])
        properNouns = get_val(r['proper_nouns'], min_max_map['proper_nouns']['min'], min_max_map['proper_nouns']['max']) 
        npVps = get_val(r['np_vps'], min_max_map['np_vps']['min'], min_max_map['np_vps']['max'])
        cScore = get_val(r['c_score'], min_max_map['c_score']['min'], min_max_map['c_score']['max']) 
        senPos = ((r['sen_pos'] - min_max_map['sen_pos']['min'])/(min_max_map['sen_pos']['max'] - min_max_map['sen_pos']['min']))
        
        resultant_value = mf.calculate({
            tf_idf: ts_float32(return_postive_value(tfIdf)), 
            sen_length: ts_float32(return_postive_value(senLength)), 
            proper_nouns: ts_float32(return_postive_value(properNouns)), 
            np_vps: ts_float32(return_postive_value(npVps)), 
            c_score: ts_float32(return_postive_value(cScore)), 
            sen_pos: ts_float32(return_postive_value(senPos))
        })
        
        fuzzy_score = [val for val in resultant_value.values()]
        
        print("fuzzay Score: ", fuzzy_score)
        
        obj = {
            'tf_idf': ts_float32(return_postive_value(tfIdf)),
            'sen_length': ts_float32(return_postive_value(senLength)),
            'proper_nouns': ts_float32(return_postive_value(properNouns)),
            'np_vps': ts_float32(return_postive_value(npVps)),
            'c_score': ts_float32(return_postive_value(cScore)),
            'sen_pos': ts_float32(return_postive_value(senPos)),
            'total_score': fuzzy_score[0],
            'index': r['index']
        }
        
        result.append(obj)
    
    return result

In [27]:
def fuzzy_ranking_model(text, D, df_map, model):
    curpath = os.path.abspath(os.curdir)
    sens = get_sentences(text)
    t = len(sens)
    coh_map = get_sen_cohesiveness(sens, model)
    result_array = []
    
    
    for i, s in enumerate(sens):
        # print(s)
        tf_idf = calculate_tf_idf_sum(s, D, df_map)
        sen_length = get_sen_length(s)
        proper_nouns = get_proper_nouns(s)
        np_vps = len(get_np_vps(s))
        c_score = coh_map[i]
        pos = get_sen_pos(i, t)
    
        rank_obj = {
            'tf_idf': tf_idf, 
            'sen_length': sen_length, 
            'proper_nouns': proper_nouns,
            'np_vps': np_vps,
            'c_score': c_score,
            'sen_pos': pos,
            'index': i
        }
        
        result_array.append(rank_obj)
        
        
    result = fuzzy_model(result_array)

    wr = open(os.path.join(curpath, '/tmp/score.json'), "w")
    for i, r in enumerate(result):
        r["sentence"] = sens[i]
        wr.write(json.dumps(r) + "\n")
    return result, sens

In [28]:
def fitness (individual, data):
    fitness = 0
    for (selected, (feature_label, score)) in zip(individual, data):
        if selected:
            fitness += score            
    return fitness

# def fitness (individual, data):
#     fitness = 0
#     if individual.count(1) == 3:
#         for (selected, (feature_label, score)) in zip(individual, data):
#             if selected:
#                 fitness += score    
#     return fitness

In [29]:
def ga_model(res_array):
    min_max_map = {}    
    result = [] 
#     senetnce_scores = []
        
    for k in ['tf_idf', 'sen_length', 'proper_nouns', 'np_vps', 'c_score', 'sen_pos']:
        min_max_map.setdefault(k, {})['max'] = max([x[k] for x in res_array])
        min_max_map[k]['min'] = min([x[k] for x in res_array])
    
        
    for r in res_array:
        
        tfIdf = get_val(r['tf_idf'], min_max_map['tf_idf']['min'], min_max_map['tf_idf']['max'])
        senLength = get_val(r['sen_length'], min_max_map['sen_length']['min'], min_max_map['sen_length']['max'])
        properNouns = get_val(r['proper_nouns'], min_max_map['proper_nouns']['min'], min_max_map['proper_nouns']['max']) 
        npVps = get_val(r['np_vps'], min_max_map['np_vps']['min'], min_max_map['np_vps']['max'])
        cScore = get_val(r['c_score'], min_max_map['c_score']['min'], min_max_map['c_score']['max']) 
        senPos = ((r['sen_pos'] - min_max_map['sen_pos']['min'])/(min_max_map['sen_pos']['max'] - min_max_map['sen_pos']['min']))
        
        data = [('tfidf', tfIdf), ('sen_length', senLength), ('proper_nouns', properNouns), ('np_vps', npVps), ('c_score', cScore), ('sen_pos', senPos)]
        
        ga = pyeasyga.GeneticAlgorithm(
            data,
            population_size = 100,
            generations = 100,
            crossover_probability = 0.8,
            mutation_probability = 0.2,
            elitism = True,
            maximise_fitness = True
        )
        
        ga.fitness_function = fitness
        
        ga.run()
        
        res = ga.best_individual()
        
        totalScore = res[0] * ts_float32(return_postive_value(tfIdf)) + res[0] * ts_float32(return_postive_value(senLength)) + res[0] * ts_float32(return_postive_value(properNouns)) + res[0] * ts_float32(return_postive_value(npVps)) + res[0] * ts_float32(return_postive_value(cScore)) + res[0] * ts_float32(return_postive_value(senPos))
        
#         totalScore = totalScore * res[0] 
        
        obj = {
            'tf_idf': res[0] * ts_float32(return_postive_value(tfIdf)),
            'sen_length': res[0] * ts_float32(return_postive_value(senLength)),
            'proper_nouns': res[0] * ts_float32(return_postive_value(properNouns)),
            'np_vps': res[0] * ts_float32(return_postive_value(npVps)),
            'c_score': res[0] * ts_float32(return_postive_value(cScore)),
            'sen_pos': res[0] * ts_float32(return_postive_value(senPos)),
#             'total_score': res[0],
            'total_score': totalScore,
            'index': r['index']
        }
#         senetnce_scores.append(res[0])
        result.append(obj)
        
#     for val in result:
#         val['total_score'] = (val['total_score'] - min(senetnce_scores)) / (max(senetnce_scores) - min(senetnce_scores))
    
    return result

In [30]:
def ga_ranking_model(text, D, df_map, model):
    curpath = os.path.abspath(os.curdir)
    sens = get_sentences(text)
    t = len(sens)
    coh_map = get_sen_cohesiveness(sens, model)
    result_array = []
    
    
    for i, s in enumerate(sens):
        # print(s)
        tf_idf = calculate_tf_idf_sum(s, D, df_map)
        sen_length = get_sen_length(s)
        proper_nouns = get_proper_nouns(s)
        np_vps = len(get_np_vps(s))
        c_score = coh_map[i]
        pos = get_sen_pos(i, t)
    
        rank_obj = {
            'tf_idf': tf_idf, 
            'sen_length': sen_length, 
            'proper_nouns': proper_nouns,
            'np_vps': np_vps,
            'c_score': c_score,
            'sen_pos': pos,
            'index': i
        }
        
        result_array.append(rank_obj)
        
        
    result = ga_model(result_array)

    wr = open(os.path.join(curpath, '/tmp/score.json'), "w")
    for i, r in enumerate(result):
        r["sentence"] = sens[i]
        wr.write(json.dumps(r) + "\n")
    return result, sens

In [31]:
def pso_model(res_array):
    min_max_map = {}    
    result = [] 
#     senetnce_scores = []
        
    for k in ['tf_idf', 'sen_length', 'proper_nouns', 'np_vps', 'c_score', 'sen_pos']:
        min_max_map.setdefault(k, {})['max'] = max([x[k] for x in res_array])
        min_max_map[k]['min'] = min([x[k] for x in res_array])
    
        
    for r in res_array:
        
        tfIdf = get_val(r['tf_idf'], min_max_map['tf_idf']['min'], min_max_map['tf_idf']['max'])
        senLength = get_val(r['sen_length'], min_max_map['sen_length']['min'], min_max_map['sen_length']['max'])
        properNouns = get_val(r['proper_nouns'], min_max_map['proper_nouns']['min'], min_max_map['proper_nouns']['max']) 
        npVps = get_val(r['np_vps'], min_max_map['np_vps']['min'], min_max_map['np_vps']['max'])
        cScore = get_val(r['c_score'], min_max_map['c_score']['min'], min_max_map['c_score']['max']) 
        senPos = ((r['sen_pos'] - min_max_map['sen_pos']['min'])/(min_max_map['sen_pos']['max'] - min_max_map['sen_pos']['min']))
        
#         options = {'c1': 0.5, 'c2': 0.3, 'w':0.9}
        options = {
            'c1': tfIdf, 
            'c2': senLength,
            'c3': properNouns,
            'c4': npVps,
            'c5': cScore,
            'c6': senPos,
            'w': 0.75
        }
        
        data  = [tfIdf, senLength, properNouns, npVps, cScore, senPos]
        
        
        optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=6, options=options)
        
        cost, pos = optimizer.optimize(fx.sphere, iters=100)
    
#         res = cost
        
        totalScore = cost * ts_float32(return_postive_value(tfIdf)) + cost * ts_float32(return_postive_value(senLength)) + cost * ts_float32(return_postive_value(properNouns)) + cost * ts_float32(return_postive_value(npVps)) + cost * ts_float32(return_postive_value(cScore)) + cost * ts_float32(return_postive_value(senPos))
        
#         totalScore = totalScore * res[0] 
        
        obj = {
            'tf_idf': cost * ts_float32(return_postive_value(tfIdf)),
            'sen_length': cost * ts_float32(return_postive_value(senLength)),
            'proper_nouns': cost * ts_float32(return_postive_value(properNouns)),
            'np_vps': cost * ts_float32(return_postive_value(npVps)),
            'c_score': cost * ts_float32(return_postive_value(cScore)),
            'sen_pos': cost * ts_float32(return_postive_value(senPos)),
#             'total_score': cost,
            'total_score': totalScore,
            'index': r['index']
        }
#         senetnce_scores.append(res[0])
        result.append(obj)
        
#     for val in result:
#         val['total_score'] = (val['total_score'] - min(senetnce_scores)) / (max(senetnce_scores) - min(senetnce_scores))
    
    return result

In [32]:
def pso_ranking_model(text, D, df_map, model):
    curpath = os.path.abspath(os.curdir)
    sens = get_sentences(text)
    t = len(sens)
    coh_map = get_sen_cohesiveness(sens, model)
    result_array = []
    
    
    for i, s in enumerate(sens):
        # print(s)
        tf_idf = calculate_tf_idf_sum(s, D, df_map)
        sen_length = get_sen_length(s)
        proper_nouns = get_proper_nouns(s)
        np_vps = len(get_np_vps(s))
        c_score = coh_map[i]
        pos = get_sen_pos(i, t)
    
        rank_obj = {
            'tf_idf': tf_idf, 
            'sen_length': sen_length, 
            'proper_nouns': proper_nouns,
            'np_vps': np_vps,
            'c_score': c_score,
            'sen_pos': pos,
            'index': i
        }
        
        result_array.append(rank_obj)
        
        
    result = pso_model(result_array)

    wr = open(os.path.join(curpath, '/tmp/score.json'), "w")
    for i, r in enumerate(result):
        r["sentence"] = sens[i]
        wr.write(json.dumps(r) + "\n")
    return result, sens

In [33]:
def rank__V2(res_array, senetnces):
    min_max_map = {}    
    result = [] 
    sents_ranks = []
#     senetnce_scores = []
        
    for k in ['tf_idf', 'sen_length', 'proper_nouns', 'np_vps', 'c_score', 'sen_pos']:
        min_max_map.setdefault(k, {})['max'] = max([x[k] for x in res_array])
        min_max_map[k]['min'] = min([x[k] for x in res_array])
        
    for r in res_array:
        
        tfIdf = get_val(r['tf_idf'], min_max_map['tf_idf']['min'], min_max_map['tf_idf']['max'])
        senLength = get_val(r['sen_length'], min_max_map['sen_length']['min'], min_max_map['sen_length']['max'])
        properNouns = get_val(r['proper_nouns'], min_max_map['proper_nouns']['min'], min_max_map['proper_nouns']['max']) 
        npVps = get_val(r['np_vps'], min_max_map['np_vps']['min'], min_max_map['np_vps']['max'])
        cScore = get_val(r['c_score'], min_max_map['c_score']['min'], min_max_map['c_score']['max']) 
#         senPos = ((r['sen_pos'] - min_max_map['sen_pos']['min'])/(min_max_map['sen_pos']['max'] - min_max_map['sen_pos']['min']))
        senPos = r['sen_pos']
    
        sent_text = senetnces[r['index']]
        
        res = keywords_extraction(sent_text, top_p = 1, directed=False)[0][:len(sent_text.split())]
        
        resValues = res.values()
        
        resTotal = 0
        
        for val in resValues:
            resTotal += val
        
        totalScore = ts_float32(return_postive_value(tfIdf)) + ts_float32(return_postive_value(senLength)) + ts_float32(return_postive_value(properNouns)) + ts_float32(return_postive_value(npVps)) + ts_float32(return_postive_value(cScore)) + ts_float32(return_postive_value(senPos))
        
#         totalScore = totalScore * resTotal
        
        obj = {
            'tf_idf': ts_float32(return_postive_value(tfIdf)),
            'sen_length': ts_float32(return_postive_value(senLength)),
            'proper_nouns': ts_float32(return_postive_value(properNouns)),
            'np_vps': ts_float32(return_postive_value(npVps)),
            'c_score': ts_float32(return_postive_value(cScore)),
            'sen_pos': ts_float32(return_postive_value(senPos)),
#             'total_score': res[0],
            'total_score': totalScore,
            'index': r['index']
        }
#         senetnce_scores.append(res[0])
        result.append(obj)
        
#     for val in result:
#         val['total_score'] = (val['total_score'] - min(senetnce_scores)) / (max(senetnce_scores) - min(senetnce_scores))
    
    return result

In [34]:
def ranking_model_v2(text, D, df_map, model):
    curpath = os.path.abspath(os.curdir)
    sens = get_sentences(text)
    t = len(sens)
    coh_map = get_sen_cohesiveness(sens, model)
    result_array = []
    
    
    for i, s in enumerate(sens):
        # print(s)
        tf_idf = calculate_tf_idf_sum(s, D, df_map)
        sen_length = get_sen_length(s)
        proper_nouns = get_proper_nouns(s)
        np_vps = len(get_np_vps(s))
        c_score = coh_map[i]
        pos = get_sen_pos(i, t)
    
        rank_obj = {
            'tf_idf': tf_idf, 
            'sen_length': sen_length, 
            'proper_nouns': proper_nouns,
            'np_vps': np_vps,
            'c_score': c_score,
            'sen_pos': pos,
            'index': i
        }
        
        result_array.append(rank_obj)
        
        
    result = rank__V2(result_array, sens)

    wr = open(os.path.join(curpath, '/tmp/score.json'), "w")
    for i, r in enumerate(result):
        r["sentence"] = sens[i]
        wr.write(json.dumps(r) + "\n")
    return result, sens

In [35]:
def get_sen_cluster_map(sens, model, n_clusters):
    X = get_sen_vec_list(sens, model)
#     kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
#     agglomerativeClustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=0.75).fit(X)
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(np.transpose(X), n_clusters, n_clusters, error=0.005, maxiter=1000, init=None)
#     labels = agglomerativeClustering.labels_
    labels = np.argmax(u, axis=0)
#     labels = kmeans.labels_
    return labels

In [36]:
def get_summary_indices(cluster_score_map, sens, sum_words):
    sorted_cl_map = {}
    for key, score_map in cluster_score_map.items():
        sorted_by_value = sorted(score_map.items(), key=lambda kv: kv[1], reverse=True)
        sorted_cl_map[key] = sorted_by_value

    # Get final sentence indices.
    final_sumamry_indices = []
    x = 0
    i = 0
    print(' summary words ', sum_words)
    while (x < sum_words):
        for key, score_map in sorted_cl_map.items():

            # The below if will ensure that  current cluster has ith senetence, it can not be always the case.
            if (len(score_map) > i):

                # This condition is for breaking during loop if num words are conaition is stastified.
                if x >= sum_words:
                    break

                index = score_map[i][0]
                token_len = len(nl.word_tokenize(sens[index]))
                if token_len > 15:
                    final_sumamry_indices.append(index)
                    # TODO : don't consider those sentences whose length is less than certain threshhold.
                    x += token_len

        i += 1
    return final_sumamry_indices

In [37]:
def get_summary_sentences(summary_indices, sens):
    return [sens[x] for x in summary_indices]

In [38]:
def calculate_df():
    stop_words = stopwords.words('english') + list(punctuation)

    df_map = {}
    print("going to read corpus for calculating document frequency")
    for file_id in reuters.fileids():
        # TO Get unique occurrence of word
        words = set(tokenize(reuters.raw(file_id), stop_words))
        for w in words:
            df_map.setdefault(w, df_map.get(w, 0) + 1)
    return df_map, len(reuters.fileids())

In [39]:
def get_summary(input_file_path, sum_percent, model_path):
    word_to_vec_model = load_model(model_path)
    df_map, D = calculate_df()

    with open(input_file_path, 'r') as my_file:
        data = my_file.read().replace('\n', '')
        data = data.replace(".", ". ")

#     result_array, sens = get_final_sen_ranks(data, D, df_map, word_to_vec_model)
#     result_array, sens = fuzzy_ranking_model(data, D, df_map, word_to_vec_model)
#     result_array, sens = ga_ranking_model(data, D, df_map, word_to_vec_model)
    result_array, sens = pso_ranking_model(data, D, df_map, word_to_vec_model)
#     result_array, sens = ranking_model_v2(data, D, df_map, word_to_vec_model)
    cluster_labels = get_sen_cluster_map(sens, word_to_vec_model, n_clusters=2)
    total_words = sum([len(nl.word_tokenize(sen_tokens)) for sen_tokens in sens])
    sum_words = math.ceil((sum_percent / 100) * total_words)
    final_cluster_score_map = {}
    for r in result_array:
        index = r['index']
        total_score = r['total_score']
        cluster = int(cluster_labels[index])
        final_cluster_score_map.setdefault(cluster, {})[index] = total_score
    # print(final_cluster_score_map)
    summary_indices = get_summary_indices(final_cluster_score_map, sens, sum_words)
    summary_indices.sort()
    # print(summary_indices)
    return get_summary_sentences(summary_indices, sens)

# Final Run

In [40]:
# summary_percent = 20
# # input_file = r"C:\Users\Mohsin\Desktop\Semantic-text-summarizer-master\data\sample_text.txt"
# input_file = r"C:\Users\Mohsin\Desktop\Desktop\test_doc.txt"
# model_path = r"C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz"
# summary_sens = get_summary(input_file, summary_percent, model_path)

In [41]:
# print("Summary:\n")
# for s in summary_sens:
#     print(s)

In [42]:
# print('Orignal Text:\n', open(input_file).read()

In [43]:
# files_paths = [
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_1\DOC_1.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_2\DOC_2.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_3\DOC_3.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_4\DOC_4.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_5\DOC_5.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_6\DOC_6.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_7\DOC_7.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_8\DOC_8.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_9\DOC_9.txt',
#     r'E:\Thesis\Documments_+_Ref_Summries\DOC_10\DOC_10.txt'
# ]

files_paths = [
    r'E:\Thesis\Dataset_2\DOC_1\DOC_1.txt',
    r'E:\Thesis\Dataset_2\DOC_2\DOC_2.txt',
    r'E:\Thesis\Dataset_2\DOC_3\DOC_3.txt',
    r'E:\Thesis\Dataset_2\DOC_4\DOC_4.txt',
    r'E:\Thesis\Dataset_2\DOC_5\DOC_5.txt',
    r'E:\Thesis\Dataset_2\DOC_6\DOC_6.txt',
    r'E:\Thesis\Dataset_2\DOC_7\DOC_7.txt',
    r'E:\Thesis\Dataset_2\DOC_8\DOC_8.txt',
    r'E:\Thesis\Dataset_2\DOC_9\DOC_9.txt',
    r'E:\Thesis\Dataset_2\DOC_10\DOC_10.txt',
    r'E:\Thesis\Dataset_2\DOC_11\DOC_11.txt',
    r'E:\Thesis\Dataset_2\DOC_12\DOC_12.txt',
    r'E:\Thesis\Dataset_2\DOC_13\DOC_13.txt',
    r'E:\Thesis\Dataset_2\DOC_14\DOC_14.txt',
    r'E:\Thesis\Dataset_2\DOC_15\DOC_15.txt',
    r'E:\Thesis\Dataset_2\DOC_16\DOC_16.txt',
    r'E:\Thesis\Dataset_2\DOC_17\DOC_17.txt',
    r'E:\Thesis\Dataset_2\DOC_18\DOC_18.txt',
    r'E:\Thesis\Dataset_2\DOC_19\DOC_19.txt',
    r'E:\Thesis\Dataset_2\DOC_20\DOC_20.txt'
]

In [44]:
summary_percent = 25
model_path = r"C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz"
summries = []
count = 1
for file in files_paths:
    summary_sens = get_summary(file, summary_percent, model_path)
    
    obj = {
            'DOC_' + str(count) : summary_sens,
    }
    
    summries.append(obj)
    
    count += 1

    
wr = open(os.path.join(os.path.abspath(os.curdir), '/tmp/result_poposed_model.json'), "w")
for summ in summries:
    wr.write(json.dumps(summ) + "\n\n")
    
wr.close()

2021-04-26 22:17:32,864 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:18:04,929 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:18:17,594 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.7894736842105263, 'c2': 0.7419354838709677, 'c3': 0.45454545454545453, 'c4': 1.0, 'c5': 0.900523, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=6.31e-11
2021-04-26 22:18:17,713 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 6.306013314071113e-11, best pos: [ 2.02920872e-06  1.56552220e-06 -4.87704900e-06 -1.31266800e-06
  5.33630362e-06 -1.58327027e-06]
2021-04-26 22:18:17,723 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.31578947368421045, 'c2': 0.5483870967741935, 'c3': 0.09090909090909091, 'c4': 0.8571428571428571, 'c5': 0.33995503, 'c6': 0.9166666666666667, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=9.02e-9
2021-04-26 22:18:17,812 - pyswarms.single.global_best - I

 summary words  104


2021-04-26 22:18:51,427 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:19:04,259 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.7, 'c2': 0.75, 'c3': 0.7777777777777778, 'c4': 0.3333333333333333, 'c5': 0.79893506, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=2.29e-9
2021-04-26 22:19:04,359 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 2.293162667156001e-09, best pos: [ 3.67564562e-06 -1.26443941e-06 -4.31956569e-05  6.78764232e-07
  1.76177671e-05  1.00668902e-05]
2021-04-26 22:19:04,366 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 0.9, 'c3': 0.6666666666666666, 'c4': 1.0, 'c5': 0.88332415, 'c6': 0.6666666666666666, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=4.72e-8
2021-04-26 22:19:04,444 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 4.723857161709795e-

 summary words  43


2021-04-26 22:19:37,110 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:19:50,053 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5769230769230769, 'c2': 0.46511627906976744, 'c3': 0.14285714285714285, 'c4': 0.5, 'c5': 0.8795086, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=3.04e-9
2021-04-26 22:19:50,151 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 3.0360571749838666e-09, best pos: [ 2.99053166e-05 -3.55302835e-06 -2.73012622e-05  1.29450216e-06
 -1.77916195e-05 -3.26424390e-05]
2021-04-26 22:19:50,158 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.49999999999999994, 'c2': 0.5348837209302325, 'c3': 0.42857142857142855, 'c4': 0.35714285714285715, 'c5': 1.0, 'c6': 0.9655172413793104, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=4.07e-8
2021-04-26 22:19:50,235 - pyswarms.single.global_best - INFO

going to read corpus for calculating document frequency


2021-04-26 22:20:38,944 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 0.8333333333333334, 'c3': 0.8, 'c4': 0.6153846153846154, 'c5': 0.7705463, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=4.23e-5
2021-04-26 22:20:39,039 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 4.233648057072952e-05, best pos: [ 2.94759947e-05  8.81036485e-05  1.02749358e-04 -6.50179917e-03
 -1.95866988e-04  7.44027435e-05]
2021-04-26 22:20:39,046 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.3913043478260869, 'c2': 0.5714285714285714, 'c3': 0.0, 'c4': 0.38461538461538464, 'c5': 0.6926882, 'c6': 0.9411764705882354, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=5.35e-11
2021-04-26 22:20:39,125 - pyswarms.single.global_best - INFO - Optimization finished | be

 summary words  120


2021-04-26 22:20:41,364 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:21:13,310 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:21:26,346 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.7142857142857142, 'c2': 0.6, 'c3': 0.5714285714285714, 'c4': 0.5833333333333334, 'c5': 0.9951679901130999, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=9.62e-10
2021-04-26 22:21:26,453 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 9.618942440624254e-10, best pos: [-2.71783989e-05  2.66102281e-06  6.53454698e-06 -9.84189898e-06
 -4.99644312e-06  7.18471375e-06]
2021-04-26 22:21:26,465 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.6785714285714285, 'c2': 0.6, 'c3': 0.0, 'c4': 0.6666666666666666, 'c5': 0.8620125606683557, 'c6': 0.96875, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=6.18e-10
2021-04-26 22:21:26,544 - pyswarms.single.global_best - INFO - Optimization finishe

 summary words  165


2021-04-26 22:21:29,991 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:22:02,210 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:22:15,008 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 1.0, 'c3': 0.45454545454545453, 'c4': 0.8461538461538461, 'c5': 0.85303843, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=7.38e-10
2021-04-26 22:22:15,111 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 7.377949403199484e-10, best pos: [ 7.14119910e-06 -1.06091976e-06  1.01586524e-05  5.77469303e-06
 -1.87419282e-05 -1.40665381e-05]
2021-04-26 22:22:15,118 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.8095238095238095, 'c2': 0.71875, 'c3': 0.0, 'c4': 0.46153846153846156, 'c5': 0.67792183, 'c6': 0.9166666666666667, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=2.02e-10
2021-04-26 22:22:15,193 - pyswarms.single.global_best - INFO - Optimization finished | best cost:

 summary words  91


2021-04-26 22:22:48,660 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:23:01,656 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.49999999999999983, 'c2': 0.3877551020408163, 'c3': 0.6666666666666666, 'c4': 0.4166666666666667, 'c5': 0.9530812, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=5.13e-12
2021-04-26 22:23:01,762 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 5.134540650448072e-12, best pos: [-7.91885692e-07  4.15528329e-07  5.47064015e-07  2.00225968e-06
 -1.54567813e-07  5.07919271e-08]
2021-04-26 22:23:01,769 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.22727272727272715, 'c2': 0.24489795918367346, 'c3': -0.1111111111111111, 'c4': 0.16666666666666666, 'c5': 0.63992256, 'c6': 0.9444444444444445, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=8.85e-7
2021-04-26 22:23:01,850 - pyswarms.sing

 summary words  111


2021-04-26 22:23:04,234 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:23:35,773 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:23:48,830 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5833333333333333, 'c2': 0.5952380952380952, 'c3': 0.75, 'c4': 0.7142857142857143, 'c5': 0.8491529510374191, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=3.66e-8
2021-04-26 22:23:48,933 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 3.6622709596328665e-08, best pos: [ 1.85665604e-04 -1.41338471e-05 -2.12712508e-05 -2.44367118e-05
  1.50753582e-07 -3.00264174e-05]
2021-04-26 22:23:48,940 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.875, 'c2': 0.8809523809523809, 'c3': 0.25, 'c4': 1.0, 'c5': 0.8748454695013698, 'c6': 0.9696969696969697, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=9.77e-7
2021-04-26 22:23:49,015 - pyswarms.single.global_best - INFO - Optimization finish

 summary words  172


2021-04-26 22:23:52,406 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:24:24,095 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:24:37,189 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.9411764705882353, 'c2': 0.7142857142857143, 'c3': 0.5, 'c4': 0.7777777777777778, 'c5': 0.90277797, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=2.93e-10
2021-04-26 22:24:37,292 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 2.93429678969719e-10, best pos: [ 1.21762096e-05 -5.54417814e-06  6.39492236e-06  3.22403361e-06
  4.64988922e-06  6.44366300e-06]
2021-04-26 22:24:37,299 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.4117647058823529, 'c2': 0.4857142857142857, 'c3': 0.6666666666666666, 'c4': 0.4444444444444444, 'c5': 0.61241263, 'c6': 0.9473684210526315, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=4.32e-9
2021-04-26 22:24:37,377 - pyswarms.single.global_best - INF

 summary words  129


2021-04-26 22:24:39,636 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:25:11,842 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:25:24,678 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 1.0, 'c3': 1.0, 'c4': 1.0, 'c5': 0.9194302, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=2.72e-9
2021-04-26 22:25:24,786 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 2.716609577459775e-09, best pos: [-4.57816801e-05  1.51863235e-05  1.55241397e-05  7.36807929e-06
 -2.70923532e-06 -9.34855399e-06]
2021-04-26 22:25:24,792 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.8235294117647058, 'c2': 0.6206896551724138, 'c3': 0.5, 'c4': 0.375, 'c5': 0.90079474, 'c6': 0.8333333333333334, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=6.22e-12
2021-04-26 22:25:24,870 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 6.2220886907946085e-12, best pos: 

 summary words  54


2021-04-26 22:25:58,380 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:26:11,088 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.9166666666666669, 'c2': 0.9130434782608695, 'c3': 1.0, 'c4': 0.5714285714285714, 'c5': 0.53005207, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|███████████████████████████████████████████████████████████|100/100, best_cost=1.5e-9
2021-04-26 22:26:11,202 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.4969053200517951e-09, best pos: [ 3.12279777e-05  9.27524038e-06  8.03502671e-06 -1.01501119e-05
 -1.35561524e-05  9.18329754e-06]
2021-04-26 22:26:11,207 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.0, 'c2': 0.0, 'c3': -0.16666666666666666, 'c4': 0.0, 'c5': 0.4493242, 'c6': 0.8571428571428573, 'w': 0.75}
pyswarms.single.global_best: 100%|████████████████████████████████████████████████████████████|100/100, best_cost=0.853
2021-04-26 22:26:11,285 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.8

 summary words  50


2021-04-26 22:26:44,598 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:26:57,475 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 1.0, 'c3': 1.0, 'c4': 1.0, 'c5': 1.0, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=2.27e-10
2021-04-26 22:26:57,594 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 2.2749002860615345e-10, best pos: [-7.74084672e-06 -1.27098842e-07 -9.08006617e-06  2.27202734e-06
  8.44788365e-06  2.92860350e-06]
2021-04-26 22:26:57,599 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.49999999999999994, 'c2': 0.4411764705882353, 'c3': 0.8, 'c4': 0.375, 'c5': 0.5312417, 'c6': 0.9333333333333333, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=1.87e-5
2021-04-26 22:26:57,676 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.870584803711594e-05, best pos: [-0.00

 summary words  89


2021-04-26 22:26:59,577 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:27:31,196 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:27:44,112 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 0.8679245283018868, 'c3': 0.5, 'c4': 0.8181818181818182, 'c5': 1.0, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=1.13e-10
2021-04-26 22:27:44,221 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.133649275140501e-10, best pos: [-4.09591879e-06  5.18595398e-06  6.01982783e-06 -3.69486731e-07
 -5.59630291e-06  1.41449787e-06]
2021-04-26 22:27:44,232 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.9628146529323252, 'c2': 1.0, 'c3': 1.0, 'c4': 1.0, 'c5': 0.9091716742041025, 'c6': 0.9166666666666667, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=1.47e-8
2021-04-26 22:27:44,315 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.4734296581949455e

 summary words  81


2021-04-26 22:28:18,569 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:28:31,271 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.9333333333333331, 'c2': 1.0, 'c3': 1.0, 'c4': 0.5, 'c5': 0.9897151, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=1.14e-9
2021-04-26 22:28:31,384 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.1393223720291245e-09, best pos: [ 4.09123678e-06  1.62753586e-06  5.69725427e-06 -1.43859318e-05
 -1.04209372e-05 -2.77835493e-05]
2021-04-26 22:28:31,390 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.2666666666666666, 'c2': 0.36, 'c3': -0.16666666666666666, 'c4': 0.25, 'c5': 0.33417737, 'c6': 0.9333333333333333, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=0.000256
2021-04-26 22:28:31,465 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.00025600300720

 summary words  117


2021-04-26 22:28:33,373 - gensim.models.utils_any2vec - INFO - loading projection weights from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz
2021-04-26 22:29:05,073 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:29:18,298 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.7916666666666666, 'c2': 0.6470588235294118, 'c3': 0.5, 'c4': 0.6428571428571429, 'c5': 1.0, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=2.31e-10
2021-04-26 22:29:18,413 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 2.312569123202594e-10, best pos: [-2.64539165e-06  3.82818313e-06 -9.98847585e-06 -5.90039942e-06
  2.87882913e-06  8.16895397e-06]
2021-04-26 22:29:18,422 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.9166666666666666, 'c2': 0.7450980392156863, 'c3': 1.0, 'c4': 0.5714285714285714, 'c5': 0.9471225141084264, 'c6': 0.9375000000000001, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=6.71e-10
2021-04-26 22:29:18,497 - pyswarms.single.global_best - INFO - Optimizat

 summary words  104


2021-04-26 22:29:52,447 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:30:05,173 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.0, 'c2': 0.0, 'c3': 1.0, 'c4': 0.0, 'c5': 0.0, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|████████████████████████████████████████████████████████████|100/100, best_cost=0.993
2021-04-26 22:30:05,283 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.9927932226896597, best pos: [0.15675921 0.52054674 0.60490927 0.18791903 0.16090136 0.51974305]
2021-04-26 22:30:05,290 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.1612903225806451, 'c2': 0.28846153846153844, 'c3': 0.0, 'c4': 0.0, 'c5': 0.74231976, 'c6': 0.8, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=0.00188
2021-04-26 22:30:05,366 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.0018826995744993583, best pos: [ 0.01002153  0.01615152 -0.00111635  0.01495759 -0.03598

 summary words  59


2021-04-26 22:30:38,417 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:30:51,213 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.75, 'c2': 0.75, 'c3': 1.0, 'c4': 0.6666666666666666, 'c5': 0.71332043, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|███████████████████████████████████████████████████████████|100/100, best_cost=1.3e-7
2021-04-26 22:30:51,327 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.2950551936876828e-07, best pos: [ 3.59029613e-05 -2.91557175e-04  5.11619126e-05 -1.85568919e-04
  6.54997287e-05 -4.32126217e-05]
2021-04-26 22:30:51,334 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 1.0, 'c3': 1.0, 'c4': 0.5, 'c5': 1.0, 'c6': 0.4999999999999999, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=1.11e-9
2021-04-26 22:30:51,414 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.1098400089967476e-09, best pos: [-3.91481389e-06 -1.

 summary words  50


2021-04-26 22:31:24,435 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:31:37,385 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 1.0, 'c3': 0.0, 'c4': 1.0, 'c5': 0.55068547, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=5.75e-9
2021-04-26 22:31:37,477 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 5.750941030946733e-09, best pos: [-2.19300517e-05  3.43779078e-05 -1.31954010e-05  8.64113705e-06
 -1.21143329e-05 -6.07670009e-05]
2021-04-26 22:31:37,483 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5499999999999999, 'c2': 0.7317073170731707, 'c3': 0.5, 'c4': 0.7, 'c5': 0.4300745, 'c6': 0.9615384615384615, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=5.64e-8
2021-04-26 22:31:37,567 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 5.635450538035974e-08, best pos: [ 1

going to read corpus for calculating document frequency


2021-04-26 22:32:25,219 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 1.0, 'c2': 1.0, 'c3': 0.6, 'c4': 1.0, 'c5': 1.0, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=7.15e-10
2021-04-26 22:32:25,324 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 7.14548731656495e-10, best pos: [ 1.12640954e-05 -2.11598662e-05  1.47008914e-07 -1.04606848e-05
 -1.23415032e-06  5.38129030e-06]
2021-04-26 22:32:25,330 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5833333333333333, 'c2': 0.9565217391304348, 'c3': 0.4, 'c4': 1.0, 'c5': 0.6331804, 'c6': 0.6666666666666666, 'w': 0.75}
pyswarms.single.global_best: 100%|██████████████████████████████████████████████████████████|100/100, best_cost=2.17e-9
2021-04-26 22:32:25,408 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 2.1743381560292223e-09, best pos: [ 1.498424

 summary words  34


2021-04-26 22:32:58,266 - gensim.models.utils_any2vec - INFO - loaded (3000000, 300) matrix from C:\Users\Mohsin\Desktop\wrd2vec\word2vec-google-news-300.gz


going to read corpus for calculating document frequency


2021-04-26 22:33:10,973 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.6206896551724137, 'c2': 0.5576923076923077, 'c3': 1.0, 'c4': 0.6153846153846154, 'c5': 0.75377584, 'c6': 1.0, 'w': 0.75}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████|100/100, best_cost=1.76e-10
2021-04-26 22:33:11,091 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.7554013879776753e-10, best pos: [-2.89676286e-06  3.89642421e-06  5.69639065e-07  3.82850016e-06
 -1.13860762e-05 -2.70963980e-06]
2021-04-26 22:33:11,097 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.3448275862068965, 'c2': 0.25, 'c3': 1.0, 'c4': 0.23076923076923078, 'c5': 0.73926455, 'c6': 0.8750000000000001, 'w': 0.75}
pyswarms.single.global_best: 100%|███████████████████████████████████████████████████████████|100/100, best_cost=0.0038
2021-04-26 22:33:11,176 - pyswarms.single.global_best - INFO - Optimization finished 

 summary words  71
