In [1]:
import json

# Load the list of topics from the file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is UTF-8).
with open('all_topics.json', 'r', encoding='utf-8') as f:
    all_topics = json.load(f)


In [2]:
import os
from collections import Counter

import numpy as np
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity


# Location of the GloVe word-vector text file.
# Set the GLOVE_PATH environment variable to point at the file on another
# machine; the original local path stays as the fallback so existing runs are
# unaffected.
glove_file_path = os.environ.get(
    "GLOVE_PATH",
    "C:\\Users\\houss\\Downloads\\glove.6B\\glove.6B.300d.txt",
)

def load_glove_embeddings(file_path):
    """Read word vectors from a whitespace-separated GloVe text file.

    Every non-empty line is expected to hold a word followed by its vector
    components. Blank lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``IndexError``.

    Args:
        file_path: Path of the GloVe text file; it is read as UTF-8.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(file_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing to parse on this line; values[0] would fail below.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    return embeddings_index

# Read every GloVe vector into memory once; the resulting lookup is reused below
glove_embeddings = load_glove_embeddings(file_path=glove_file_path)

# Cosine similarity between two topics using averaged GloVe embeddings
def topic_similarity_glove(topic1, topic2, embeddings):
    """Return the cosine similarity between two topics.

    Each topic is split on whitespace and represented by the mean of the
    vectors of its tokens that are present in ``embeddings``. Tokens without a
    vector are left out of the mean. Because cosine similarity ignores vector
    length, this yields the same similarity as averaging them in as zero
    vectors, but without assuming a fixed dimension of 300.

    Args:
        topic1: First topic string.
        topic2: Second topic string.
        embeddings: Mapping from token to 1-D vector. All vectors are expected
            to share the same length.

    Returns:
        The cosine similarity of the two mean vectors, or 0.0 when either
        topic has no token with a known vector (including empty topics).
    """
    vectors1 = [embeddings[token] for token in topic1.split() if token in embeddings]
    vectors2 = [embeddings[token] for token in topic2.split() if token in embeddings]

    if not vectors1 or not vectors2:
        # No usable vector on at least one side: there is no direction to compare.
        return 0.0

    similarity = cosine_similarity(np.mean(vectors1, axis=0).reshape(1, -1),
                                   np.mean(vectors2, axis=0).reshape(1, -1))
    return similarity[0, 0]

# Group similar topics together using GloVe embeddings
def group_similar_topics_glove(topics, embeddings, threshold=0.6, similarity_fn=None):
    """Greedily cluster topics and key each cluster by its most frequent topic.

    Topics are visited in order. A topic joins the first existing group whose
    representative (the topic that opened the group) has a similarity strictly
    above ``threshold``; otherwise it opens a new group. Repeated occurrences
    of a topic string reuse the group chosen for its first occurrence, so the
    similarity is computed once per distinct topic and a repeated topic can
    never overwrite its own group.

    Args:
        topics: Iterable of topic strings; duplicates are kept as members.
        embeddings: Token-to-vector mapping passed through to ``similarity_fn``.
        threshold: Similarity that must be exceeded to join a group.
        similarity_fn: Callable ``(topic_a, topic_b, embeddings) -> float``.
            Defaults to ``topic_similarity_glove``.

    Returns:
        dict mapping the most frequent topic string of each group to the list
        of all members of that group, in the order they were encountered.
        When several topics are equally frequent, the one seen first wins.
    """
    if similarity_fn is None:
        similarity_fn = topic_similarity_glove

    topic_groups = {}    # representative topic -> members in encounter order
    assigned_group = {}  # distinct topic string -> representative of its group

    for topic in topics:
        if topic not in assigned_group:
            # First match in group-creation order wins; fall back to a new
            # group represented by the topic itself.
            assigned_group[topic] = next(
                (rep for rep in topic_groups
                 if similarity_fn(topic, rep, embeddings) > threshold),
                topic,
            )
        topic_groups.setdefault(assigned_group[topic], []).append(topic)

    # Key each group by the topic string that occurs most often among its
    # members (counted per topic, not per group).
    grouped_topics = {}
    for members in topic_groups.values():
        most_common_topic = Counter(members).most_common(1)[0][0]
        grouped_topics[most_common_topic] = members

    return grouped_topics

# Cluster every loaded topic with the GloVe-based grouping
similar_topic_groups = group_similar_topics_glove(topics=all_topics, embeddings=glove_embeddings)

# Show each group key followed by the topics assigned to it
for group in similar_topic_groups:
    topics = similar_topic_groups[group]
    print(f"Group {group}: {topics}")


Group telecommunication: ['telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunications', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication operator', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication', 'telecommunication operator', 'telecommunication package', 'telecom operator']
Group discount: ['discount', 'discount offer', 'discount', 'discounts', 'discounts', 'discount', 'discount', 'discount', 'discount', 'discount availability', 'discount offer', 'discounts', 'di

In [3]:
# Load the list of long calls from the file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is UTF-8).
with open('long_calls.json', 'r', encoding='utf-8') as f:
    long_calls = json.load(f)

In [4]:
# Display how many entries were loaded into long_calls
len(long_calls)

85

In [5]:

# Load the per-call topics dictionary from the file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is UTF-8).
with open('topics_per_call.json', 'r', encoding='utf-8') as f:
    topics_per_call = json.load(f)

# Display the number of entries that were loaded
len(topics_per_call)

85

In [6]:
# Re-key the per-call topics by the entries of long_calls, pairing them by position.
# zip() silently drops the unmatched tail when its inputs differ in length, so
# fail loudly instead of producing a truncated or misaligned mapping.
if len(long_calls) != len(topics_per_call):
    raise ValueError(
        f"long_calls has {len(long_calls)} entries but topics_per_call has "
        f"{len(topics_per_call)}; cannot pair them by position"
    )
topics_per_call = dict(zip(long_calls, topics_per_call.values()))



In [7]:
# Replace every topic by the key of the group that contains it.
# A reverse lookup is built once so each topic is resolved with a single dict
# access instead of scanning every group's member list for every topic.
topic_to_group = {}
for group, items in similar_topic_groups.items():
    for item in items:
        # setdefault keeps the first group that contains the topic, which is
        # the group a sequential scan over the groups would have found.
        topic_to_group.setdefault(item, group)

for k, topics in topics_per_call.items():
    # Topics that belong to no group are kept as they are; the list object is
    # updated in place.
    topics[:] = [topic_to_group.get(topic, topic) for topic in topics]

In [8]:
# Remove duplicate topics within each call.
# dict.fromkeys keeps the first occurrence of each topic in its original
# position, so the result is identical on every run; the ordering produced by
# set() for strings can change between interpreter sessions.
for k, topics in topics_per_call.items():
    topics_per_call[k] = list(dict.fromkeys(topics))

In [9]:
# Quick check of the relabelled topics: prints "hello" once if any call lists "billing"
if any("billing" in topics for topics in topics_per_call.values()):
    print("hello")

In [10]:
# Persist the per-call topics dictionary as JSON
with open('final_topics.json', 'w') as out_file:
    json.dump(topics_per_call, fp=out_file)

In [11]:
# Flatten the per-call topic lists into one list, keeping the order of the calls
topics_list = [
    topic
    for call_topics in topics_per_call.values()
    for topic in call_topics
]

In [12]:
# Display the flattened list of topics across all calls
topics_list


['payment',
 'unlimited calls',
 'termination',
 'telecommunication',
 'contract',
 'internet',
 'traveling',
 'discount',
 'customer service.',
 'subscription',
 'price change',
 'address verification',
 'personal information.',
 'telecommunication',
 'promotions',
 'internet',
 'data collection',
 'phone operator',
 'customer service.',
 'subscription',
 'payment',
 'internet',
 'contract',
 'account access',
 'television service',
 'customer service.',
 'subscription',
 'offers.',
 'new offer',
 'telecommunication',
 'internet',
 'referral program',
 'activation date',
 'customer service.',
 'subscription',
 'billing issue',
 'internet',
 'address verification',
 'swiss permit',
 'telecommunication',
 'contract',
 'pricing',
 'date of birth',
 'data collection',
 'phone operator',
 'flash sale',
 'e-mail communication',
 'television service',
 'customer service.',
 'subscription',
 'nationality',
 'billing issue',
 'telecommunication',
 'promotions',
 'pricing',
 'internet',
 'contr

In [13]:
from collections import Counter

# Count the occurrences of each topic and list the (topic, count) pairs
# from the most to the least frequent
topic_counts = Counter(topics_list)
frequency = topic_counts.most_common()

In [14]:
# Display the (topic, count) pairs, sorted by descending count
frequency


[('subscription', 58),
 ('customer service.', 56),
 ('internet', 52),
 ('telecommunication', 35),
 ('phone operator', 34),
 ('contract', 33),
 ('unlimited calls', 32),
 ('billing issue', 31),
 ('new offer', 26),
 ('discount', 24),
 ('address verification', 23),
 ('television service', 22),
 ('payment', 21),
 ('price change', 21),
 ('e-mail communication', 20),
 ('data collection', 19),
 ('pricing', 18),
 ('call back', 15),
 ('promotions', 13),
 ('sim card (chips)', 9),
 ('personal information.', 8),
 ('swiss permit', 8),
 ('delivery', 8),
 ('validation mail', 7),
 ('termination', 6),
 ('date of birth', 6),
 ('fiber optic', 6),
 ('nationality', 5),
 ('working hours', 5),
 ('reservation', 5),
 ('prepaid', 5),
 ('cancellation', 5),
 ('traveling', 4),
 ('referral program', 4),
 ('invoice', 4),
 ('scheduling', 4),
 ('mobile number portability', 4),
 ('photo submission', 4),
 ('confirmation', 4),
 ('account access', 3),
 ('activation date', 3),
 ('callback timing', 3),
 ('rescheduling call',

In [15]:
# Persist the sorted (topic, count) pairs as JSON
with open('topic_freq.json', 'w') as out_file:
    json.dump(frequency, fp=out_file)