In [29]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from sklearn.preprocessing import normalize, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.decomposition import KernelPCA, SparsePCA, TruncatedSVD, PCA
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.manifold import MDS
from sklearn.manifold import TSNE

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import re     
import itertools 



In [30]:
review = pd.read_csv('https://raw.githubusercontent.com/LuisSante/Datasets/main/app_reviews.csv')
review

Unnamed: 0,package_name,review,date,star
0,com.mantz_it.rfanalyzer,Great app! The new version now works on my Bra...,October 12 2016,4
1,com.mantz_it.rfanalyzer,Great It's not fully optimised and has some is...,August 23 2016,4
2,com.mantz_it.rfanalyzer,Works on a Nexus 6p I'm still messing around w...,August 04 2016,5
3,com.mantz_it.rfanalyzer,The bandwidth seemed to be limited to maximum ...,July 25 2016,3
4,com.mantz_it.rfanalyzer,Works well with my Hackrf Hopefully new update...,July 22 2016,5
...,...,...,...,...
288060,com.termux.api,it doesn't do anything after installing this i...,June 24 2016,3
288061,com.termux.api,I like this app . Its is very helpful for use....,June 20 2016,5
288062,com.termux.api,Finally Brings back the Unix command line to A...,May 20 2016,5
288063,com.termux.api,The API feature is great just need loads more...,May 05 2016,5


In [31]:
def extract_corpus(dataset):
    lista = []  
    for i in range(len(dataset['package_name'].unique())):
        dataset_temp = dataset.loc[dataset['package_name'] == dataset['package_name'].unique()[i]]
        lista.append({'package_name':dataset['package_name'].unique()[i], 'size': len(dataset_temp)})

    lista = sorted(lista, key=lambda x: x['size'], reverse=True)
    dataframe = dataset[dataset['package_name'] == lista[8]['package_name']]
    corpus = list(dataframe['review'])

    return corpus

In [32]:
def convert_corpus_to_dataFrame(corpus):
    corpus_ds = {
        'Sentences' : corpus
    }

    dataset_new = pd.DataFrame(corpus_ds)
    return dataset_new

In [34]:
def clean_corpus(ds_new):
        
    for i in range(1094):
        sentences=ds_new.Sentences[i]
       # print(sentences)
        
        ds_new.Sentences[i] = re.sub(r'https?:\/\/.\S+', "", ds_new.Sentences[i]) 
          
        ds_new.Sentences[i] = re.sub(r'"', '', ds_new.Sentences[i]) 
        
        ds_new.Sentences[i] = re.sub(r'#', '', ds_new.Sentences[i]) 
          
        ds_new.Sentences[i] = re.sub(r'^RT[\s]+', '', ds_new.Sentences[i]) 
              
        Apos_dict={"'s":" is","n't":" not","'m":" am","'    ll":" will", 
               "'d":" would","'ve":" have","'re":" are"}     
          
        for key,value in Apos_dict.items(): 
            if key in ds_new.Sentences[i]: 
                ds_new.Sentences[i]=ds_new.Sentences[i].replace(key,value) 
        ds_new.Sentences[i] = " ".join([s for s in re.split("([A-Z][a-z]+[^A-Z]*)",ds_new.Sentences[i]) if s])
        ds_new.Sentences[i]=ds_new.Sentences[i].lower() 
        file=open("slang.txt","r") 
        slang=file.read() 
          
        slang=slang.split('\n') 
          
        tweet_tokens= ds_new.Sentences[i].split() 
        slang_word=[] 
        meaning=[] 
          
        for line in slang: 
            temp=line.split("=") 
            slang_word.append(temp[0]) 
            meaning.append(temp[-1]) 
          
        for i,word in enumerate(tweet_tokens): 
            if word in slang_word: 
                idx=slang_word.index(word) 
                tweet_tokens[i]=meaning[idx] 
                  
        ds_new.Sentences[i]=" ".join(tweet_tokens) 
        ds_new.Sentences[i] = ''.join(''.join(s)[:2] for _, s in itertools.groupby(ds_new.Sentences[i]))   
        from autocorrect import Speller  
        spell = Speller(lang='en') 
        ds_new.Sentences[i]=spell(ds_new.Sentences[i]) 
    return ds_new
        
        

In [35]:
def neural_embeddings(dataset):
    model_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    corpus_ = extract_corpus(dataset)
    corpus = clean_corpus(corpus_) 
    corpus_embeddings = model_embedder.encode(corpus, convert_to_tensor=True, show_progress_bar=True)
    corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

    return corpus_embeddings

In [36]:
def silhoutte(dataset, attempts):

    embeddings = neural_embeddings(dataset)
    scores_silhouette = []

    for k in range(2,attempts):

        agglomerative_clusterering = AgglomerativeClustering(n_clusters=k, affinity="cosine" , linkage="complete").fit(embeddings)
        cluster_labels = agglomerative_clusterering.labels_

        silhouette_avg = silhouette_score(embeddings, cluster_labels)
        scores_silhouette.append(silhouette_avg)

    max_score = max(scores_silhouette)
    max_index = scores_silhouette.index(max_score)
    n_clusters = max_index + 2

    return n_clusters, embeddings

In [38]:
def segmentation(dataset_review, attempts):
    n_clusters, embeddings = silhoutte(dataset_review, attempts)

    agglomerative_clusterering = AgglomerativeClustering(n_clusters=n_clusters, affinity="cosine" , linkage="complete").fit(embeddings)
    cluster_labels = agglomerative_clusterering.labels_

    return n_clusters, cluster_labels

In [37]:

def clustering(dataset_review, attempts):
    n_clusters, labels = segmentation(dataset_review, attempts)
    corpus_dataset = extract_corpus(dataset_review)

    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(labels):
        if cluster_id not in clustered_sentences:
            clustered_sentences[cluster_id] = []
    
        clustered_sentences[cluster_id].append(corpus_dataset[sentence_id])
    
    for i, cluster in clustered_sentences.items():
        print("Cluster ", i+1)
        print(cluster)
        print("     ")

In [None]:
clustering(review, 10) 

In [40]:
def red_dim(corpus_embeddings):
    scaler = PCA(n_components=100, random_state = 100)
    X_principal = scaler.fit_transform(corpus_embeddings)
    X_principal = TSNE(n_components=2, learning_rate='auto',init='random', perplexity=3).fit_transform(X_principal)
    distribution = pd.DataFrame(X_principal, columns=['x', 'y'])
    distribution
    
    return X_principal, distribution

In [42]:
def show_dimentions(distribution):
    for col in 'xy':
        sns.kdeplot(distribution[col], shade=True)

    with sns.axes_style(style='ticks'):
       g = sns.factorplot(data=distribution, kind="box")

In [43]:
def show_graphics(corpus_embeddings,X_principal,labels_):
    point_size = 100.0 / np.sqrt(corpus_embeddings.shape[0])
    result = pd.DataFrame(X_principal, columns=['x', 'y'])
    result['labels'] = labels_
    print(result)
    
    fig, ax = plt.subplots(figsize=(14, 8))
    clustered = result[result.labels != -1]
    plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=20, cmap='Spectral')
    plt.colorbar()
    plt.show()

In [44]:
def clustering_whit_m(dataset_review, attempts):
    x_principal, distribution=red_dim(dataset_review)
    show_dimentions(distribution)

    n_clusters, labels = segmentation(x_principal, attempts)
    corpus_dataset = extract_corpus(x_principal)

    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(labels):
        if cluster_id not in clustered_sentences:
            clustered_sentences[cluster_id] = []
    
        clustered_sentences[cluster_id].append(corpus_dataset[sentence_id])
    
    for i, cluster in clustered_sentences.items():
        print("Cluster ", i+1)
        print(cluster)
        print("     ")
    show_graphics(corpus_dataset,x_principal,labels)

In [45]:
clustering_whit_m(review, 10)

ValueError: could not convert string to float: 'com.mantz_it.rfanalyzer'