In [1]:
import pandas as pd
df = pd.read_csv('companyReviews.csv', encoding='utf8')
df2 = df[df['review'].notna()]
df2["index"] = df2.index # Not fantastic

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["index"] = df2.index # Not fantastic


In [2]:
import spacy_udpipe
spacy_udpipe.download("sv")

Already downloaded a model for the 'sv' language


In [3]:
# Clean the code from words with limited information and lemmatize
from langdetect import detect

def initClean(df):
    removeWords = ["!", ".", ",", "\r", "\n", "-", "'", '"', "^", "(", ")", "’", ":", ";", "+", "?"]
    with open("Stoppord.txt", encoding="utf-8") as file:
        data = file.readlines()
        for line in data:
            removeWords.append(line.strip())
    nlp = spacy_udpipe.load("sv")
    return removeWords, nlp

removeWords, nlp = initClean(df2)

def cleanReview(review):
    try:
        lang = detect(review)
    except:
        lang = "ERROR"
        # print("Error", review)
        
    if lang != "sv":
        return "WRONG_LANGUAGE"
    else: # Clean the text, Lemmatize, remove stopwords
        doc = nlp(review)
        newText = ""
        for token in doc:
            if token.lemma_.lower() in removeWords:
                pass
            else:
                newText += token.lemma_.lower() + " "
        if len(newText) == 0:
            return "WRONG_LANGUAGE"
        newText = newText[:-1]
        newText = newText.strip()
        return newText
    
df3 = pd.DataFrame()
df3["review"] = df2["review"].apply(cleanReview)
df3["index"] = df2["index"]
df3['grade'] = df2['grade'].astype(int)
df3['company'] = df2['company'].astype(str)
df3 = df3[df3['review'] != "WRONG_LANGUAGE"]
df3 = df3[df3['review'].notna()]

In [4]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import umap.umap_ as umap

vectorizer = TfidfVectorizer(min_df=10, max_features=10000, ngram_range=(1, 2))
vz = vectorizer.fit_transform(df3['review'])

svd_model = TruncatedSVD(n_components=50)
svd_tfidf = svd_model.fit_transform(vz)

umap_model = umap.UMAP(n_components=2)
umap_tfidf = umap_model.fit_transform(svd_tfidf)

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


In [5]:
from sklearn.cluster import DBSCAN
eps = 1 # how close points should be to each other to be considered a part of a cluster, e.g 2.5
min_samples = 20 # the minimum number of points to form a dense region, e.g 15
dbscan = DBSCAN(eps=eps,min_samples=min_samples)
dbscan_model = dbscan.fit(umap_tfidf)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np
import lda

# create document term matrix
min_df = 4 # minimum required occurences of a word, e.g 4
max_features = 10000 # max number of unique words, e.g 10000

vectorizer = CountVectorizer(min_df=min_df, max_features=max_features, ngram_range=(1, 2)) # unigram & bigram
X = vectorizer.fit_transform(df3["review"])


# build LDA model
n_topics = 25 # pick the number of topics, e.g 5
n_iter = 2000 # number of learning iterations, e.g 2000

lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(X) # X is document term matrix

# Print the topics found by the LDA model
n_top_words = 10
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
vocab = vectorizer.get_feature_names()

INFO:lda:n_documents: 20732
INFO:lda:vocab_size: 8428
INFO:lda:n_words: 209257
INFO:lda:n_topics: 25
INFO:lda:n_iter: 2000
INFO:lda:<0> log likelihood: -2672783
INFO:lda:<10> log likelihood: -1882060
INFO:lda:<20> log likelihood: -1800038
INFO:lda:<30> log likelihood: -1763763
INFO:lda:<40> log likelihood: -1742565
INFO:lda:<50> log likelihood: -1728754
INFO:lda:<60> log likelihood: -1718022
INFO:lda:<70> log likelihood: -1710565
INFO:lda:<80> log likelihood: -1706219
INFO:lda:<90> log likelihood: -1702955
INFO:lda:<100> log likelihood: -1699692
INFO:lda:<110> log likelihood: -1696284
INFO:lda:<120> log likelihood: -1693705
INFO:lda:<130> log likelihood: -1690532
INFO:lda:<140> log likelihood: -1690561
INFO:lda:<150> log likelihood: -1688092
INFO:lda:<160> log likelihood: -1686595
INFO:lda:<170> log likelihood: -1684051
INFO:lda:<180> log likelihood: -1683674
INFO:lda:<190> log likelihood: -1682976
INFO:lda:<200> log likelihood: -1681188
INFO:lda:<210> log likelihood: -1681374
INFO:lda

In [7]:
df3["wordCount"] = df3["review"].apply(lambda x: len(x.split()))

In [8]:
# df3.to_csv("processed.csv", encoding="utf-8")
# df3 = pd.read_csv('processed.csv', encoding='utf8')

In [9]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: paket skicka hämta leverans post postnord beställa sms adress välja
Topic 1: personal service bemötande leverans kunnig kund rekommenderar hjälpsam rekommendera varm
Topic 2: rabatt prisjakt betyg köp använda kund rabattkod låg via black
Topic 3: leverans fungera beställning kvalitet köp beställning leverans jätte jättebra klaga jättenöjd
Topic 4: ny beställa köpte bord skada hem sakna furniturebox stol soffa
Topic 5: skicka tillbaka ny pengar fel skicka tillbaka betala retur reklamation kundtjänst
Topic 6: företag kund oseriös conrad usel se varning kundservice riktig böra
Topic 7: skicka order beställning ord beställa tid leveranstid ens lag mail
Topic 8: kundtjänst problem leverans kontakt bemötande lösa cykelkraft cykeln cykel kontakt kundtjänst
Topic 9: service leverans leveranse utbud supersnabb fantastisk grym kundservice service leverans supersnabb leverans
Topic 10: köp installation hel installatör tid pump fungera genomföra beställning köpet
Topic 11: ringa tid höra 

In [10]:
import pyLDAvis
# VISUALIZATION
def prepareLDAData(lda_model, lda_df, vectorizer):
    data = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        'doc_lengths': list(lda_df['wordCount']),
        'term_frequency':vectorizer.vocabulary_,
        'topic_term_dists': lda_model.components_
    } 
    return data

# load the pre-prepared pyLDAvis data
lda_data = prepareLDAData(lda_model=lda_model, lda_df=df3, vectorizer=vectorizer)
prepared_data = pyLDAvis.prepare(**lda_data)
pyLDAvis.display(prepared_data)

  default_term_info = default_term_info.sort_values(


In [32]:
def nameTopics():
    default = ["Seriöshet", "Kundtjänst (generisk)", "Lager", "Betyg(noise)", "Returhantering", "Kundtjänst (tel)", "Leverans/Paket", "Bok/Jul (noise)", "Leveranstid", "Installation", "Möbler (noise)", "Kläder (noise)", "Fraktkostnad", "Betalning", "Linser + rabatt (noise)", "Service (noise)", "problem", "Lev/betal-alternativ", "Hemsida (sök)", "Hemsida (tydlighet)", "Leverans/sortiment", "Leveranstid 2", "Leverans(fungera)", "Frakt (noise)", "Verkstad/Service"] # These have to be manually changed by a human! (for now)
    names = []
    print("q -> quit")
    for i in range(n_topics):
        query = input(f"Namn för topic nr {i+1}: ")
        if query == "q":
            return default
        names.append(query)
    return names

In [33]:
%%capture
topicNames = nameTopics()

def classifier(review="hemsidan går att läsa men de är inte bra på att skicka paket. Min stol kom aldrig fram"):
    cleanR = review # cleanReview(review)
    vector = vectorizer.transform([cleanR])
    v = vector.toarray()
    
    topic_probs = lda_model.transform(v)
    results = []
    for i, t_prob in enumerate(topic_probs[0]):
        if t_prob > 0.10:
            results.append([i, t_prob])
    
    return results

def classifierFromX(topicProbs):
    results = []
    for reviewTopics in topicProbs:
        topicP = [0 for i in range(n_topics)]
        for i, t_prob in enumerate(reviewTopics):
            if t_prob > 0.10: # Only keep probabilities where we are kind of sure
                topicP[i] = t_prob
        results.append(topicP)

    return results
    
    

In [17]:
results = classifierFromX(X_topics)
df = pd.DataFrame(results)
df["index"] = df3["index"]# Not fantastic
reviewInfoDf = pd.merge(df3, df)
reviewInfoDf.head()
reviewInfoDf.shape

Unnamed: 0,review,index,grade,company,wordCount
0,leverans,0,5,Dustin,1
1,beställning förmiddag timme hemlevera timme ma...,1,2,Dustin,15
2,fruktansvärd leveranstid,2,2,Dustin,2
3,känna förtroende dustin home märkt flertal ord...,3,5,Dustin,13
4,märka tydlig seriösa företag varm webbutik utb...,4,5,Dustin,17


In [18]:
reviewInfoDf.head()

Unnamed: 0,review,index,grade,company,wordCount,0,1,2,3,4,...,15,16,17,18,19,20,21,22,23,24
0,leverans,0,5,Dustin,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.314286,0.0,0.0,0.0
1,beställning förmiddag timme hemlevera timme ma...,1,2,Dustin,15,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423256,0.0
2,fruktansvärd leveranstid,2,2,Dustin,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.563636,0.0,0.0
3,känna förtroende dustin home märkt flertal ord...,3,5,Dustin,13,0.0,0.12,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.234286,0.0,0.0,0.0,0.0
4,märka tydlig seriösa företag varm webbutik utb...,4,5,Dustin,17,0.0,0.121569,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.278431,0.160784,0.0,0.0


In [81]:
# Main calculation
companyScores = {}
companyOverallScores = {}
overallScore = 0
reviewCount = 0
reviewTopicDist = [0 for i in range(n_topics)]
previousCompany = reviewInfoDf["company"].iloc[0]
n = 0
for index, row in reviewInfoDf.iterrows():
    company = row["company"]
    if company == previousCompany:
        reviewCount += 1
    else:
        companyScores[previousCompany] = [prob/(2*reviewCount) for prob in reviewTopicDist]
        companyOverallScores[previousCompany] = overallScore/reviewCount
        reviewTopicDist = [0 for i in range(n_topics)]
        overallScore = 0
        previousCompany = company
        reviewCount = 1
        
    scores = list(row.iloc[5:n_topics+5])
    for index in range(len(scores)):
        reviewTopicDist[index] += (row["grade"]-3)*scores[index]
    overallScore += row["grade"]
        
    

In [84]:
infoOut = {}
for company in companyScores.items(): # We can mmake this much more complicated
    name = company[0]
    scores = company[1]
    npScores = np.array(scores).astype(float)
    indexRange = npScores.argsort()
    bottomTopics = indexRange[0:4]
    topTopics = indexRange[-4:]
    infoOut[name] = [topTopics, bottomTopics, npScores]
    

In [92]:
outInfo = []
for company in infoOut.items():
    bestTopics = company[1][0][::-1]
    worstTopics = company[1][1]
    bestStr, worstStr = "", ""
    bestList, worstList = [], []

    for t in bestTopics:
        score = company[1][2][t] * 25
        if score > 0.1:
            bestStr += topicNames[t] + " (Score: " + str(score) + ")\n"
            bestList.append([topicNames[t], score])
    for t in worstTopics:
        score = company[1][2][t] * 25
        if score < -0.1:
            worstStr += topicNames[t] + " (Score: " + str(score) + ")\n"
            worstList.append([topicNames[t], score])
    if len(bestStr) == 0: bestStr = "-\n"
    if len(worstStr) == 0: worstStr = "-\n"
    infoStr = f"{company[0]} is best at:\n{bestStr}and worst at:\n{worstStr}Overall Score: {companyOverallScores[company[0]]}\n"
    outInfo.append([company[0], bestList, worstList, infoStr])
    

In [94]:
import csv
with open('info.csv', 'w', encoding="utf-8") as f:
    write = csv.writer(f) 
    write.writerow(["Company", "BestList", "WorstList", "Info"])
    write.writerows(outInfo)