In [1]:
import pandas as pd
df = pd.read_csv('companyReviews.csv', encoding='utf8')
df2 = df[df['review'].notna()]
df2["index"] = df2.index # Not fantastic

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["index"] = df2.index # Not fantastic


In [2]:
import spacy_udpipe
spacy_udpipe.download("sv")

In [None]:
# Clean the code from words with limited information and lemmatize
from langdetect import detect

def initClean(df):
    removeWords = ["!", ".", ",", "\r", "\n", "-", "'", '"', "^", "(", ")", "’", ":", ";", "+", "?"]
    with open("Stoppord.txt", encoding="utf-8") as file:
        data = file.readlines()
        for line in data:
            removeWords.append(line.strip())
    nlp = spacy_udpipe.load("sv")
    return removeWords, nlp

removeWords, nlp = initClean(df2)

def cleanReview(review):
    try:
        lang = detect(review)
    except:
        lang = "ERROR"
        # print("Error", review)
        
    if lang != "sv":
        return "WRONG_LANGUAGE"
    else: # Clean the text, Lemmatize, remove stopwords
        doc = nlp(review)
        newText = ""
        for token in doc:
            if token.lemma_.lower() in removeWords:
                pass
            else:
                newText += token.lemma_.lower() + " "
        if len(newText) == 0:
            return "WRONG_LANGUAGE"
        newText = newText[:-1]
        newText = newText.strip()
        return newText
    
df3 = pd.DataFrame()
df3["review"] = df2["review"].apply(cleanReview) # barf
df3['grade'] = df2['grade'].astype(int)
df3['company'] = df2['company'].astype(str)
df3 = df3[df3['review'] != "WRONG_LANGUAGE"]
df3 = df3[df3['review'].notna()]

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import umap.umap_ as umap

vectorizer = TfidfVectorizer(min_df=10, max_features=10000, ngram_range=(1, 2))
vz = vectorizer.fit_transform(df3['review'])

svd_model = TruncatedSVD(n_components=50)
svd_tfidf = svd_model.fit_transform(vz)

umap_model = umap.UMAP(n_components=2)
umap_tfidf = umap_model.fit_transform(svd_tfidf)

In [None]:
from sklearn.cluster import DBSCAN
eps = 1 # how close points should be to each other to be considered a part of a cluster, e.g 2.5
min_samples = 20 # the minimum number of points to form a dense region, e.g 15
dbscan = DBSCAN(eps=eps,min_samples=min_samples)
dbscan_model = dbscan.fit(umap_tfidf)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np
import lda

# create document term matrix
min_df = 4 # minimum required occurences of a word, e.g 4
max_features = 10000 # max number of unique words, e.g 10000

vectorizer = CountVectorizer(min_df=min_df, max_features=max_features, ngram_range=(1, 2)) # unigram & bigram
X = vectorizer.fit_transform(df3["review"])


# build LDA model
n_topics = 25 # pick the number of topics, e.g 5
n_iter = 2000 # number of learning iterations, e.g 2000

lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(X) # X is document term matrix

# Print the topics found by the LDA model
n_top_words = 10
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
vocab = vectorizer.get_feature_names()

In [None]:
df3["wordCount"] = df3["review"].apply(lambda x: len(x.split()))

In [None]:
# df3.to_csv("processed.csv", encoding="utf-8")
# df3 = pd.read_csv('processed.csv', encoding='utf8')

In [None]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

In [None]:
import pyLDAvis
# VISUALIZATION
def prepareLDAData(lda_model, lda_df, vectorizer):
    data = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        'doc_lengths': list(lda_df['wordCount']),
        'term_frequency':vectorizer.vocabulary_,
        'topic_term_dists': lda_model.components_
    } 
    return data

# load the pre-prepared pyLDAvis data
lda_data = prepareLDAData(lda_model=lda_model, lda_df=df3, vectorizer=vectorizer)
prepared_data = pyLDAvis.prepare(**lda_data)
pyLDAvis.display(prepared_data)

In [None]:
%%capture
def classifier(review="hemsidan går att läsa men de är inte bra på att skicka paket. Min stol kom aldrig fram"):
    # topicNames = ["Seriöshet", "Kundtjänst (generisk)", "Lager", "Betyg(noise)", "Returhantering", "Kundtjänst (tel)", "Leverans/Paket", "Bok/Jul (noise)", "Leveranstid", "Installation", "Möbler (noise)", "Kläder (noise)", "Fraktkostnad", "Betalning", "Linser + rabatt (noise)", "Service (noise)", "problem", "Lev/betal-alternativ", "Hemsida (sök)", "Hemsida (tydlighet)", "Leverans/sortiment", "Leveranstid 2", "Leverans(fungera)", "Frakt (noise)", "Verkstad/Service"] # These have to be manually changed by a human! (for now)
    cleanR = review # cleanReview(review)
    vector = vectorizer.transform([cleanR])
    v = vector.toarray()
    
    topic_probs = lda_model.transform(v)
    results = []
    for i, t_prob in enumerate(topic_probs[0]):
        if t_prob > 0.10:
            results.append([i, t_prob])
    
    return results

def classifierFromX(topicProbs):
    n = 0
    results = []
    for reviewTopics in topicProbs:
        topicP = [0 for i in range(n_topics)]
        for i, t_prob in enumerate(reviewTopics):
            if t_prob > 0.10: # Only keep probabilities where we are kind of sure
                topicP[i] = t_prob
        results.append(topicP)
        n += 1
        if n == 4:
            break
    return results
    
    

In [None]:
results = classifierFromX(X_topics)
df = pd.DataFrame(results)
df["index"] = df.index # Not fantastic
reviewInfoDf = pd.concat([df3, df])
reviewInfoDf.head()

In [None]:
reviewInfoDf.head()

In [None]:
# Main calculation
companyScores = {}
reviewCount = 0
reviewTopicDist = [0 for i in range(n_topics)]
previousCompany = reviewInfoDf["company"].iloc[0]
n = 0
for index, row in reviewInfoDf.iterrows():
    company = row["company"]
    if company == previousCompany:
        reviewCount += 1
    else:
        companyScores[previousCompany] = [prob/reviewCount for prob in reviewTopicDist]
        previousCompany = company
        reviewCount = 1
        
    
    print(row.iloc[:25])
        
    
    
    n+=1
    if n == 4:
        break
    