In [64]:
!pip install mpld3
!pip install bs4
!pip install tk

In [65]:
import numpy as np
import pandas as pd
import tkinter as tk
from tkinter import *
import random
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
import os
import codecs # for encoding and decoding
from sklearn import feature_extraction
import mpld3  #The mpld3 project brings together Matplotlib, the popular Python-based graphing library, and D3js, the popular JavaScript library for creating interactive data visualizations for the web. The result is a simple API for exporting your matplotlib graphics to HTML code which can be used within the browser, within standard web pages, blogs, or tools such as the IPython notebook.
from sklearn.metrics.pairwise import cosine_similarity  
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Clustering algorithms
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation #LDA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD #LSA

In [66]:
# NLTK resources used below: the stop-word list, plus the Punkt sentence
# tokenizer models that nltk.sent_tokenize / nltk.word_tokenize load when called.
# "punkt_tab" is the resource name looked up by newer NLTK releases; older
# releases look up "punkt". Requesting both keeps a fresh environment working.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

In [67]:
def read_data(path = "articles1.csv", selected_column = 'content'):
    """Load one text column from a CSV file and return it as a cleaned list of strings.

    The file is decoded as latin-1, so UTF-8 punctuation and accented letters
    arrive as multi-character mojibake sequences; the known sequences are mapped
    back to the intended characters. Rows whose value is null are dropped,
    filler runs (``_____`` and ``' '``) are removed and every run of two or
    more spaces is collapsed to a single space.

    Args:
        path: Path of the CSV file to read.
        selected_column: Name of the column that holds the text.

    Returns:
        list[str]: The cleaned values of ``selected_column`` in file order.
    """
    # (text as it appears after latin-1 decoding, intended text); all are literal matches.
    literal_fixes = [
        ('\xe2\x80\x99', "'"),    # right single quote
        ('\xe2\x80\x98', "'"),    # left single quote
        ('\xe2\x80\x9c', '"'),    # left double quote
        ('\xe2\x80\x9d', '"'),    # right double quote
        ('\xe2\x80\x94', '-'),    # em dash
        ('\xe2\x80\xa6', '...'),  # ellipsis
        ('\xe2\x80\xa2', '•'),    # bullet
        ('\xc3\xa9', 'é'),
        ('\xc3\xb3', 'ó'),
        ('\xc3\xbc', 'ü'),
        ('\xc3\xa1', 'á'),
        ('_____', ''),
        ("' '", ''),
    ]

    table = pd.read_csv(path, encoding='latin-1')

    # Work on a standalone Series (not a view of a filtered frame) so the
    # replacements below never trigger pandas' SettingWithCopyWarning.
    texts = table[selected_column]
    texts = texts[pd.notnull(texts)].copy()

    # regex=False: the patterns are plain text, independent of the pandas default.
    for broken, fixed in literal_fixes:
        texts = texts.str.replace(broken, fixed, regex=False)

    # Collapse any run of spaces in one pass; replacing "  " and then "   "
    # one after the other leaves e.g. three spaces as two.
    texts = texts.str.replace(r' {2,}', ' ', regex=True)

    return texts.tolist()

In [68]:
# Load the article texts from the CSV file and clean them; the resulting list
# is the corpus the GUI callbacks below draw from.
Data = read_data(path="articles1.csv", selected_column='content')

In [69]:
# English stop words from NLTK, kept in a set for membership tests
stop_words = {word for word in stopwords.words('english')}

# English Snowball stemmer shared by the stemming tokenizers
stemmer = SnowballStemmer("english")

In [70]:
# Tokenizer helpers: split text into sentences and words, dropping stop words, tokens without letters and tokens containing punctuation

def sentence_seperator(text):
    """Split ``text`` into a list of sentences using NLTK's sentence tokenizer."""
    return nltk.sent_tokenize(text)

def tokenize_only_single(text):
    """Word-tokenize ``text`` in one pass and return the tokens worth keeping.

    A token is kept when it contains at least one ASCII letter, is not an
    English stop word (compared case-insensitively) and contains no non-word
    character, so tokens carrying punctuation such as "don't" are dropped.
    Case is preserved.

    Args:
        text: The text to tokenize (no explicit sentence split is done here).

    Returns:
        list[str]: The kept tokens in their original order.
    """
    return [
        token
        for token in nltk.word_tokenize(text)
        if re.search('[a-zA-Z]', token)
        and token.casefold() not in stop_words
        # raw string: '\W' in a normal string is an invalid escape sequence
        and not re.search(r'\W', token)
    ]



def tokenize_only(text):
    """Split ``text`` into sentences, word-tokenize each and return lower-cased kept tokens.

    A token is kept when it contains at least one ASCII letter, is not an
    English stop word and contains no non-word character (tokens carrying
    punctuation are dropped).

    Args:
        text: The text to tokenize; may span several sentences.

    Returns:
        list[str]: The kept tokens, lower-cased, in their original order.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [
        token
        for token in lowered
        if re.search('[a-zA-Z]', token)
        and token.casefold() not in stop_words
        # raw string: '\W' in a normal string is an invalid escape sequence
        and not re.search(r'\W', token)
    ]



def tokenize_and_stem(text):
    """Split ``text`` into sentences, word-tokenize each and return the stems of the kept tokens.

    A token is kept when it contains at least one ASCII letter, is not an
    English stop word (compared case-insensitively) and contains no non-word
    character. Each kept token is passed through the module-level ``stemmer``.

    Args:
        text: The text to tokenize; may span several sentences.

    Returns:
        list[str]: One stem per kept token, in token order.
    """
    words = (
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [
        stemmer.stem(token)
        for token in words
        if re.search('[a-zA-Z]', token)
        and token.casefold() not in stop_words
        # raw string: '\W' in a normal string is an invalid escape sequence
        and not re.search(r'\W', token)
    ]



def tokenize_and_stem_single(text):
    """Word-tokenize ``text`` in one pass and return the stems of the kept tokens.

    A token is kept when it contains at least one ASCII letter, is not an
    English stop word (compared case-insensitively) and contains no non-word
    character. Each kept token is passed through the module-level ``stemmer``.

    Args:
        text: The text to tokenize (no explicit sentence split is done here).

    Returns:
        list[str]: One stem per kept token, in token order.
    """
    return [
        stemmer.stem(token)
        for token in nltk.word_tokenize(text)
        if re.search('[a-zA-Z]', token)
        and token.casefold() not in stop_words
        # raw string: '\W' in a normal string is an invalid escape sequence
        and not re.search(r'\W', token)
    ]



In [71]:
# `topics` is a list of sentences (one document per sentence)
def tf_idf(topics = []):
    """Build a TF-IDF matrix over ``topics`` using ``tokenize_and_stem_single``.

    Terms that occur in more than 95% or in fewer than 5% of the documents are
    ignored (``max_df`` / ``min_df``).

    Args:
        topics: List of sentences; each sentence is one document.

    Returns:
        tuple: ``(matrix, terms)`` where ``matrix`` is the fitted document-term
        matrix and ``terms`` is the list of feature names; ``terms[j]`` names
        column ``j`` of ``matrix``.
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.95,
        min_df=0.05,
        stop_words='english',
        tokenizer=tokenize_and_stem_single,
    )

    # fit the vectorizer to the data
    matrix = tfidf_vectorizer.fit_transform(topics)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        terms = tfidf_vectorizer.get_feature_names_out().tolist()
    else:
        terms = tfidf_vectorizer.get_feature_names()

    return matrix, terms

In [72]:
# `topics` is a list of articles, each containing many sentences
def tf_idf2(topics = []):
    """Build a TF-IDF matrix over ``topics`` using ``tokenize_and_stem``.

    Terms that occur in more than 95% or in fewer than 5% of the documents are
    ignored (``max_df`` / ``min_df``).

    Args:
        topics: List of articles; each article is one document.

    Returns:
        tuple: ``(matrix, terms)`` where ``matrix`` is the fitted document-term
        matrix and ``terms`` is the list of feature names; ``terms[j]`` names
        column ``j`` of ``matrix``.
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.95,
        min_df=0.05,
        stop_words='english',
        tokenizer=tokenize_and_stem,
    )

    # fit the vectorizer to the data
    matrix = tfidf_vectorizer.fit_transform(topics)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        terms = tfidf_vectorizer.get_feature_names_out().tolist()
    else:
        terms = tfidf_vectorizer.get_feature_names()

    return matrix, terms

In [73]:
# NOT USED ---------------------------------
def count_vectorizer(topics = []):
    """Build a raw term-count matrix over ``topics`` using ``tokenize_and_stem_single``.

    Terms that occur in more than 95% of the documents or in fewer than two
    documents are ignored (``max_df`` / ``min_df``).

    Args:
        topics: List of documents (strings).

    Returns:
        tuple: ``(matrix, terms)`` where ``matrix`` is the fitted document-term
        count matrix and ``terms`` is the list of feature names.
    """
    cv = CountVectorizer(
        max_df=0.95,
        min_df=2,
        stop_words='english',
        tokenizer=tokenize_and_stem_single,
    )
    dtm = cv.fit_transform(topics)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(cv, "get_feature_names_out"):
        terms = cv.get_feature_names_out().tolist()
    else:
        terms = cv.get_feature_names()

    # Return the matrix that was actually built (the name `matrix` was never defined here).
    return dtm, terms
# NOT USED ---------------------------------

In [74]:
def LDA_model(topics, matrix, terms, num_clusters=3, num_words_of_name=5):
    """Cluster documents with Latent Dirichlet Allocation and name each cluster.

    Every document is assigned to the component with the largest value in its
    transformed row. A cluster name is made of the highest-weighted terms of
    its component, strongest first, each followed by a space.

    Args:
        topics: The documents behind ``matrix`` (not used by the computation).
        matrix: Document-term matrix to fit.
        terms: Feature names; ``terms[j]`` names column ``j`` of ``matrix``.
        num_clusters: Number of components to fit.
        num_words_of_name: Maximum number of terms in each cluster name.

    Returns:
        tuple: ``(names, frame, categories)`` where ``frame`` is a DataFrame
        with one ``cluster`` column (one row per document), ``categories``
        holds the cluster ids ordered from most to fewest documents and
        ``names[k]`` is the name of ``categories[k]``.
    """
    model = LatentDirichletAllocation(n_components=num_clusters, random_state=42)
    model.fit(matrix)
    topic_results = model.transform(matrix)

    # final cluster of each document
    frame = pd.DataFrame({'cluster': topic_results.argmax(axis=1)})

    # cluster ids sorted from biggest to smallest by number of documents
    categories = frame['cluster'].value_counts().keys()

    names = []
    for index in categories:
        weights = model.components_[index]
        # Clamp to [0, number of terms]: a plain [-n:] slice would return
        # every term when n == 0.
        top_count = max(0, min(len(weights), num_words_of_name))
        strongest_first = weights.argsort()[::-1][:top_count]
        names.append("".join(terms[i] + " " for i in strongest_first))

    return names, frame, categories

In [75]:
def NMF_model(topics, matrix, terms, num_clusters=3, num_words_of_name=5):
    """Cluster documents with Non-negative Matrix Factorization and name each cluster.

    Every document is assigned to the component with the largest value in its
    transformed row. A cluster name is made of the highest-weighted terms of
    its component, strongest first, each followed by a space.

    Args:
        topics: The documents behind ``matrix`` (not used by the computation).
        matrix: Document-term matrix to fit.
        terms: Feature names; ``terms[j]`` names column ``j`` of ``matrix``.
        num_clusters: Number of components to fit.
        num_words_of_name: Maximum number of terms in each cluster name.

    Returns:
        tuple: ``(names, frame, categories)`` where ``frame`` is a DataFrame
        with one ``cluster`` column (one row per document), ``categories``
        holds the cluster ids ordered from most to fewest documents and
        ``names[k]`` is the name of ``categories[k]``.
    """
    model = NMF(n_components = num_clusters, random_state=42)
    model.fit(matrix)
    topic_results = model.transform(matrix)

    # final cluster of each document
    frame = pd.DataFrame({'cluster': topic_results.argmax(axis=1)})

    # cluster ids sorted from biggest to smallest by number of documents
    categories = frame['cluster'].value_counts().keys()

    names = []
    for index in categories:
        weights = model.components_[index]
        # Clamp to [0, number of terms]: a plain [-n:] slice would return
        # every term when n == 0.
        top_count = max(0, min(len(weights), num_words_of_name))
        strongest_first = weights.argsort()[::-1][:top_count]
        names.append("".join(terms[i] + " " for i in strongest_first))

    return names, frame, categories

In [76]:
def LSA_model(topics, matrix, terms, num_clusters=3, num_words_of_name=5):
    """Cluster documents with Latent Semantic Analysis (truncated SVD) and name each cluster.

    Every document is assigned to the component with the largest value in its
    transformed row. A cluster name is made of the highest-weighted terms of
    its component, strongest first, each followed by a space.

    Args:
        topics: The documents behind ``matrix`` (not used by the computation).
        matrix: Document-term matrix to fit.
        terms: Feature names; ``terms[j]`` names column ``j`` of ``matrix``.
        num_clusters: Number of components to fit.
        num_words_of_name: Maximum number of terms in each cluster name.

    Returns:
        tuple: ``(names, frame, categories)`` where ``frame`` is a DataFrame
        with one ``cluster`` column (one row per document), ``categories``
        holds the cluster ids ordered from most to fewest documents and
        ``names[k]`` is the name of ``categories[k]``.
    """
    # Fixed seed, as in LDA_model / NMF_model, so repeated runs give the same clusters.
    model = TruncatedSVD(n_components = num_clusters, random_state=42)
    model.fit(matrix)
    topic_results = model.transform(matrix)

    # final cluster of each document
    frame = pd.DataFrame({'cluster': topic_results.argmax(axis=1)})

    # cluster ids sorted from biggest to smallest by number of documents
    categories = frame['cluster'].value_counts().keys()

    names = []
    for index in categories:
        weights = model.components_[index]
        # Clamp to [0, number of terms]: a plain [-n:] slice would return
        # every term when n == 0.
        top_count = max(0, min(len(weights), num_words_of_name))
        strongest_first = weights.argsort()[::-1][:top_count]
        names.append("".join(terms[i] + " " for i in strongest_first))

    return names, frame, categories

In [77]:
def kmean_model(topics, matrix, terms, num_clusters=3, num_words_of_name=5):
    """Cluster documents with KMeans and name each cluster after its top centroid terms.

    A cluster name is made of the terms with the largest centroid weights,
    strongest first, each followed by a space.

    Args:
        topics: The documents behind ``matrix`` (not used by the computation).
        matrix: Document-term matrix to fit.
        terms: Feature names; ``terms[j]`` names column ``j`` of ``matrix``.
        num_clusters: Number of clusters to fit.
        num_words_of_name: Maximum number of terms in each cluster name.

    Returns:
        tuple: ``(names, frame, categories)`` where ``frame`` is a DataFrame
        with one ``cluster`` column (one row per document), ``categories``
        holds the cluster ids ordered from most to fewest documents and
        ``names[k]`` is the name of ``categories[k]``.
    """
    # Fixed seed so repeated runs on the same data give the same clusters.
    model = KMeans(n_clusters=num_clusters, random_state=42)
    model.fit(matrix)

    # final cluster of each document -> e.g. [1, 1, 0, 0, 0, 2, 1, 2, 0]
    frame = pd.DataFrame({'cluster': model.labels_.tolist()})

    # cluster ids sorted from biggest to smallest by number of documents, e.g. [2, 0, 1]
    categories = frame['cluster'].value_counts().keys()

    # for every centroid, term indices ordered from largest to smallest weight
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]

    # Clamp to [0, number of terms] so a negative request cannot be read as a
    # "drop the last n" slice.
    top_count = max(0, min(order_centroids.shape[1], num_words_of_name))

    names = []
    for cluster_id in categories:
        top_terms = order_centroids[cluster_id, :top_count]
        names.append("".join(terms[index] + " " for index in top_terms))

    return names, frame, categories

In [78]:
def mini_batch_kmeans_model(topics, matrix, terms, num_clusters=3, num_words_of_name=5):
    """Cluster documents with MiniBatchKMeans and name each cluster after its top centroid terms.

    A cluster name is made of the terms with the largest centroid weights,
    strongest first, each followed by a space.

    Args:
        topics: The documents behind ``matrix`` (not used by the computation).
        matrix: Document-term matrix to fit.
        terms: Feature names; ``terms[j]`` names column ``j`` of ``matrix``.
        num_clusters: Number of clusters to fit.
        num_words_of_name: Maximum number of terms in each cluster name.

    Returns:
        tuple: ``(names, frame, categories)`` where ``frame`` is a DataFrame
        with one ``cluster`` column (one row per document), ``categories``
        holds the cluster ids ordered from most to fewest documents and
        ``names[k]`` is the name of ``categories[k]``.
    """
    # Fixed seed so repeated runs on the same data give the same clusters.
    model = MiniBatchKMeans(n_clusters=num_clusters, random_state=42)
    model.fit(matrix)

    # final cluster of each document -> e.g. [1, 1, 0, 0, 0, 2, 1, 2, 0]
    frame = pd.DataFrame({'cluster': model.labels_.tolist()})

    # cluster ids sorted from biggest to smallest by number of documents
    categories = frame['cluster'].value_counts().keys()

    # for every centroid, term indices ordered from largest to smallest weight
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]

    # Clamp to [0, number of terms] so a negative request cannot be read as a
    # "drop the last n" slice.
    top_count = max(0, min(order_centroids.shape[1], num_words_of_name))

    names = []
    for cluster_id in categories:
        top_terms = order_centroids[cluster_id, :top_count]
        names.append("".join(terms[index] + " " for index in top_terms))

    return names, frame, categories

In [79]:
def _entry_int(entry, default):
    """Return the whole number typed into a Tk entry, or ``default`` when the entry is blank.

    Raises:
        ValueError: If the entry holds text that ``int()`` cannot parse.
    """
    raw = entry.get().strip()
    return int(raw) if raw else default


def apply_algorithm():
    """Callback of the "Apply" button: vectorize, cluster, list and plot the topics.

    Reads the selected algorithm, the route and the numeric options from the
    module-level widgets. In the "Individual" route the sentences of the text
    area are clustered (``tf_idf``); otherwise the articles ``Data[min:max]``
    are clustered (``tf_idf2``) and the module-level ``index_random`` selects
    the row whose topic is reported. Every cluster name with its share of
    documents is added to the list box, followed by the topic name of the
    current article, and the clusters are drawn with ``graph_draw``.

    Invalid numbers, an empty selection, an unknown algorithm and
    ``ValueError``s raised while vectorizing or clustering are reported in the
    list box instead of escaping from the Tk callback.
    """
    clear_listbox()  # start from an empty result list

    selected_model = dropdown_model.get()  # name of the algorithm to run
    route = dropdown_choose.get()          # "Individual" / "On Previous Data"

    # All model functions share the signature
    # (topics, matrix, terms, num_clusters, num_words_of_name).
    model_functions = {
        "KMeans": kmean_model,
        "MiniBatchKMeans": mini_batch_kmeans_model,
        "LDA": LDA_model,
        "LSA": LSA_model,
        "NMF": NMF_model,
    }
    run_model = model_functions.get(selected_model)
    if run_model is None:
        add_to_list("Unknown algorithm: " + selected_model)
        return

    # Blank entries fall back to the defaults; the article range is only
    # parsed in the route that uses it.
    try:
        numOfClusters = _entry_int(entry_num_clusters, 3)
        numOfWords = _entry_int(entry_num_words, 5)
        if route != "Individual":
            lower = _entry_int(entry_min_articles, 0)
            upper = _entry_int(entry_max_articles, 20)
    except ValueError:
        add_to_list("Please enter whole numbers only.")
        return

    # Individual -> sentences of the text area with tf_idf,
    # On Previous Data -> a slice of the loaded articles with tf_idf2.
    if route == "Individual":
        article = text_area.get("1.0", tk.END)  # whole content of the text area
        topics = sentence_seperator(article)
        vectorize = tf_idf
    else:
        topics = Data[lower:upper]
        vectorize = tf_idf2

    if len(topics) == 0:
        add_to_list("Nothing to analyse: the text or the article range is empty.")
        return

    try:
        matrix, terms = vectorize(topics)
        names, frame, categories = run_model(topics, matrix, terms, numOfClusters, numOfWords)
    except ValueError as error:
        # e.g. no term survives min_df/max_df, or more clusters than documents
        add_to_list("Could not build the topics: " + str(error))
        return

    if route == "Individual":
        nameOfTopic = names[0]  # name of the largest cluster
    else:
        nameOfTopic = getTopic(names, frame, categories, index_random)  # cluster of the shown article

    # share of documents per cluster, in the same order as `categories` / `names`
    percentages = ((frame['cluster'].value_counts().values / len(topics)) * 100).round(1)

    # one list-box row per requested cluster: "<name>  ==>  <share>%"
    for i in range(numOfClusters):
        share = percentages[i] if i < len(percentages) else 0
        name = names[i] if i < len(names) else "!!"
        add_to_list(name + "  ==>  " + str(share) + "%")

    # spacer row, then the topic name of the current article
    add_to_list("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
    if route == "Individual":
        add_to_list("The Best Topic Name of This Article:")
    else:
        add_to_list("The Topic Name of This Article:")

    add_to_list(nameOfTopic)

    # scatter plot of the documents, coloured by cluster
    graph_draw(matrix, names, categories, percentages, frame)

In [80]:
def getTopic(names, frame, categories, index_random):
    """Return the name of the cluster that row ``index_random`` of ``frame`` belongs to.

    ``names[k]`` is taken to be the name of cluster ``categories[k]``. An empty
    string is returned when the row's cluster does not occur in ``categories``.
    """
    cluster_column = frame["cluster"]
    cluster = cluster_column[index_random]  # cluster id stored for that row

    # names of every position whose cluster id equals the row's cluster
    matches = [
        names[position]
        for position, category in enumerate(categories)
        if category == cluster
    ]
    return matches[-1] if matches else ""

In [81]:
def clear_all():
    """Reset the form: empty the entry fields, the text area and the list box."""
    clear_entries()
    clear_text_area()
    clear_listbox()

    
def clear_text_area():
    """Delete the whole content of the module-level ``text_area`` widget."""
    text_area.delete("1.0", tk.END)
    
def clear_listbox():
    """Remove every row from the module-level ``listbox`` widget."""
    listbox.delete(0, tk.END)
    
def clear_entries():
    """Empty the four numeric entry fields (clusters, words, min and max article)."""
    numeric_entries = (
        entry_num_clusters,
        entry_num_words,
        entry_min_articles,
        entry_max_articles,
    )
    for entry in numeric_entries:
        entry.delete(0, tk.END)

In [82]:
def add_to_list(topic_name):
    """Append ``topic_name`` as a new row at the end of the module-level ``listbox``."""
    listbox.insert(tk.END, topic_name)
    

index_random = 0  # position of the last generated article inside the range it was drawn from
mini = 0          # default start of the article range
maxi = 20         # default end (exclusive) of the article range
def generate_article():
    """Callback of "Generate Random Article": show a random article in the text area.

    In the "Individual" route the article is drawn from the whole of ``Data``;
    otherwise it is drawn from ``Data[min:max]``, the same slice that
    ``apply_algorithm`` clusters. The position of the chosen article inside
    that selection is stored in the module-level ``index_random`` so that the
    row of the article can be looked up after clustering. Nothing is inserted
    when the selection is empty.

    Raises:
        ValueError: If a range entry holds text that ``int()`` cannot parse.
    """
    global index_random  # without this the assignment below would only create a local

    clear_text_area()

    # Local copies: assigning to `mini` / `maxi` here would make them locals of
    # this function, leaving them unbound whenever an entry is blank.
    lower, upper = mini, maxi
    if len(entry_min_articles.get()) > 0:  # user typed the start of the range
        lower = int(entry_min_articles.get())

    if len(entry_max_articles.get()) > 0:  # user typed the end of the range
        upper = int(entry_max_articles.get())

    route = dropdown_choose.get()
    if route == "Individual":
        candidates = Data
    else:
        candidates = Data[lower:upper]

    if len(candidates) == 0:
        return

    # randrange excludes its upper bound, so the index is always valid
    # (randint(0, len(Data)) could return len(Data)).
    index_random = random.randrange(len(candidates))
    article = candidates[index_random]

    text_area.insert(tk.END, article)  # show the article in the text area

In [83]:
# Colour per cluster id (0-15): the position in the list is the cluster id
cluster_colors = dict(enumerate([
    '#1b9e77',
    '#d95f02',
    '#7570b3',
    '#e7298a',
    '#66a61e',
    '#D2691E',
    '#3333ff',
    '#cc00cc',
    '#ff4d94',
    '#ff3300',
    '#e6e600',
    '#73e600',
    '#006600',
    '#66ccff',
    '#336699',
    '#4d0099',
]))


def graph_draw(matrix, names, categories, persentages, frame):
    """Draw a 2-D scatter plot of the documents, coloured and labelled by cluster.

    Cosine distances between the rows of ``matrix`` are projected onto two
    dimensions with MDS; every document becomes one marker. The legend shows
    ``"<name> ==> <share>%"`` for each cluster.

    Args:
        matrix: Document-term matrix, one row per document.
        names: Cluster names; ``names[k]`` belongs to ``categories[k]``.
        categories: Cluster ids in the same order as ``names``.
        persentages: Share of documents per cluster, same order as ``names``.
        frame: DataFrame with a ``cluster`` column holding each document's cluster id.
    """
    # legend text per cluster id
    legend_labels = [
        name + " ==> " + str(share) + "%"
        for name, share in zip(names, persentages)
    ]
    cluster_names = dict(zip(categories, legend_labels))

    # cosine distance between every pair of documents
    similarity_distance = 1 - cosine_similarity(matrix)

    # project the distances onto a plane; pos has one row per document
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(similarity_distance)
    xs, ys = pos[:, 0], pos[:, 1]

    # Equivalent of the "%matplotlib inline" line magic, written as plain
    # Python so the function is also valid outside IPython.
    try:
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass

    # MDS coordinates together with the cluster of each document
    df = pd.DataFrame(dict(x=xs, y=ys, label=frame['cluster']))

    fig, ax = plt.subplots(figsize=(17, 9))

    for cluster_id, group in df.groupby('label'):
        ax.plot(
            group.x, group.y, marker='o', linestyle='', ms=20,
            label=cluster_names.get(cluster_id, str(cluster_id)),
            # the palette keys are 0..len-1; wrap around for larger cluster ids
            color=cluster_colors[cluster_id % len(cluster_colors)],
            mec='none',
        )

    # Axis set-up is the same for every group, so it is done once. The MDS
    # axes have no meaning: hide ticks and tick labels. Booleans are required;
    # the string 'off' is truthy and leaves the ticks visible.
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, labelleft=False)

    ax.legend(numpoints=1)
    plt.show()

In [84]:
# ---------- GUI ----------
# Builds the Tk window at module level and blocks in frm.mainloop() at the end.
# The widgets created here (entries, dropdown variables, listbox, text area) are
# the module-level names read by the callback functions defined above.
# NOTE(review): each widget below is pack()ed and then place()d; presumably only
# the place() coordinates take effect - confirm and drop the pack() calls.

# window size in pixels
x_dimensional = 1100
y_dimensional = 650
frm = tk.Tk()
# NOTE(review): "Modiling" looks like a typo for "Modeling" in the user-facing title.
frm.title("Topic Modiling")
frm.geometry(str(x_dimensional)+"x"+str(y_dimensional))
Label(frm, text="Topic Modiling", font=("Kartika", 40, "underline"), fg='#0000cc').place(x=350, y=10)
# not referenced anywhere else in the visible code
title_color="#314f81"

# y of the options row / left margin of the text area
y_direction = 120
x_direction = 20


# Range of articles: two entries for the start and end index into Data
s = "Range Of Articles (0 - " +str(len(Data)) + ") : " 
Label(frm, text=s, font=("Arial", 10)).place(x=20, y=90)
entry_min_articles = tk.Entry(frm, width=10)
entry_min_articles.pack()
entry_min_articles.place(x=210, y=92)

entry_max_articles = tk.Entry(frm, width=10)
entry_max_articles.pack()
entry_max_articles.place(x=280, y=92)




# Algorithm dropdown: the selected name is compared in apply_algorithm
# Create a variable to store the selected item
dropdown_model = tk.StringVar(frm)
# Create a list of options for the dropdown
options = ['KMeans', 'MiniBatchKMeans', 'LDA', 'LSA', 'NMF']
# Set the default value of the dropdown
dropdown_model.set(options[0])
# Create the dropdown widget
dropdown_model_var = tk.OptionMenu(frm, dropdown_model, *options)
dropdown_model_var.pack()
dropdown_model_var.place(x=20, y=y_direction)



# Route dropdown ("Individual" / "On Previous Data"); `options` is re-bound here
# Create a variable to store the selected item
dropdown_choose = tk.StringVar(frm)
# Create a list of options for the dropdown
options = ['Individual', 'On Previous Data']
# Set the default value of the dropdown
dropdown_choose.set(options[0])
# Create the dropdown widget
dropdown_choose_var = tk.OptionMenu(frm, dropdown_choose, *options)
dropdown_choose_var.pack()
dropdown_choose_var.place(x=150, y=y_direction)




# Number of clusters (blank -> default chosen in apply_algorithm)
Label(frm, text="N. Clusters: ", font=("Arial", 12)).place(x=290, y=y_direction+3)
entry_num_clusters = tk.Entry(frm, width=10)
entry_num_clusters.pack()
entry_num_clusters.place(x=380, y=y_direction+4)



# Number of words per topic name (blank -> default chosen in apply_algorithm)
Label(frm, text="N. Words: ", font=("Arial", 12)).place(x=480, y=y_direction+3)
entry_num_words = tk.Entry(frm, width=10)
entry_num_words.pack()
entry_num_words.place(x=560, y=y_direction+3)



# Create a listbox to display the Topics Names
listbox = tk.Listbox(frm, width=68, height=28)
listbox.pack()
listbox.place(x=x_dimensional-450, y=y_direction)



# Text area that shows the article and is read back by apply_algorithm
text_area = tk.Text(frm, width=75, height=25)
text_area.pack()
text_area.place(x=x_direction, y=170)




# Buttons: each one is wired to one of the callbacks defined above
Button(
    frm, 
    text="Apply", 
    command=apply_algorithm, 
    font=("Arial", 18), 
    width=20, 
    height=1, 
    fg="#b30000"
).place(x=710, y=y_dimensional-60)


Button(
    frm, 
    text="Generate Random Article", 
    command=generate_article, 
    font=("Arial", 13), 
    width=22, 
    height=1, 
    fg="#b30000"
).place(x=260, y=y_dimensional-50)

Button(
    frm, 
    text="Clear", 
    command=clear_all, 
    font=("Arial", 13), 
    width=8, 
    height=1, 
    fg="#b30000"
).place(x=180, y=y_dimensional-50)

Button(
    frm, 
    text="Exit", 
    command=frm.destroy, 
    font=("Arial", 13), 
    width=8, 
    height=1, 
    fg="#b30000"
).place(x=20, y=y_dimensional-50)


frm.mainloop() # run form