# Topic Modeling

# Install bertopic

In [None]:
!pip install bertopic[visualization] --quiet



# Project Path and Dataset Path are Set

In [None]:
proj_path = '/content/drive/MyDrive/colab_data/NLP PROJ'
dataset_file = proj_path + '/Comments.csv'

# Import Required Libraries

In [None]:
import pandas as pd  #package for data analysis
import numpy as np   #packahe to deal with arrays
from copy import deepcopy  #to create a copy (clone) of data without any reference to original data.
from bertopic import BERTopic   #importing model for topic modelling

# Read Dataset

In [None]:
df = pd.read_csv(dataset_file, encoding='cp1252')

# Take a peek on Dataset

In [None]:
df.columns = ["Text"]
df.head()

Unnamed: 0,Text
0,Gluten free options
1,"Change the menu, it's becoming tired."
2,Additional dishes using same ingredients on cu...
3,Very limited
4,"Chicken alfredo Pasta dish, fried shrimp, , Ch..."


# Check null data and remove it from the data

In [None]:
df.isnull().sum()

Text    1
dtype: int64

In [None]:
df.dropna(inplace=True)

# A little Text Preprocessing

In [None]:
stopwords = [' of ', ' more ', ' or ', ' on ', ' as ', ' and ', ' was ', ' the ', ' to ', ' is ', ' a ', ' i ']
def preprocessing(text):
  text = text.lower()
  text = "".join(word for word in text if word not in stopwords)
  return text

In [None]:
df["cleaned_text"] = df.loc[:,'Text'].apply(lambda x : preprocessing(x))

In [None]:
df.head()

Unnamed: 0,Text,cleaned_text
0,Gluten free options,gluten free options
1,"Change the menu, it's becoming tired.","change the menu, it's becoming tired."
2,Additional dishes using same ingredients on cu...,additional dishes using same ingredients on cu...
3,Very limited,very limited
4,"Chicken alfredo Pasta dish, fried shrimp, , Ch...","chicken alfredo pasta dish, fried shrimp, , ch..."


In [None]:
#Store data in a list
text = list(df.loc[:,'cleaned_text'].values)

In [None]:
#Peek on first five text
text[:5]

['gluten free options',
 "change the menu, it's becoming tired.",
 'additional dishes using same ingredients on current menu such as bang bang shrimp on a salad or taco; the blackened mahi-mahi as a taco or salad or/and entree; healthier options of current dishes;  and my kids recommend a smaller kid sized hamburger- we really loved everything',
 'very limited',
 'chicken alfredo pasta dish, fried shrimp, , chicken pot pie, crab claws alaskan  legs, flat bread pizza, avacado california rolls ,']

# Create BERTopic Model using English Language

In [None]:
bertTopicModel = BERTopic(language="english")

# Model fitting according to the Dataset

In [None]:
topics, probs = bertTopicModel.fit_transform(text)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]



# Have a visual od possible clusters of topics in the Given Dataset

In [1]:
bertTopicModel.visualize_topics()

NameError: name 'bertTopicModel' is not defined

# Check on random data

In [None]:
check_text = "additional dishes using same ingredients on current menu such as bang bang shrimp on a salad or taco; blackened mahi-mahi as taco salad entree;"
check_text = preprocessing(check_text)

In [None]:
our_topics = bertTopicModel.find_topics(check_text)

# This function gets top 10 topics from the clusters

In [None]:
def get_ten_topics(topics_clusters):
  topics = []
  for i in topics_clusters[0]:
    for j in bertTopicModel.get_topic(i):
      topics.append(j[0])
      if len(topics) >=10:   #you can specify number of topics here------ here its value is 10
        break
  return [topic for topic in topics if topic+'s' not in topics and ' '+topic+' ' not in stopwords]

In [None]:
print( "Top 10 topics are: ")
print("\t".join(get_ten_topics(our_topics)))

Top 10 topics are: 
salads	wedge	better	vegetables	choices	chicken	seafood	tacos	pasta	sandwiches
