## Model inference

In this notebook, we will load a previously trained model, explore the learned topics, and predict topics for all the courses in the catalog.

In [27]:
# imports
import sys
import os
import re
sys.path.insert(0, "../")
from utils import scrape_arxiv_abstract
from model import TopicModel
from dataset import ArXivDataset
from gensim.models import LdaModel
from pprint import pprint
from PyPDF2 import PdfReader
import plotly.express as px
import pandas as pd

from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
# Hand the API key from the environment to the LangChain OpenAI wrapper, then
# create the LLM client (temperature 0.9) that is called later to name topics.
OpenAI.api_key = os.environ.get('OPENAI_API_KEY')
llm = OpenAI(temperature=0.9)


In [28]:
# Only report whether a key is configured. Displaying the key itself would
# store the secret in the notebook's saved output.
# NOTE: any key that has ever been displayed in a saved or shared notebook
# must be treated as compromised and rotated.
OpenAI.api_key is not None

### Build topic model



In [29]:
# Locations of the previously saved dataset object and trained model,
# relative to this notebook.
dataset_path = "../object/dataset.obj"
model_path = "../models/lda_n20_p10_r929_c34.7"

# Build the topic model wrapper from the model and dataset paths.
model = TopicModel(model_path, dataset_path)

In [30]:
# Prompt sent to the LLM for each topic; "{values}" is replaced by that
# topic's words and weights when the prompt is formatted.
# NOTE(review): the wording contains typos ("that that", "and the and the",
# "pertinance"). It is kept verbatim because editing it changes the LLM input.
prompt = PromptTemplate(
    template="give me only the topic name knowing that that these are the words and the and the pertinance of each word on the topic: {values}",
    input_variables=["values"],
)

### Investigate topics

Next, let us explore the different topics learned by the model so that we can assign understandable topic names to each cluster.

For this, we will ask an OpenAI language model (called through LangChain) to suggest the best name for each topic.

In [31]:
def givetopicsNames(model):
    """Ask the LLM for a name for every topic in ``model.topics``.

    For each topic, the first 10 ``(word, weight)`` pairs found in the topic's
    second element are flattened into a ``"word weight word weight "`` string,
    inserted into the notebook-level ``prompt`` template and sent to the
    notebook-level ``llm``.

    Args:
        model: object exposing a ``topics`` iterable.
            NOTE(review): each topic is assumed to be indexable, with
            element 1 holding (word, weight) pairs -- confirm against
            ``TopicModel``.

    Returns:
        list: one whitespace-stripped LLM answer per topic, in the order of
        ``model.topics``.
    """
    def describe(pairs):
        # Every pair contributes "<word> <weight> ", trailing space included.
        # str() is used explicitly so the weights render exactly as str() does.
        return ''.join(word + ' ' + str(weight) + ' ' for word, weight in pairs)

    return [
        llm(prompt.format(values=describe(topic[1][:10]))).strip()
        for topic in model.topics
    ]

In [81]:
def saveTopicNames(TopicsNames, path="../data/TopicNames.txt"):
    """Write the topic names to a text file, one name per line.

    Args:
        TopicsNames: iterable of topic-name strings.
        path: destination file. Defaults to ``../data/TopicNames.txt``.
            Any existing content is overwritten.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(path, "w+") as f:
        for topic in TopicsNames:
            f.write(topic + '\n')

In [77]:
def readTopicNames(path="../data/TopicNames.txt"):
    """Load topic names from a text file that holds one name per line.

    Args:
        path: file to read. Defaults to ``../data/TopicNames.txt``.

    Returns:
        list[str]: every line of the file with surrounding whitespace
        stripped, in file order (a blank line yields an empty string).
    """
    # `with` guarantees the handle is closed once the lines have been read.
    with open(path, "r") as f:
        return [line.strip() for line in f]

#### New Topic Names:

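To regenerate the topic names with the language model, enable the generation step in the cell below (it is disabled by default); otherwise the previously saved names are loaded from `../data/TopicNames.txt`.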

In [82]:
# Naming the topics calls the OpenAI API once per topic, so it is opt-in:
# set REGENERATE_TOPIC_NAMES to True to query the LLM again and overwrite the
# saved names. By default the names saved earlier are simply reloaded.
REGENERATE_TOPIC_NAMES = False

if REGENERATE_TOPIC_NAMES:
    saveTopicNames(givetopicsNames(model))

TopicsNames = readTopicNames()
model.set_topic_names(TopicsNames)
pprint(model.topics)

[('Robotic System Design',
  '0.034*"system" + 0.013*"design" + 0.013*"robot" + 0.012*"economic" + '
  '0.010*"urban" + 0.010*"control" + 0.009*"vehicle" + 0.009*"description" + '
  '0.008*"model" + 0.007*"robotic"'),
 ('Fetal Non-Invasive ECG Extraction',
  '0.016*"extraction" + 0.013*"non_invasive" + 0.013*"ecg" + 0.012*"fetal" + '
  '0.011*"ni_fecg" + 0.008*"project" + 0.008*"st" + 0.007*"advance" + '
  '0.007*"datum" + 0.007*"analysis"'),
 ('Industrial System Design',
  '0.027*"project" + 0.021*"session" + 0.017*"process" + 0.016*"model" + '
  '0.014*"industrial" + 0.012*"system" + 0.010*"concept" + 0.009*"design" + '
  '0.007*"management" + 0.007*"engineering"'),
 ('Systems Engineering',
  '0.032*"system" + 0.017*"energy" + 0.015*"model" + 0.010*"description" + '
  '0.009*"problem" + 0.008*"control" + 0.008*"network" + 0.008*"design" + '
  '0.008*"communication" + 0.007*"engineering"'),
 ('Team Communication Skills',
  '0.033*"project" + 0.032*"skill" + 0.023*"language" + 0.017*"d

### Predict topics for all the courses
Let's pass all the courses through the model.

In [15]:
dir_path = "../data/ByCourse/"

# Parallel lists: entry i of "texts" and "course" describes the same file.
courses = {"texts" : [] , "course" : [], "topics" : []}

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
for file in sorted(os.listdir(dir_path)):
    file_path = os.path.join(dir_path, file)
    # Only regular files are parsed. Nothing is recorded for other entries
    # (e.g. sub-directories), so no stale text from a previous file is reused.
    if not os.path.isfile(file_path):
        continue
    reader = PdfReader(file_path)
    # Concatenate the extracted text of every page into one string.
    text_of_file = ''.join(page.extract_text() for page in reader.pages)
    # Turn non-breaking spaces and line breaks into plain spaces so the whole
    # document sits on a single line.
    text_of_file = text_of_file.replace(u'\xa0', u' ')
    text_of_file = text_of_file.replace(u'\n', u' ')
    courses["texts"].append(text_of_file)
    courses["course"].append(file.replace(".pdf",""))

In [16]:
# Matches the description section of a course text: any character followed by
# "Description", up to the last "Quarter number " (the `.+` is greedy).
description_pattern = re.compile(r".Description.+Quarter number ")

# Rebuild the list from scratch so re-running the cell does not append
# duplicates. One entry is stored per course (None when the section is
# missing) so that topics[i] always belongs to course[i].
courses["topics"] = []
for text in courses["texts"]:
    match = description_pattern.search(text)
    courses["topics"].append(model.predict(match.group(0)) if match else None)


In [17]:
# Matches the description section of a course text: any character followed by
# "Description", up to the last "Quarter number " (the `.+` is greedy).
description_pattern = re.compile(r".Description.+Quarter number ")

# Build a separate dict rather than aliasing `courses`, so `courses` is left
# untouched. Only courses whose text contains a description section are kept,
# and the three lists are filled together so index i always refers to the
# same course.
courses2 = {"texts": [], "course": [], "topics": []}
for course_name, text in zip(courses["course"], courses["texts"]):
    match = description_pattern.search(text)
    if not match:
        continue
    courses2["texts"].append(text)
    courses2["course"].append(course_name)
    courses2["topics"].append(
        model.predictTopTopics(match.group(0), numberOfTopics=5)
    )


In [18]:
# Image file name for the course stored at index 1 of courses2["course"].
name = "".join((courses2["course"][1], ".png"))

Now that we have, for each course, its name, its text, and its predicted topics, we can draw a radar chart for every course:

In [19]:
import warnings

plots_dir = "../data/plots/"
# Make sure the output folder exists before any image is written.
os.makedirs(plots_dir, exist_ok=True)

# Silence warnings only while the figures are produced; the previous warning
# filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Pair each course name with the prediction stored at the same index.
    for course_name, prediction in zip(courses2["course"], courses2["topics"]):
        # Element 0 of a prediction is used for the angular axis (theta) and
        # element 1 for the radial values (r).
        fig = px.line_polar(r=prediction[1], theta=prediction[0], line_close=True)

        name = course_name + ".png"
        fig.write_image(plots_dir + name)