## Creating the Chatbot

In [53]:
question = "Tips for WGU"

In [54]:
ask_me_anything(question)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 761 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 4 tokens


You asked: <b>Tips for WGU</b>

ChatBot says: <b>
Tips for WGU include: 
1. Look up the course on Reddit and check the course's Chatter to see common stumbling blocks, and be aware of when the post was made as courses can change over time. 
2. Take a break if needed. It's better to understand a concept over a two day period than to think you know it over 6 hours. 
3. When studying for exams, try to understand why a particular answer is the correct one and why the others can’t be correct. 
4. You're not beholden to do the WGU coursework. You can use your own materials as long as you can pass the WGU exams. 
5. Take a look at the course work first, and see if there were questions at the end of the chapter or question pools at the end of the course.</b>

#### Set up

In [34]:
# Importing libraries
from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
from langchain import OpenAI
from IPython.display import Markdown, display
import os

import config

import pandas as pd
import datetime as dt

#### Importing datasets

In [16]:
post_df = pd.read_csv('wgu_subreddit_posts.csv')
post_df.head()

Unnamed: 0,post_id,subreddit,created_utc,selftext,post_url,post_title,link_flair_text,score,num_comments,upvote_ratio
0,k9jnq0,WGU,1607483000.0,Following is a list of free resources that are...,https://www.reddit.com/r/WGU/comments/k9jnq0/f...,Free Resources For WGU Students,,969,161,1.0
1,f9fmrt,WGU,1582659000.0,,https://i.redd.it/xpsb18v1g4j41.png,And after 6.5 years of college... Whoop!,I'm DONE!,650,56,0.99
2,mb4i9q,WGU,1616467000.0,,https://i.redd.it/ogxbs6w2woo61.jpg,"The boy is refusing to sleep, so tonight he is...",,623,28,1.0
3,ogc63d,WGU,1625766000.0,,https://i.redd.it/49oab5hoy0a71.png,Me on the days I have a call with my mentor,,603,67,0.99
4,10x1sot,WGU,1675872000.0,,https://i.redd.it/f8w825mj21ha1.jpg,I wanted my confetti to be a bit special :),I'm DONE!,580,53,0.99


In [17]:
comments_df = pd.read_csv('wgu_subreddit_comments.csv')
comments_df.head()

Unnamed: 0,post_id,comment
0,k9jnq0,A few more:\n\nSpotify premium / Hulu / Showti...
1,k9jnq0,50% off ANY Microsoft exam voucher when you si...
2,k9jnq0,I’ll add several more to this list. Most of th...
3,k9jnq0,One of the more comprehensive resource posts I...
4,k9jnq0,"also, free license for windows 10 education, w..."


### Data Cleaning

In [22]:
# Converting created_utc to datetime format
post_df['created_date'] = post_df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(x))
# Adding created year column for data relevancy
post_df['created_year'] = post_df['created_date'].dt.year
post_df.head()

Unnamed: 0,post_id,subreddit,created_utc,selftext,post_url,post_title,link_flair_text,score,num_comments,upvote_ratio,created_date,created_year
0,k9jnq0,WGU,1607483000.0,Following is a list of free resources that are...,https://www.reddit.com/r/WGU/comments/k9jnq0/f...,Free Resources For WGU Students,,969,161,1.0,2020-12-08 21:10:20,2020
1,f9fmrt,WGU,1582659000.0,,https://i.redd.it/xpsb18v1g4j41.png,And after 6.5 years of college... Whoop!,I'm DONE!,650,56,0.99,2020-02-25 13:29:02,2020
2,mb4i9q,WGU,1616467000.0,,https://i.redd.it/ogxbs6w2woo61.jpg,"The boy is refusing to sleep, so tonight he is...",,623,28,1.0,2021-03-22 21:32:41,2021
3,ogc63d,WGU,1625766000.0,,https://i.redd.it/49oab5hoy0a71.png,Me on the days I have a call with my mentor,,603,67,0.99,2021-07-08 12:35:43,2021
4,10x1sot,WGU,1675872000.0,,https://i.redd.it/f8w825mj21ha1.jpg,I wanted my confetti to be a bit special :),I'm DONE!,580,53,0.99,2023-02-08 09:52:36,2023


In [24]:
# Merge posts with their comments
df = post_df.merge(comments_df, on='post_id', how='left')
# Remove rows with missing comments
df = df[~df['comment'].isnull()]

In [25]:
df

Unnamed: 0,post_id,subreddit,created_utc,selftext,post_url,post_title,link_flair_text,score,num_comments,upvote_ratio,created_date,created_year,comment
0,k9jnq0,WGU,1.607483e+09,Following is a list of free resources that are...,https://www.reddit.com/r/WGU/comments/k9jnq0/f...,Free Resources For WGU Students,,969,161,1.00,2020-12-08 21:10:20,2020,A few more:\n\nSpotify premium / Hulu / Showti...
1,k9jnq0,WGU,1.607483e+09,Following is a list of free resources that are...,https://www.reddit.com/r/WGU/comments/k9jnq0/f...,Free Resources For WGU Students,,969,161,1.00,2020-12-08 21:10:20,2020,50% off ANY Microsoft exam voucher when you si...
2,k9jnq0,WGU,1.607483e+09,Following is a list of free resources that are...,https://www.reddit.com/r/WGU/comments/k9jnq0/f...,Free Resources For WGU Students,,969,161,1.00,2020-12-08 21:10:20,2020,I’ll add several more to this list. Most of th...
3,k9jnq0,WGU,1.607483e+09,Following is a list of free resources that are...,https://www.reddit.com/r/WGU/comments/k9jnq0/f...,Free Resources For WGU Students,,969,161,1.00,2020-12-08 21:10:20,2020,One of the more comprehensive resource posts I...
4,k9jnq0,WGU,1.607483e+09,Following is a list of free resources that are...,https://www.reddit.com/r/WGU/comments/k9jnq0/f...,Free Resources For WGU Students,,969,161,1.00,2020-12-08 21:10:20,2020,"also, free license for windows 10 education, w..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74959,111fb4p,wgu_devs,1.676311e+09,"im really concern about this topic, not really...",https://www.reddit.com/r/wgu_devs/comments/111...,Will the AI make our degrees worthless? thinki...,,0,18,0.35,2023-02-13 11:52:53,2023,"Why, do you think C# devs only work for Bing o..."
74960,111fb4p,wgu_devs,1.676311e+09,"im really concern about this topic, not really...",https://www.reddit.com/r/wgu_devs/comments/111...,Will the AI make our degrees worthless? thinki...,,0,18,0.35,2023-02-13 11:52:53,2023,Lol
74961,111fb4p,wgu_devs,1.676311e+09,"im really concern about this topic, not really...",https://www.reddit.com/r/wgu_devs/comments/111...,Will the AI make our degrees worthless? thinki...,,0,18,0.35,2023-02-13 11:52:53,2023,😂😂😂😂
74962,111fb4p,wgu_devs,1.676311e+09,"im really concern about this topic, not really...",https://www.reddit.com/r/wgu_devs/comments/111...,Will the AI make our degrees worthless? thinki...,,0,18,0.35,2023-02-13 11:52:53,2023,C# is a Microsoft product.


In [28]:
# Combine all posts and comments and save to a text file
df_tmp = df[['post_title', 'selftext', 'comment']].astype(str)
agg_comments = df_tmp.groupby(['post_title', 'selftext'])['comment'].apply('. '.join).reset_index()
agg_comments

Unnamed: 0,post_title,selftext,comment
0,"""Easy"" Courses For Acceleration?",Hey everyone! I've fallen behind the last term...,"Anything in the GenEds is an easy win, if you ..."
1,"""Please pan under the desk.""",,Lift the cat. He could be laying on a cheat s...
2,"""The Missing Semester of your CS Education""",There's been a few things I've thought were mi...,Nice. I continually wondered if this will be c...
3,"""We should be able to do a mid-term migration ...",This is what my student/program mentor emailed...,That's good news. My mentor hasn't responded t...
4,"""now rotate your camera 360 degrees""",,Man this is the truest stuff lol. Also how oft...
...,...,...,...
4930,worried about D084 - Cloud Platform Solutions,I am getting ready to take this one and frankl...,Just passed this one last night. I will try to...
4931,¿Best Business degree from WGU with best retur...,A little justification for your recommendation...,Best return on investment? No idea. Needed a...
4932,“I DID IT” Mandatory Post!!🎉🎊,,Congratulations!!!. Nice! Congratulations and ...
4933,“I can prolly bust this out in 2 weeks” *3 mon...,,"Also, if you need ANY HELP at all with this de..."


In [31]:
agg_comments['combined_text'] = agg_comments.astype(str).agg('. '.join, axis=1)
all_text = ' '.join(agg_comments['combined_text'])

# Save text to txt file
f = open("textdata/all_text_reddit.txt", "w",  encoding="utf-8")
f.write(all_text)
f.close()

In [43]:
# Create a function to construct index from txt file
def construct_index(directory_path):
    # set max input size
    max_input_size = 4096
    # set number of output tokens
    num_outputs = 256
    # set max chunk overlap
    max_chunk_overlap = 20
    # set chunk size limit
    chunk_size_limit = 600
    
    #define LLM (ChatGPT gpt-3.5-turbo)
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo", max_tokens=num_outputs))
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
    
    documents = SimpleDirectoryReader(directory_path).load_data()
    
    index = GPTSimpleVectorIndex(
        documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )
    
    index.save_to_disk('index.json')
    
    return index

#Constructing the chatbot
def ask_me_anything(question):
    index = GPTSimpleVectorIndex.load_from_disk('index.json')
    response = index.query(question, response_mode="default")
    
    display(Markdown(f"You asked: <b>{question}</b>"))
    display(Markdown(f"ChatBot says: <b>{response.response}</b>"))

In [44]:
 # Set OpenAI key
os.environ["OPENAI_API_KEY"] = config.api_key

In [37]:
# Construct index (takes file in folder, splits into chunks, then embeds it with OpenAI's embeddings API)
construct_index('textdata')

KeyboardInterrupt: 