# Summarize Course Evaluations

In [None]:
# Print a one-line banner describing this notebook.
print("Python Notebook to summarize course evaluations!")

In [None]:
# Installation of libraries
!pip install squarify
!pip install transformers

## Data Analysis

### Load Questions and Responses from CSV

In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# additional libraries
import seaborn as sbn
import squarify as sqfy

In [None]:
# hugging face transformers
from transformers import pipeline

In [None]:
import os

In [None]:
# Config variables
# Relative paths of the two input CSV files: the evaluation questions and
# the student responses. Both are read with pd.read_csv further down.
_DATA_FILE_QUESTIONS_ = "../data/te_questions.csv"
_DATA_FILE_RESPONSES_ = "../data/te_responses.csv"

#### Load Data

In [None]:
# Read the questions CSV into a dataframe.
df_questions = pd.read_csv(filepath_or_buffer=_DATA_FILE_QUESTIONS_)

In [None]:
# Read the responses CSV into a dataframe.
df_responses = pd.read_csv(filepath_or_buffer=_DATA_FILE_RESPONSES_)

In [None]:
# Inspect questions dataframe
print("\nQuestions dataframe:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
df_questions.info()
print("-"*50)
# number of rows
print("Number of questions: ", len(df_questions))
print("-"*50)
# first 5 rows
print(df_questions.head())
print("-"*50)

In [None]:
# Summary statistics about the evaluation forms.
print("\nQuestions stats:")
# number of distinct forms (unique FormNumber values)
print("Number of unique forms: ", df_questions['FormNumber'].nunique())
# number of question rows per form, computed once and reused below
questions_per_form = df_questions.groupby('FormNumber').size()
# smallest form
print("Min # of questions for a form: ", questions_per_form.min())
# largest form
print("Max # of questions for a form: ", questions_per_form.max())
# average form size
print("Avg # of questions for a form: ", questions_per_form.mean())

In [None]:
# Histogram of the number of questions per form.
# The figure size is set by creating the figure explicitly before plotting.
plt.figure(figsize=(10, 8))

form_sizes = df_questions.groupby('FormNumber').size()
hist_axes = form_sizes.plot.hist(bins=10)
hist_axes.set_title('Distribution of questions per form')
hist_axes.set_xlabel('Number of questions')
hist_axes.set_ylabel('Number of forms')



In [None]:
# Same data as a bar plot: one bar per form, bar height = number of questions.
plt.figure(figsize=(10, 8))
form_sizes = df_questions.groupby('FormNumber').size()
bar_axes = form_sizes.plot.bar()
bar_axes.set_title('Distribution of questions per form')
bar_axes.set_xlabel('Form number')
bar_axes.set_ylabel('Number of questions')



In [None]:

# Count the distinct question texts across all forms.
unique_question_count = df_questions['Question'].nunique()
print("\nUnique questions count:")
print(unique_question_count)
print("-"*50)

# To list the distinct questions themselves, print
# df_questions['Question'].unique() under a "\nUnique questions:" header.

In [None]:
# now we analyze the responses dataframe

# Inspect responses dataframe
print("\nResponses dataframe:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
df_responses.info()
print("-"*50)
# number of rows
print("Number of responses: ", len(df_responses))
print("-"*50)
# first 5 rows
print(df_responses.head())

In [None]:
# Pie chart: share of responses per Career.
# Six colors for the six expected Career values (UGRD, GRAD, PHRM, LAW, MED, OPT).
# They are applied to the wedges in value_counts() order.
colors_career = ['#C8102E', '#00B388', '#640817', '#888B8D', '#FFF9D9', '#F6BE00']

fig = plt.figure(figsize=(15, 12))
fig.set_facecolor('white')
career_counts = df_responses['Career'].value_counts()
pie_axes = career_counts.plot.pie(
    autopct='%1.1f%%',
    wedgeprops={'alpha': 0.75},
    colors=colors_career,
)
# blank out the y label that the pie plot would otherwise show
pie_axes.set_ylabel('')
pie_axes.set_title('Responses by Career')


In [None]:
# Horizontal bar chart: number of responses per Career.
# Passing a list as `color` gives each bar its own color.
fig = plt.figure(figsize=(15, 12))
fig.set_facecolor('white')
career_counts = df_responses['Career'].value_counts()
bar_axes = career_counts.plot.barh(color=colors_career)
bar_axes.set_title('Responses by Career')
bar_axes.set_xlabel('Number of responses')
bar_axes.set_ylabel('Career')
# Label each bar with its count. The enumerate position matches the bar
# position because both come from the same value_counts() Series.
for bar_position, response_count in enumerate(career_counts):
    bar_axes.text(response_count, bar_position, str(response_count))


In [None]:
# Pie chart: share of responses per instruction mode (InstrModeDescr).
fig = plt.figure(figsize=(15, 12))
fig.set_facecolor('white')
mode_counts = df_responses['InstrModeDescr'].value_counts()
pie_axes = mode_counts.plot.pie(autopct='%1.1f%%', wedgeprops={'alpha': 0.75})
# blank out the y label that the pie plot would otherwise show
pie_axes.set_ylabel('')
pie_axes.set_title('Responses by Instruction Mode')


In [None]:
# Horizontal bar chart: number of responses per instruction mode.
fig = plt.figure(figsize=(15, 12))
fig.set_facecolor('white')
mode_counts = df_responses['InstrModeDescr'].value_counts()
bar_axes = mode_counts.plot.barh()
bar_axes.set_title('Responses by Instruction Mode')
bar_axes.set_xlabel('Number of responses')
bar_axes.set_ylabel('Instruction Mode')
# Label each bar with its count (bar order == value_counts() order).
for bar_position, response_count in enumerate(mode_counts):
    bar_axes.text(response_count, bar_position, str(response_count))

In [None]:
# Pie chart: share of responses per Component (lecture/lab etc).
fig = plt.figure(figsize=(15, 12))
fig.set_facecolor('white')
df_responses['Component'].value_counts().plot(kind='pie', autopct='%1.1f%%', wedgeprops={'alpha':0.75})
# Same finishing touches as the other pie charts in this notebook:
# blank y label and a descriptive title.
plt.ylabel('')
plt.title('Responses by Component')

In [None]:
# Horizontal bar chart: number of responses per Component (lecture/lab etc).
fig = plt.figure(figsize=(15, 12))
fig.set_facecolor('white')
component_counts = df_responses['Component'].value_counts()
bar_axes = component_counts.plot.barh()
bar_axes.set_title('Responses by Component')
bar_axes.set_xlabel('Number of responses')
bar_axes.set_ylabel('Component')
# Label each bar with its count (bar order == value_counts() order).
for bar_position, response_count in enumerate(component_counts):
    bar_axes.text(response_count, bar_position, str(response_count))


In [None]:
# Show every distinct academic group present in the responses.
distinct_acad_groups = df_responses['AcadGroup'].unique()
print("\nDistinct academic groups:")
print(distinct_acad_groups)

In [None]:
# Horizontal bar chart: number of responses per academic group.
fig = plt.figure(figsize=(15, 12))
fig.set_facecolor('white')
acad_group_counts = df_responses['AcadGroup'].value_counts()
bar_axes = acad_group_counts.plot.barh()
bar_axes.set_title('Responses by Academic Group')
bar_axes.set_xlabel('Number of responses')
bar_axes.set_ylabel('Academic Group')
# Label each bar with its count (bar order == value_counts() order).
for bar_position, response_count in enumerate(acad_group_counts):
    bar_axes.text(response_count, bar_position, str(response_count))

In [None]:
# Tree map: one rectangle per academic group, sized by number of responses.
plt.figure(figsize=(15, 12))
# Count once so sizes and labels are guaranteed to be in the same order.
acad_group_counts = df_responses['AcadGroup'].value_counts()
sizes = acad_group_counts.values
labels = acad_group_counts.index
# One color per group, spread evenly over the Spectral colormap.
colors = [plt.cm.Spectral(i/float(len(labels))) for i in range(len(labels))]
sqfy.plot(sizes=sizes, label=labels, color=colors, alpha=0.75)
# Same finishing touches as the other tree maps in this notebook:
# hide the axes and add a title.
plt.axis('off')
plt.title('Responses by Academic Group')

In [None]:
# Show every distinct subject present in the responses.
distinct_subjects = df_responses['Subject'].unique()
print("\nDistinct subjects:")
print(distinct_subjects)

In [None]:
# Tree map (squarify): one rectangle per subject, sized by number of responses.
# A very large figure is used for this plot.
plt.figure(figsize=(50, 40))
subject_counts = df_responses['Subject'].value_counts()
sqfy.plot(
    sizes=subject_counts,
    label=subject_counts.index,
    text_kwargs={'fontsize': 12},
    alpha=0.75,
)
plt.axis('off')
plt.title('Responses by Subject')


In [None]:
# Tree map of responses per course, restricted to Subject == 'MATH'.
# A course is identified by the (Subject, Catalog) pair, hence the two-key grouping.
plt.figure(figsize=(50, 40))
# keep only the MATH rows
is_math = df_responses['Subject'] == 'MATH'
df_math = df_responses.loc[is_math]
# number of responses per (Subject, Catalog); the index supplies the labels
math_course_counts = df_math.groupby(['Subject', 'Catalog']).size()
sqfy.plot(
    sizes=math_course_counts,
    label=math_course_counts.index,
    text_kwargs={'fontsize': 14},
    alpha=0.75,
)
plt.axis('off')
plt.title('Responses by Department + Course')


In [None]:
# Tree map of responses per course, restricted to Subject == 'PHAR'.
# A course is identified by the (Subject, Catalog) pair, hence the two-key grouping.
plt.figure(figsize=(50, 40))
# keep only the PHAR rows
is_phar = df_responses['Subject'] == 'PHAR'
df_phar = df_responses.loc[is_phar]
# number of responses per (Subject, Catalog); the index supplies the labels
phar_course_counts = df_phar.groupby(['Subject', 'Catalog']).size()
sqfy.plot(
    sizes=phar_course_counts,
    label=phar_course_counts.index,
    text_kwargs={'fontsize': 14},
    alpha=0.75,
)
plt.axis('off')
plt.title('Responses by Department + Course')


In [None]:
# Count how many distinct entries exist at each level of the hierarchy:
# academic group (college) > subject (department) > catalog (course) > section,
# plus the number of distinct evaluation forms referenced by the responses.

# Colleges: distinct AcadGroup values
acad_groups = df_responses['AcadGroup'].unique()
cnt_acad_units = len(acad_groups)
print(f"Distinct academic groups: {cnt_acad_units}")

# Departments: distinct Subject values
subjects = df_responses['Subject'].unique()
cnt_subjects = len(subjects)
print(f"Distinct subjects: {cnt_subjects}")

# Courses: one group per (Subject, Catalog) pair
course_keys = ['Subject', 'Catalog']
cnt_catalogs = df_responses.groupby(course_keys).size()
print(f"Distinct subject+catalogs: {len(cnt_catalogs)}")

# Sections: one group per (Subject, Catalog, ClassNbr) triple
section_keys = ['Subject', 'Catalog', 'ClassNbr']
cnt_sections = df_responses.groupby(section_keys).size()
print(f"Distinct subject+catalog+sections: {len(cnt_sections)}")

# Forms: distinct EvaluationForm values
evaluation_forms = df_responses['EvaluationForm'].unique()
cnt_forms = len(evaluation_forms)
print(f"Distinct forms: {cnt_forms}")


In [None]:
# Print every distinct form referenced by the responses and report whether
# the questions dataframe has an entry for it.
print("\nDistinct forms:")
forms_in_responses = df_responses['EvaluationForm'].unique()
forms_in_responses.sort()
# The known form numbers do not change inside the loop, so collect them once
# instead of recomputing unique() for every form.
known_forms = set(df_questions['FormNumber'].unique())
for form in forms_in_responses:
    print(form)
    if form in known_forms:
        print('Form found')
    else:
        print("No form found")


In [None]:
# sentiment analysis
_SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
# Built on first use and then reused, so the model is not reloaded on every call.
_sentiment_pipeline = None


def sentiment_analysis(text):
    """Run the sentiment-analysis pipeline on ``text``.

    Args:
        text: The input handed to the Hugging Face pipeline; the caller in
            this notebook passes a list of comment strings.

    Returns:
        The pipeline's output, unchanged.
    """
    global _sentiment_pipeline
    if _sentiment_pipeline is None:
        _sentiment_pipeline = pipeline("sentiment-analysis", model=_SENTIMENT_MODEL)
    # truncation=True cuts inputs that exceed the model's maximum sequence
    # length instead of letting the model fail on a long comment.
    return _sentiment_pipeline(text, truncation=True)


In [None]:

# Summarization pipelines keyed by model name, so each model is loaded only once.
_SUMMARIZATION_PIPELINES = {}


def summarizer(model, text):
    """Summarize ``text`` with the given Hugging Face summarization model.

    The summary length is bounded relative to the whitespace-separated word
    count of ``text``: at least 10% and at most 20% of it.

    Args:
        model: Model identifier passed to ``pipeline("summarization", ...)``.
        text: The text to summarize.

    Returns:
        The pipeline's output, unchanged. The caller in this notebook reads
        ``result[0]['summary_text']``.
    """
    summarization_pipeline = _SUMMARIZATION_PIPELINES.get(model)
    if summarization_pipeline is None:
        summarization_pipeline = pipeline("summarization", model=model)
        _SUMMARIZATION_PIPELINES[model] = summarization_pipeline

    cnt_word_tokens = len(text.split())
    # Clamp the bounds: for a text with few whitespace-separated words the raw
    # 10% / 20% values round down to 0, which is not a usable generation length.
    min_length = max(1, int(0.1 * cnt_word_tokens))
    max_length = max(min_length + 1, int(0.2 * cnt_word_tokens))
    return summarization_pipeline(text, min_length=min_length, max_length=max_length, do_sample=False)


In [None]:
# now we need to iterate through each section
# get the form number for each section
# look up the questions for each section df_questions
# then we can get the responses for each question for each section df_responses
# for each response we can do sentiment analysis - use hugging face transformers library
# then we count how many are +ve, -ve and neutral
# then we concatenate all the comments for the question for each section
# and then summarize the comments for each question for each section
# then we write it to a file
def process_section(instructor, subject, catalog, section_nunber, df_section_responses, df_questions):
    """Analyze one section's free-text comments and write a report per question.

    The section must use exactly one evaluation form. Each question of that
    form is paired, in order, with one comment column of
    ``df_section_responses``. For every question the comments are cleaned,
    scored with ``sentiment_analysis``, joined with ``|`` and, when the joined
    text is long enough, summarized with ``summarizer``. The result is written
    to ``../data/out/<model>/<subject>_<catalog>_<section>_<instructor>_<column>.txt``.

    Args:
        instructor: Instructor name; used only in the output file name.
        subject: Subject code; used only in the output file name.
        catalog: Catalog number; used only in the output file name.
        section_nunber: Section identifier; printed and used in the file name.
        df_section_responses: Response rows of this section; must have an
            ``EvaluationForm`` column.
        df_questions: All questions, with ``FormNumber`` and ``Question``
            columns.

    Returns:
        None. Returns early, after printing a message, when the section has
        no form number or more than one.
    """
    print("="*88)
    print(f"Section: {section_nunber}")
    print(f"Number of responses: {len(df_section_responses)}")

    # A section is only processed when it maps to exactly one form.
    form_numbers = df_section_responses['EvaluationForm'].unique()
    if len(form_numbers) == 0:
        print("No form number found")
        return
    if len(form_numbers) > 1:
        print("Multiple form numbers found")
        return

    form_number = form_numbers[0]
    print(f"Form number: {form_number}")
    # questions that belong to this form
    df_form_questions = df_questions.loc[df_questions['FormNumber'] == form_number]
    print(f"Number of questions: {len(df_form_questions)}")
    print(f"Questions: {df_form_questions}")

    # Joined comments shorter than this many characters are not summarized.
    SUMMARIZER_LOWER_LIMIT = 200
    # Comment columns are addressed by position because their names are not
    # descriptive.
    # NOTE(review): assumes the first comment column sits at index 12 and the
    # comment columns are contiguous, one per question - confirm against the
    # CSV header.
    comment_starting_index = 12
    # Summarization model (alternative tried earlier: "allenai/led-base-16384").
    # Defined up front so the output folder is known even when nothing is
    # summarized.
    model = "google-t5/t5-small"
    # '/' in the model id is replaced so it does not create nested folders.
    out_dir = f"../data/out/{model.replace('/', '-')}"

    for comment_iter, (index, row) in enumerate(df_form_questions.iterrows()):
        print(f"index = {index}")
        question = row['Question']
        print(f"Question: {question}")

        # the n-th question reads the n-th comment column
        column_name = df_section_responses.columns[comment_starting_index + comment_iter]
        print(f"Column name: {column_name}")

        responses = df_section_responses[column_name]
        print(f"Number of responses: {len(responses)}")

        # drop missing values and empty strings
        responses = responses.dropna()
        responses = responses[responses != '']
        print(f"Number of responses after cleaning: {len(responses)}")

        # str() keeps the text handling below working if a column was parsed
        # as a non-string dtype.
        responses = [str(response) for response in responses.tolist()]

        # Sentiment analysis. The result is reset for every question so a
        # question without comments never reuses the previous question's.
        if responses:
            sentiments = sentiment_analysis(responses)
        else:
            print("No responses to analyze for sentiments")
            sentiments = []

        # flatten each comment to one line, then join them with '|'
        responses = [response.replace('\n', ' ') for response in responses]
        comments = '|'.join(responses)

        len_comments = len(comments)
        print(f"Length of comments: {len_comments}")
        if len_comments < SUMMARIZER_LOWER_LIMIT:
            print("Not enough comments to summarize")
            # too short to summarize: the joined comments are the summary
            summary_text = comments
        else:
            print("="*88)
            print(f"len_comments: {len_comments}")
            summary = summarizer(model, comments)
            summary_text = summary[0]['summary_text']
            print(f"len_summary: {len(summary_text)}")

        # Write question, response count, comments, sentiments and summary to
        # one file per question. exist_ok avoids a check-then-create race.
        os.makedirs(out_dir, exist_ok=True)
        file_name = f"{out_dir}/{subject}_{catalog}_{section_nunber}_{instructor}_{column_name}.txt"
        # explicit encoding so non-ASCII comments are written the same way on
        # every platform
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(f"Question: {question}\n")
            file.write(f"Responses: {len(responses)}\n")
            file.write(f"Comments: {comments}\n")
            file.write(f"Sentiments: {sentiments}\n")
            file.write(f"Summary: {summary_text}\n")
            

    

# Driver: collect the distinct section ids (ClassNbr) and run process_section
# on the rows of one hard-coded section.
distinct_sections = df_responses['ClassNbr'].unique()
print(f"Distinct sections: {len(distinct_sections)}")

processed = False
for section_nunber in distinct_sections:
    # only the section whose ClassNbr equals "10989A" is processed
    if section_nunber != "10989A":
        continue

    # Column layout noted for the responses file:
    # InstructorName,Subject,Catalog,ClassNbr,Term,SessionCode,distanceed,EvaluationForm,
    # Comment1,Column2,Comment2,Column3,Comment3,Column4,Comment4,Column5,Comment5,
    # Column6,Comment6,Column7,Comment7,Column8,Comment8,Column9,Comment9,
    # Column10,Comment10,Column11,Comment11,,
    section_mask = df_responses['ClassNbr'] == section_nunber
    df_section_responses = df_responses.loc[section_mask]

    # instructor, subject and catalog are taken from the section's first row
    instructor, subject, catalog = (
        df_section_responses[field].iloc[0]
        for field in ('InstructorName', 'Subject', 'Catalog')
    )

    print(f"Now processing section: {section_nunber}")
    process_section(instructor, subject, catalog, section_nunber, df_section_responses, df_questions)
    processed = True
    # stop after the first (and only) matching section
    break
    
    
   