# ML Experiments
We want to try to use machine learning to differentiate between differently rated conversations. Here we begin extracting features from the logfiles and experimenting with algorithms.

In [50]:
# Imports
import pandas as pd
import numpy as np
import sklearn
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk import ConfusionMatrix
#from nltk.corpus import opinion_lexicon
#from nltk.stem import WordNetLemmatizer
#from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import linear_model
#from matplotlib import pyplot as plt
import os
import re
import csv
import time
import sys
import matplotlib.pyplot as plt

In [9]:
# nltk downloads
# Fetch the NLTK 'punkt' and 'stopwords' packages. Re-running is harmless:
# per the log below, packages that are already present are reported as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Mount drive
# Colab-only step: exposes Google Drive under /content/drive, which is the root
# of the DATA_DIR / GROUP_FOLDER paths used by this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Directories (both are located under the Drive mount point /content/drive)
# DATA_DIR: the "Dataset Partitions" folder; the training TSV is read from its "Training Data" subfolder.
# GROUP_FOLDER: the parent folder of DATA_DIR; the LIWC dictionary is read from its "LIWC" subfolder.
DATA_DIR = '/content/drive/Shareddrives/Alexa Prize 4 (2020 21)/Data/Rating Analysis/Rating-wise grouped conversations/Dataset Partitions'
GROUP_FOLDER = '/content/drive/Shareddrives/Alexa Prize 4 (2020 21)/Data/Rating Analysis/Rating-wise grouped conversations'

# Data Extraction

In [12]:
def open_file(filepath, delimiter='\t'):
  """
  Load a delimited text file into a pandas DataFrame.

  Params:
    filepath : path of the file to read
    delimiter : str, column separator (tab by default)
  Returns:
    The DataFrame produced by pd.read_csv.
  """
  return pd.read_csv(filepath, sep=delimiter)
def save_to_file(filepath, dataframe, delimiter='\t'):
  """
  Write a DataFrame to a delimited text file, leaving out the index column.

  Params:
    filepath : destination path
    dataframe : pandas DataFrame to write
    delimiter : str, column separator (tab by default)
  """
  dataframe.to_csv(path_or_buf=filepath, sep=delimiter, index=False)

In [13]:
# Load the rating-1, profanity-tagged training partition and display it.
# Each row is one turn; rows of the same conversation share a conversation_id (see the output below).
trainingFrame = open_file(DATA_DIR + "/Training Data/rating-1-training-set-profanity-tagged.tsv")
trainingFrame

Unnamed: 0,date,turn_count,conversation_id,text,grounding_text,response,chosen_ack_rg,chosen_rg,current_topic,midas,finegrained_labels,finegrained_clf_labels,is_profane
0,2021-02-19,0.0,560597369048bdb615f7d10490e47b44f8e64f310f847a...,let's chat,,"Hi, this is an Alexa Prize Socialbot. <amazon:...",,INTRODUCTION,introduction,,['unk'],['more-information'],0
1,2021-02-19,1.0,560597369048bdb615f7d10490e47b44f8e64f310f847a...,good horrible,,"<say-as interpret-as=""interjection"">Oh boy.</s...",,INTRODUCTION,introduction,"['comment', 'back-channeling', 'opinion']","['unk', 'unk', 'unk']","['statement-non-opinion', 'statement-non-opini...",0
2,2021-02-19,2.0,560597369048bdb615f7d10490e47b44f8e64f310f847a...,no,,I get it. I kept it lowkey this year. Next yea...,,INTRODUCTION,introduction,['neg_answer'],['no-answer'],['no-answer'],0
3,2021-02-19,3.0,560597369048bdb615f7d10490e47b44f8e64f310f847a...,no,,"Well, I hope you are able to stay safe, even i...",,INTRODUCTION,introduction,['neg_answer'],['no-answer'],['no-answer'],0
4,2021-02-19,4.0,560597369048bdb615f7d10490e47b44f8e64f310f847a...,nothing,,"Oh, that's ok! For me, I really want to experi...",,INTRODUCTION,introduction,['other_answers'],['no-answer'],['statement-non-opinion'],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45812,2021-05-06,25.0,d59a141d1f85a01e40827e5ab65791db5c0dbf21493202...,yes,,"<amazon:emotion name=""excited"" intensity=""low""...",,RG_SYSTEM_INITIATIVE_MENU,neutral,['pos_answer'],['yes-answer'],['yes-answer'],0
45813,2021-05-06,26.0,d59a141d1f85a01e40827e5ab65791db5c0dbf21493202...,good night,,"I loved chatting with you, let's talk again so...",,GOODBYE,,['closing'],['conversation-closing'],['conversation-closing'],0
45814,2021-05-06,27.0,d59a141d1f85a01e40827e5ab65791db5c0dbf21493202...,who is trying to conversation,,"<amazon:emotion name=""excited"" intensity=""low""...",,DM_GENERATOR,nature,['open_question_factual'],['unk'],['fact-question'],0
45815,2021-05-06,28.0,d59a141d1f85a01e40827e5ab65791db5c0dbf21493202...,are you doing,,"What do you think about this. <prosody rate=""9...",,NATURE,nature,['abandon'],['conversation-opening'],['personal-question'],0


### RG distribution

In [14]:
# Count how often each value of the 'chosen_rg' column occurs and list the values by
# descending count. Missing values are counted too (they show up as nan in the output).
# most_common(1000) returns at most 1000 entries; the output below has fewer, so nothing is cut off.
rg_dist = nltk.FreqDist(trainingFrame['chosen_rg'])
rg_dist.most_common(1000)

[('INTRODUCTION', 20571),
 ('CENTER', 2849),
 (nan, 2622),
 ('DM_GENERATOR', 1606),
 ('NATURE', 1303),
 ('VIDEO_GAMES', 1228),
 ('MOVIESKG', 1141),
 ('ANIMALS', 1140),
 ('TEMPLATE_HANDLER_FALLBACK', 1041),
 ('MUSICKG', 944),
 ('redquestion', 881),
 ('BOOKS', 747),
 ('MUSIC', 742),
 ('GOODBYE', 723),
 ('MOVIES', 688),
 ('EVI', 686),
 ('FOOD', 648),
 ('TVKG', 603),
 ('SPORTSKG', 569),
 ('HOBBIES', 526),
 ('NUTRITION', 504),
 ('RepeatGenerator', 442),
 ('DINOSAURS', 434),
 ('BOARD_GAMES', 433),
 ('HARRYPOTTER', 408),
 ('RG_SYSTEM_INITIATIVE_MENU', 383),
 ('SB_INDEX', 370),
 ('ASTRONOMY', 293),
 ('FunctionalMonolith', 263),
 ('MCU', 257),
 ('COMIC_BOOKS', 182),
 ('PIRATES', 180),
 ('FallbackStrategyInitiative', 109),
 ('ONTOLOGYBOT', 91),
 ('IMMenuChances_normal', 76),
 ('SUPER_BOWL', 48),
 ('ES_INDEX', 41),
 ('WAPO', 37),
 ('WIKIPEDIA', 7),
 ('CONTROLLED_POLICY_DRIVEN_NRG', 1)]

### Topic distribution

In [15]:
# Count how often each value of the 'current_topic' column occurs and show the 50 most
# frequent ones. Missing values are counted as well (nan in the output).
topic_dist = nltk.FreqDist(trainingFrame['current_topic'])
topic_dist.most_common(50)

[('introduction', 21789),
 (nan, 3379),
 ('movies', 2474),
 ('music', 2178),
 ('animals', 1921),
 ('nature', 1749),
 ('video_games', 1440),
 ('sports', 1314),
 ('books', 1220),
 ('dinosaurs', 1029),
 ('harry_potter', 880),
 ('food', 824),
 ('hobbies', 793),
 ('nutrition', 792),
 ('astronomy', 764),
 ('tv', 742),
 ('comic_books', 684),
 ('board_games', 597),
 ('pirates', 472),
 ('menu_topic', 435),
 ('neutral', 142),
 ('super_bowl', 68),
 ('conspiracy', 49),
 ('news', 46),
 ('politics', 18),
 ('history', 7),
 ('artificial_intelligence', 4),
 ('donald_trump', 4),
 ('controversial', 2),
 ('science_and_technology', 1)]

# LIWC Implementation
LIWC code from CSE 143

In [16]:
#This tries to be similar to LIWC, even LIWC's more questionable decisions....
# Examples: dot after a number (123.) is included in the number, hyphens break words, "Mr." is a sentence, yeah...
# That said, I refuse to call "123)." two words... some things are just too crazy..
#The primary interface is score_text(), 
#  though you may want to use Dictionary's score_word() directly on occasion.
#This was built for LIWC's 2007 dictionary with category names/numbers in the dic file
#A line with "%" should precede the category list, if not, expect crashing
#Only words starting with alphanumeric characters count towards the "Word Count" 
#  and are used in the normalizing denominator. 
#Parentheses are counted individually, not in pairs
#Please be careful when editing the code, it is more complicated than would be ideal

from __future__ import division
import re
import string
import os, sys
from collections import Counter, defaultdict

#This actually captures most of what LIWC does...., I have no idea why it sells...
# Token alternatives, tried in this order at every position:
#   1. a number: starts and ends with a digit, may contain digits and . , - : in between,
#      and may carry one trailing dot (so "123." is a single token); needs at least two digits
#   2. a word: starts and ends with a letter, may contain letters, dots and apostrophes
#      in between; needs at least two letters
#   3. any other single non-whitespace character (this is also how one-character words and
#      numbers such as "i" or "5" are produced)
#   4. a newline
_liwc_tokenizer = re.compile(r'(\d[\d\.\,\-\:]*\d\.?|[a-zA-Z][a-zA-Z\.\']*[a-zA-Z]|\S|\n)',re.UNICODE|re.IGNORECASE) 
def score_text(text, raw_counts=False, scores=None, unique_words=None):
    """Tokenize text and score every token against the module-level LIWC dictionary.

    The text is lower-cased and split with _liwc_tokenizer. Each token is passed to
    _dictionary.score_word(); tokens that start with a letter are also given their
    neighbouring tokens so that context-dependent dictionary entries can be resolved.

    Besides the dictionary categories, these entries are maintained:
      'Sentences'          - incremented when a token from Dictionary.sentence_punctuation
                             follows at least one word/number since the last terminator;
                             text left unterminated at the end counts as one more sentence
      'Unique Words'       - len(unique_words) after processing
      'Words Per Sentence' - 'Word Count' / 'Sentences', or 1 when there are no sentences

    @param text: The string to score.
    @param raw_counts: If False (default) the result is passed through normalize_scores();
            if True the raw counts are returned.
    @param scores: If you want to keep a running total, scores should be
            a Counter of previous counts and raw_counts should be set to True!
            The Counter passed in is updated in place.
    @param unique_words: Again, will be created if None. Should be a set(); it is updated in place.
            If used, you'll probably want to override the scores['Unique Words'] category.
    @return: A Counter mapping category name to score.
    """
    if scores is None:
        scores = Counter()
    if unique_words is None:
        unique_words = set()

    all_tokens = _liwc_tokenizer.findall(text.lower())
    last_index = len(all_tokens) - 1
    # Starts as True so that leading punctuation does not count as a sentence.
    sentence_terminated = True
    for i, token in enumerate(all_tokens):
        if len(token) == 0:
            continue

        if token[0].isdigit():  # Numbers
            scores.update(_dictionary.score_word(token))
            sentence_terminated = False
        elif token[0].isalpha():  # Words
            unique_words.add(token)
            previous_token = all_tokens[i - 1] if i > 0 else ''
            next_token = all_tokens[i + 1] if i < last_index else ''
            scores.update(_dictionary.score_word(token, previous_token, next_token))
            sentence_terminated = False
        else:  # Punctuation and stuff
            scores.update(_dictionary.score_word(token))

        if token in Dictionary.sentence_punctuation and not sentence_terminated:
            scores['Sentences'] += 1
            sentence_terminated = True

    # Text that ends without a terminator still forms a sentence.
    if not sentence_terminated:
        scores['Sentences'] += 1

    scores['Unique Words'] = len(unique_words)
    if scores['Sentences'] > 0:
        scores['Words Per Sentence'] = scores['Word Count'] / scores['Sentences']
    else:
        scores['Words Per Sentence'] = 1

    if not raw_counts:
        scores = normalize_scores(scores)

    return scores

def score_file(filename, raw_counts=False, scores=None, unique_words=None):
    """Read the whole file at filename and score its contents with score_text().

    The remaining parameters are forwarded unchanged to score_text(); the return
    value is whatever score_text() returns.
    """
    # Context manager so the handle is closed even if reading raises.
    with open(filename) as text_file:
        text = text_file.read()
    return score_text(text, raw_counts=raw_counts, scores=scores, unique_words=unique_words)

def normalize_scores(scores, bound_scores=True):
    """@summary: Converts counts to percentages of scores['Word Count']

    'Word Count', 'Sentences', 'Words Per Sentence' and 'Newlines' are copied through
    unchanged. Every other category becomes 100 * count / word count; when the word
    count is not positive, a positive count maps to 100.0 and anything else to 0.0.

    @param scores: Counter of raw counts; it is not modified.
    @param bound_scores: If True, converted values are clamped to [0, 100].
    @return: A new Counter with the converted values.
    """
    passthrough = ('Word Count', 'Sentences', 'Words Per Sentence', 'Newlines')
    converted = Counter()
    for name in scores:
        value = scores[name]
        if name in passthrough:
            converted[name] = value
            continue

        total_words = scores['Word Count']
        if total_words > 0:
            value = 100.0 * value / total_words
        else:
            value = 100.0 if value > 0 else 0.0

        if bound_scores:
            # Since certain categories can exceed word count
            value = min(100.0, max(0.0, value))
        converted[name] = value
    return converted

class Dictionary():
    """LIWC-style dictionary: maps words and word stems to category names and scores single tokens.

    The dictionary is read from a .dic file in which a block delimited by lines starting
    with '%' lists "number name" category definitions, followed by one line per word:
    the word (a trailing '*' marks a stem) and its category numbers. A category may also
    be conditional:
        (n1 n2 ...)A/B   - category A if the previous word scores in one of the categories
                           n1, n2, ..., otherwise category B
        <w1 w2 ...>A/B   - category A if the next word is one of w1, w2, ..., otherwise category B
    The "/B" part is optional; without it nothing is scored when the condition is false.
    """
    sentence_punctuation = {'.','?','!','\n'}
    _TYPE_BASIC = 'basic'
    _TYPE_PRE = 'pre'
    _TYPE_POST = 'post'

    def __init__(self, filename, use_long_category_names=True, internal_category_list=None):
        """Build the lookup tables and load the dictionary file.

        @param filename: Path of the .dic file to load.
        @param use_long_category_names: If True, short category names are translated to the
            long names listed in _liwc_categories.
        @param internal_category_list: Should be None or '2001' or '2007'. When None the
            number->name mapping is read from the file's '%' block; otherwise the built-in
            table for that year is used and the file's category block is skipped.
        Any exception raised while loading is re-raised after a hint is written to stderr.
        """
        self._stems = dict()#this is a prefix tree for the stems, the leaves are sets of categories
        self._lookup = defaultdict(dict) #word->type->????->{categories}
                                                    #type can be one of "basic", "pre", "post".
                                                    #basic leads to a set of categories,
                                                    #pre and post lead to a list of tuples of (conditions, if_true categories, if_false categories)
        self._ignored=set() #caches words that are searched for but not found, this favors processing over memory

        self._setup_category_lookup(internal_category_list, use_long_category_names)
        try:
            self.load_dictionary_file(filename, internal_category_list)
        except Exception:
            # str() so that a non-string path cannot make the hint itself fail
            sys.stderr.writelines(["Failed to load dictionary file: "+str(filename)+"\n",
                                   "Is the dictionary file correct?\n",
                                   "Does a % precede the category list?\n",
                                   "If there is no category list, did you set internal_category_list='2007' ?\n",
                                   "Hope this helps...\n"])
            raise


    # word, optional '*' stem marker, then the rest of the line (the category list)
    _dictionary_line_re =  re.compile(r'^(\w+)(\*?)\s*(.*)$')
    # one category: a plain number, or <next words>N[/M], or (previous-word categories)N[/M]
    _dictionary_line_categories_re = re.compile(r'(\d+|\<(\w+(\s+\w+)*)\>(\d+)(\/(\d+))?|\(\s*(\d+(\s+\d+)*)\s*\)(\d+)(\/(\d+))?)')
    def load_dictionary_file(self, filename, internal_category_list=None):
        """Parse the dictionary file and fill self._lookup / self._stems.

        Blank lines and lines starting with '#' are skipped. Lines starting with '%'
        toggle the category-definition block; its "number name" lines populate
        self._category_lookup only when internal_category_list is None.
        """
        category_mode = False
        # Context manager so the file handle is released once parsing finishes or fails.
        with open(filename) as dictionary_file:
            for line in dictionary_file:
                line = line.strip()

                if line=='' or line.startswith('#'):
                    continue
                if line.startswith('%'):
                    category_mode = not category_mode
                    continue

                if category_mode:
                    if internal_category_list is None:
                        number, category_name = line.split()
                        category_name = self._translate_category_name(category_name)
                        self._category_lookup[int(number)]=category_name
                    continue

                word, is_stem, all_word_categories = Dictionary._dictionary_line_re.match(line).groups()
                for category_group in Dictionary._dictionary_line_categories_re.findall(all_word_categories):
                    category = category_group[0]
                    if category == '00':
                        continue
                    elif category.isdigit():
                        if is_stem=='*':
                            self._add_stemmed(word, self._category_lookup[int(category)])
                        else:
                            if Dictionary._TYPE_BASIC not in self._lookup[word]:
                                self._lookup[word][Dictionary._TYPE_BASIC]=set()
                            self._lookup[word][Dictionary._TYPE_BASIC].add(self._category_lookup[int(category)])

                    elif '(' in category or '<' in category: #convoluted special cases lead to much of the complexity in this program
                        junk, post, junk, if_post, junk, if_not_post, pre, junk, if_pre, junk, if_not_pre = category_group
                        # Reset for every entry: the "/else" category is optional, and a value
                        # left over from an earlier entry must not leak into this one.
                        if_not_true = None
                        if pre != '':
                            entry_type = Dictionary._TYPE_PRE
                            conditions = sorted([self._category_lookup[int(number)] for number in pre.split()])
                            if_true = self._category_lookup[int(if_pre)]
                            if if_not_pre != '':
                                if_not_true = self._category_lookup[int(if_not_pre)]
                        elif post != '':
                            entry_type = Dictionary._TYPE_POST
                            conditions = sorted(post.lower().split())
                            if_true = self._category_lookup[int(if_post)]
                            if if_not_post != '':
                                if_not_true = self._category_lookup[int(if_not_post)]
                        else:
                            # The regex requires a non-empty condition, so this is not expected to happen.
                            continue

                        # Categories scored when the condition is false (possibly none).
                        if_not_true_set = set() if if_not_true is None else {if_not_true}

                        if entry_type not in self._lookup[word]:
                            self._lookup[word][entry_type]=list()

                        for other_conditions, other_if_set, other_if_not_set in self._lookup[word][entry_type]:
                            if str(other_conditions)==str(conditions): #a little costly on load means less on use
                                other_if_set.add(if_true)
                                other_if_not_set.update(if_not_true_set)
                                break
                        else: #for else means the for ended naturally
                            self._lookup[word][entry_type].append( (conditions, {if_true}, if_not_true_set) )

    def _translate_category_name(self, category_name):
        """Return the long name registered for category_name (case-insensitive), or category_name itself."""
        if category_name.lower() in self._category_name_lookup:
            return self._category_name_lookup[category_name.lower()]
        return category_name

    def _add_stemmed(self, word, category):
        """Insert the stem word into the prefix tree and add category to the set at its leaf."""
        current_node = self._stems
        for char in word[:-1]:
            if char not in current_node:
                current_node[char]=dict()
            current_node = current_node[char]
        if word[-1] not in current_node:
            current_node[word[-1]]=set()
        current_node = current_node[word[-1]]

        current_node.add(category)

    _pure_punctuation_re = re.compile('^['+re.escape(string.punctuation)+']+$')
    _punctuation_of_interest = {'?':'Question Marks', '!':'Exclamation Marks', '"':'Quote Marks',
                                ',':'Comma',':':'Colon',';':'Semicolon','-':'Dash','\'':'Apostrophe',
                                '(':'Parenthesis', ')':'Parenthesis', '{':'Parenthesis', '}':'Parenthesis', '[':'Parenthesis', ']':'Parenthesis' }
    def score_word(self, word, previous_word=None, next_word=None):
        """Score a single token and return the result as a Counter.

        - None yields an empty Counter; a token containing a newline adds 'Newlines'.
        - A token starting with a digit adds 'Word Count' and 'Numerals'.
        - A token made only of punctuation adds 'All Punctuation' plus one count per
          character (a named category from _punctuation_of_interest or 'Other Punctuation').
        - Anything else adds 'Word Count', 'Six Letter Words' when longer than six
          characters, the categories found for the word (exact entry first, otherwise the
          first stem that prefixes it), and 'Dictionary Words' when any entry was found.

        @param previous_word: Scored recursively to evaluate "pre" conditions.
        @param next_word: Compared (lower-cased) against "post" conditions.
        """
        scores = Counter()
        if word is None:
            return scores

        if '\n' in word:
            scores['Newlines']+=1

        word = word.strip().lower()

        if len(word)==0:
            pass
        elif word[0].isdigit():
            scores['Word Count']+=1
            scores['Numerals']+=1
        elif Dictionary._pure_punctuation_re.match(word):
            scores['All Punctuation']+=1
            for char in word:
                if char in Dictionary._punctuation_of_interest:
                    scores[Dictionary._punctuation_of_interest[char]]+=1
                else:
                    scores['Other Punctuation']+=1
        else:
            scores['Word Count']+=1
            if len(word) > 6:
                scores['Six Letter Words'] += 1
            if word not in self._ignored:
                if word in self._lookup:
                    for entry_type in self._lookup[word]:
                        if entry_type==Dictionary._TYPE_BASIC:
                            scores.update(self._lookup[word][entry_type])
                        else:
                            for conditions, if_set, if_not_set in self._lookup[word][entry_type]:
                                if ((entry_type==Dictionary._TYPE_PRE and not set(self.score_word(word=previous_word, next_word=word).keys()).isdisjoint(set(conditions))) or
                                    (entry_type==Dictionary._TYPE_POST and next_word is not None and next_word.lower() in conditions)):
                                    scores.update(if_set)
                                else:
                                    scores.update(if_not_set)
                else:
                    # Walk the stem tree one character at a time; reaching a set means a stem matched.
                    current_node = self._stems
                    for char in word:
                        if char in current_node:
                            current_node = current_node[char]
                            if isinstance(current_node, set):
                                if Dictionary._TYPE_BASIC not in self._lookup[word]:
                                    self._lookup[word][Dictionary._TYPE_BASIC]=set()
                                self._lookup[word][Dictionary._TYPE_BASIC].update(current_node) #add to main lookup for time efficiency
                                scores.update(self._lookup[word][Dictionary._TYPE_BASIC])
                                break
                        else:
                            self._ignored.add(word) #dead end
                            break
                    else:
                        self._ignored.add(word) #not found but didn't hit a dead end

                if word not in self._ignored: #Note this is "still not in"
                    scores['Dictionary Words']+=1
        return scores

    def _setup_category_lookup(self, internal_category_list, use_long_category_names):
        """Create the short->long name table and, for '2001'/'2007', the built-in number->name table."""
        self._category_name_lookup = dict()
        if use_long_category_names:
            for long_name, LIWC2007_number, LIWC2007_short, LIWC2001_number, LIWC2001_short in Dictionary._liwc_categories:
                if LIWC2001_short is not None:
                    self._category_name_lookup[LIWC2001_short]=long_name
                if LIWC2007_short is not None:
                    self._category_name_lookup[LIWC2007_short]=long_name

        self._category_lookup = dict()
        if internal_category_list is not None:
            for long_name, LIWC2007_number, LIWC2007_short, LIWC2001_number, LIWC2001_short in Dictionary._liwc_categories:
                if internal_category_list == '2001' and LIWC2001_number is not None:
                    self._category_lookup[LIWC2001_number]=self._translate_category_name(LIWC2001_short)
                if internal_category_list == '2007' and LIWC2007_number is not None:
                    self._category_lookup[LIWC2007_number]=self._translate_category_name(LIWC2007_short)

    #In case it is needed:
    #(long_name, LIWC2007_number, LIWC2007_short, LIWC2001_number, LIWC2001_short)
    # NOTE(review): ' Impersonal Pronouns' carries a leading space; it is kept as-is because
    # the name is used as a score key at runtime - confirm before normalizing it.
    _liwc_categories =  [
    ('Total Function Words',1,'funct',None,None),
    ('Total Pronouns',2,'pronoun',1,'pronoun'),
    ('Personal Pronouns',3,'ppron',None,None),
    ('First Person Singular',4,'i',2,'i'),
    ('First Person Plural',5,'we',3,'we'),
    ('Second Person',6,'you',5,'you'),
    ('Third Person Singular',7,'shehe',None,None),
    ('Third Person Plural',8,'they',None,None),
    (' Impersonal Pronouns',9,'ipron',None,None),
    ('Articles',10,'article',9,'article'),
    ('Common Verbs',11,'verb',None,None),
    ('Auxiliary Verbs',12,'auxverb',None,None),
    ('Past Tense',13,'past',38,'past'),
    ('Present Tense',14,'present',39,'present'),
    ('Future Tense',15,'future',40,'future'),
    ('Adverbs',16,'adverb',None,None),
    ('Prepositions',17,'preps',10,'preps'),
    ('Conjunctions',18,'conj',None,None),
    ('Negations',19,'negate',7,'negate'),
    ('Quantifiers',20,'quant',None,None),
    ('Number',21,'number',11,'number'),
    ('Swear Words',22,'swear',66,'swear'),
    ('Social Processes',121,'social',31,'social'),
    ('Family',122,'family',35,'family'),
    ('Friends',123,'friend',34,'friends'),
    ('Humans',124,'humans',36,'humans'),
    ('Affective Processes',125,'affect',12,'affect'),
    ('Positive Emotion',126,'posemo',13,'posemo'),
    ('Negative Emotion',127,'negemo',16,'negemo'),
    ('Anxiety',128,'anx',17,'anx'),
    ('Anger',129,'anger',18,'anger'),
    ('Sadness',130,'sad',19,'sad'),
    ('Cognitive Processes',131,'cogmech',20,'cogmech'),
    ('Insight',132,'insight',22,'insight'),
    ('Causation',133,'cause',21,'cause'),
    ('Discrepancy',134,'discrep',23,'discrep'),
    ('Tentative',135,'tentat',25,'tentat'),
    ('Certainty',136,'certain',26,'certain'),
    ('Inhibition',137,'inhib',24,'inhib'),
    ('Inclusive',138,'incl',44,'incl'),
    ('Exclusive',139,'excl',45,'excl'),
    ('Perceptual Processes',140,'percept',27,'senses'),
    ('See',141,'see',28,'see'),
    ('Hear',142,'hear',29,'hear'),
    ('Feel',143,'feel',30,'feel'),
    ('Biological Processes',146,'bio',None,None),
    ('Body',147,'body',61,'body'),
    ('Health',148,'health',None,None),
    ('Sexual',149,'sexual',62,'sexual'),
    ('Ingestion',150,'ingest',63,'eating'),
    ('Relativity',250,'relativ',None,None),
    ('Motion',251,'motion',46,'motion'),
    ('Space',252,'space',41,'space'),
    ('Time',253,'time',37,'time'),
    ('Work',354,'work',49,'job'),
    ('Achievement',355,'achieve',50,'achieve'),
    ('Leisure',356,'leisure',51,'leisure'),
    ('Home',357,'home',52,'home'),
    ('Money',358,'money',56,'money'),
    ('Religion',359,'relig',58,'relig'),
    ('Death',360,'death',59,'death'),
    ('Assent',462,'assent',8,'assent'),
    ('Nonfluencies',463,'nonfl',67,'nonfl'),
    ('Fillers',464,'filler',68,'fillers'),
    ('Total first person',None,None,4,'self'),
    ('Total third person',None,None,6,'other'),
    ('Positive feelings',None,None,14,'posfeel'),
    ('Optimism and energy',None,None,15,'optim'),
    ('Communication',None,None,32,'comm'),
    ('Other references to people',None,None,33,'othref'),
    ('Up',None,None,42,'up'),
    ('Down',None,None,43,'down'),
    ('Occupation',None,None,47,'occup'),
    ('School',None,None,48,'school'),
    ('Sports',None,None,53,'sports'),
    ('TV',None,None,54,'tv'),
    ('Music',None,None,55,'music'),
    ('Metaphysical issues',None,None,57,'metaph'),
    ('Physical states and functions',None,None,60,'physcal'),
    ('Sleeping',None,None,64,'sleep'),
    ('Grooming',None,None,65,'groom')]

# _dictionary_filename = os.path.abspath(os.path.join("./", 'data'))+'/LIWC2007.dic'
# Module-level dictionary instance used by score_text(); built from the LIWC2007.dic file
# in the "LIWC" subfolder of GROUP_FOLDER. The file is read when this cell runs.
_dictionary_filename = os.path.join(GROUP_FOLDER, "LIWC", "LIWC2007.dic")
_dictionary=Dictionary(_dictionary_filename)

In [17]:
# Testing LIWC
# Smoke test: score a short two-sentence sample. score_text() is called with its default
# raw_counts=False, so category values are percentages of the word count.
txt = "Does this work? I don't know."
print(txt)
print(score_text(txt))

Does this work? I don't know.
Counter({'Unique Words': 100.0, 'Dictionary Words': 33.333333333333336, 'All Punctuation': 33.333333333333336, 'Achievement': 16.666666666666668, 'Work': 16.666666666666668, 'Question Marks': 16.666666666666668, 'Insight': 16.666666666666668, 'Common Verbs': 16.666666666666668, 'Cognitive Processes': 16.666666666666668, 'Present Tense': 16.666666666666668, 'Other Punctuation': 16.666666666666668, 'Word Count': 6, 'Words Per Sentence': 3.0, 'Sentences': 2})


# Compile Data About Conversations
Here, I think it would be useful to construct new dataframes which contain useful information about each conversation (like the user text, system text, length of the conversation, etc...)

In [18]:
# Distinct conversation ids in the training frame. Going through set() removes duplicates
# but does not keep the row order of the frame.
conversation_ids = list(set(trainingFrame['conversation_id']))
print("There are {} conversations in this dataframe.".format(len(conversation_ids)))

There are 2926 conversations in this dataframe.


In [19]:
# Regex extraction functions
def extract_system_text(system_utter):
  """
  Return the system utterance with every <...> markup span removed.

  A span is a '<', any run of characters other than '<' and '>', and a closing '>'.
  Text outside the spans is kept exactly as it is (including surrounding whitespace);
  an utterance without any span is returned unchanged.
  """
  markup_span = r'\<[^<>]*\>'
  # Substituting each span with nothing keeps only the text between the spans
  return re.sub(markup_span, "", system_utter)

def extract_midas_labels(midas_string):
  """
  Convert the string form of a label list (e.g. "['comment', 'opinion']") into a list of strings.

  Every double-quoted or single-quoted span in the string yields one entry. The content of a
  double-quoted span is used when it contains a word character; in every other case the
  single-quoted capture is used.
  """
  quoted_span = r"\"([^\"]*)\"|'([^']*)'"
  labels = []
  for double_quoted, single_quoted in re.findall(quoted_span, midas_string):
    if re.search(r"\w", double_quoted):
      labels.append(double_quoted)
    else:
      labels.append(single_quoted)
  return labels

In [20]:
# Sample inputs for extract_midas_labels: single-quoted labels, a mix of single- and
# double-quoted labels, and double-quoted labels that contain apostrophes.
tests = ["['hello', 'there']", "['hello', \"they're\"]", "[\"how's\", \"they're\"]", "[\"www'''w'w'w'w'\", \"asdf''asdfsadf'''asdf\"]"]

In [21]:
# Print the parsed label list for every sample so the results can be checked by eye.
for test in tests:
  print(extract_midas_labels(test))

['hello', 'there']
['hello', "they're"]
["how's", "they're"]
["www'''w'w'w'w'", "asdf''asdfsadf'''asdf"]


In [22]:
# Extract and compile info about each conversation
def compile_info(dataframe, rating, min_length):
  """
  Iterate through the dataframe and compile information about each conversation.
  A conversation is a run of consecutive rows sharing the same conversation_id.
  This information includes:
    The conversation ID
    The rating (for later classification tasks)
    The date taken from the last row of the conversation
    The length of the conversation (number of rows whose turn_count is not NaN)
    The user utterances
    The system utterances (with <...> interjections removed)
    A distribution of the topics that appeared
    A distribution of the response generators that were used by the system
    The sum of the is_profane values over the conversation's rows
    A list of midas tag lists, one per row (empty list when a row has no midas string)
    A distribution of the midas tags over the whole conversation

  Rows are read by position, so the dataframe's index does not need to be 0..n-1.

  Params:
    dataframe : pandas DataFrame of a logfile
    rating : int, the rating of the conversations which appear in the dataframe
    min_length : int, the minimum number of turns allowed
  Returns:
    A new pandas DataFrame containing the above information. Each row is one conversation.
    Conversations shorter than min_length are left out; an empty input gives an empty frame.
  """
  columns = ['conversation_id', 'rating', 'date', 'conversation_length', 'user_text', 'system_text', 'topic_dist', 'rg_dist', 'profanities', 'midas', 'midas_dist']
  # compiled_info is a list of rows, each in the form of a list
  compiled_info = []
  num_rows = len(dataframe)
  # An empty logfile contains no conversations (and has no first row to read)
  if num_rows == 0:
    return pd.DataFrame(compiled_info, columns=columns)

  # Look each column up once; values are read with .iloc so that the row
  # position is used rather than the index label
  conv_ids = dataframe['conversation_id']
  dates = dataframe['date']
  turn_counts = dataframe['turn_count']
  midas_strings = dataframe['midas']
  profane_flags = dataframe['is_profane']
  user_utters = dataframe['text']
  system_utters = dataframe['response']
  topics = dataframe['current_topic']
  rgs = dataframe['chosen_rg']

  current_conv_id = conv_ids.iloc[0]
  # Objects to hold information about the current conversation
  user_text = []
  system_text = []
  conv_topic_dist = nltk.FreqDist()
  conv_rg_dist = nltk.FreqDist()
  conv_len = 0
  profanities = 0
  midas = []

  def save_conversation(last_row):
    # Store the current conversation if it is long enough.
    # last_row is the position of the conversation's final row (used for the date).
    # The conversation variables are read from the enclosing scope at call time.
    if conv_len >= min_length:
      midas_dist = nltk.FreqDist()
      for tag_list in midas:
        for tag in tag_list:
          midas_dist[tag] += 1
      compiled_info.append([current_conv_id, rating, dates.iloc[last_row], conv_len, user_text, system_text, conv_topic_dist, conv_rg_dist, profanities, midas, midas_dist])

  for i in range(num_rows):
    row_conv_id = conv_ids.iloc[i]
    # If the conversation id changes, we have a new conversation beginning
    if not row_conv_id == current_conv_id:
      save_conversation(i - 1)
      # Reset the conversation
      user_text = []
      system_text = []
      midas = []
      conv_len = 0
      conv_topic_dist = nltk.FreqDist()
      conv_rg_dist = nltk.FreqDist()
      profanities = 0
      # Update current conversation id
      current_conv_id = row_conv_id
    # Compile info for each row in a conversation
    # If we have a non-terminal row, add one to the conversation length
    if not np.isnan(turn_counts.iloc[i]):
      conv_len += 1
    # If there is a valid midas string, convert it to a list and append
    midas_value = midas_strings.iloc[i]
    if type(midas_value) is str:
      midas.append(extract_midas_labels(midas_value))
    else:
      midas.append([])
    # Sum the number of profanities so far
    profanities += profane_flags.iloc[i]
    # If the user had a valid utterance, append it
    user_utter = user_utters.iloc[i]
    if type(user_utter) is str:
      user_text.append(user_utter)
    # If the system had a valid utterance, append the extracted text
    # (otherwise the box is empty, e.g. at the end of a conversation)
    system_utter = system_utters.iloc[i]
    if type(system_utter) is str:
      system_text.append(extract_system_text(system_utter))
    # If the current topic is not blank, add it to the distribution
    topic = topics.iloc[i]
    if type(topic) is str:
      conv_topic_dist[topic] += 1
    # If the current response generator is not blank, add it to the distribution
    chosen_rg = rgs.iloc[i]
    if type(chosen_rg) is str:
      conv_rg_dist[chosen_rg] += 1
  # Add info for last conversation (current id doesn't change at the end of the file, but we still want the conversation info)
  save_conversation(num_rows - 1)

  # Construct new dataframe from the info (each list in compiled_info becomes a row)
  new_frame = pd.DataFrame(compiled_info, columns=columns)
  return new_frame

In [23]:
# Testing compile info
new_training_frame = compile_info(trainingFrame, rating=1, min_length=7)
new_training_frame

Unnamed: 0,conversation_id,rating,date,conversation_length,user_text,system_text,topic_dist,rg_dist,profanities,midas,midas_dist
0,560597369048bdb615f7d10490e47b44f8e64f310f847a...,1,2021-02-19,35,"[let's chat, good horrible, no, no, nothing, n...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 9, 'nature': 1, 'comic_books'...","{'INTRODUCTION': 9, 'DM_GENERATOR': 8, 'BOARD_...",6,"[[], [comment, back-channeling, opinion], [neg...","{'comment': 8, 'back-channeling': 2, 'opinion'..."
1,0a43a9222e69d77b17f0683796dcf5f31c1b96ee67fcd1...,1,2021-06-04,87,"[you can talk to me now, good, no i don't know...","[Hi, this is an Alexa Prize Socialbot. I hope ...","{'introduction': 19, 'nature': 10, 'video_game...","{'INTRODUCTION': 19, 'DM_GENERATOR': 2, 'NATUR...",2,"[[], [back-channeling], [other_answers, compla...","{'back-channeling': 11, 'other_answers': 9, 'c..."
2,ae40e36f348f78e747b624eea6b360182069fa7b81f993...,1,2021-06-05,13,"[let's chat, i'm exhausted, jack, that's corre...","[Hi, this is an Alexa Prize Socialbot. I hope ...","{'introduction': 9, 'animals': 4}","{'INTRODUCTION': 9, 'ANIMALS': 4}",0,"[[], [statement], [statement], [comment], [opi...","{'statement': 2, 'comment': 1, 'opinion': 2, '..."
3,d5a4446987ad97a9679d2fe8ebb1c1ff2fe0f91425de36...,1,2021-03-05,8,"[can you talk to me, good, kindle, yes, i hate...","[Hi, this is an Alexa Prize Socialbot. How are...",{'introduction': 8},"{'INTRODUCTION': 7, 'EVI': 1}",1,"[[], [back-channeling], [statement], [pos_answ...","{'back-channeling': 1, 'statement': 2, 'pos_an..."
4,5342e235fda33364c57908d55dc544d0e95363b00474c5...,1,2021-05-15,11,[what do you think about elon musk conquering ...,"[Hi, this is an Alexa Prize Socialbot. How are...",{'introduction': 11},{'INTRODUCTION': 11},0,"[[], [comment], [opinion], [back-channeling], ...","{'comment': 1, 'opinion': 3, 'back-channeling'..."
...,...,...,...,...,...,...,...,...,...,...,...
1960,39d82fbf4cdb5677f783d7a6107bd1f789062150d27fb4...,1,2021-04-15,9,"[can we have a conversation, good how are you,...","[Hi, this is an Alexa Prize Socialbot. How are...",{'introduction': 9},"{'INTRODUCTION': 4, 'SB_INDEX': 5}",0,"[[], [open_question_factual, back-channeling, ...","{'open_question_factual': 6, 'back-channeling'..."
1961,58e2e357a39955f9a8fea499962931f8f951d90e5e5c11...,1,2021-04-25,28,"[have a conversation with me, horrible, green,...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 16, 'movies': 12}","{'INTRODUCTION': 16, 'DM_GENERATOR': 1, 'CENTE...",0,"[[], [opinion], [statement], [neg_answer], [ne...","{'opinion': 2, 'statement': 10, 'neg_answer': ..."
1962,99762e7ed366b94e50ed6e60a01586cf40377644846389...,1,2021-02-09,32,"[can we have a conversation, good, no, illinoi...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 8, 'nature': 12, 'sports': 5,...","{'INTRODUCTION': 8, 'DM_GENERATOR': 2, 'EVI': ...",0,"[[], [back-channeling], [neg_answer], [stateme...","{'back-channeling': 9, 'neg_answer': 2, 'state..."
1963,1b2ca9b559ca9ad1ebba75f8a6ea7f5f097acc0923e38b...,1,2021-02-13,12,"[let's chat, not good, no, i don't have a job,...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 7, 'food': 1, 'board_games': ...","{'INTRODUCTION': 7, 'DM_GENERATOR': 4, 'NATURE...",1,"[[], [comment], [neg_answer], [statement], [co...","{'comment': 2, 'neg_answer': 6, 'statement': 3..."


### Repeat user testing

In [24]:
# Try to find repeat users
found = []
for i in range(len(new_training_frame)):
  for utter in new_training_frame['system_text'][i]:
    if re.search(r'talk to you again', utter):
      #print("Repeat at index " + str(i))
      found.append(i)

In [25]:
# Show the first 15 recorded conversation indices
print(found[:15])

[20, 67, 71, 75, 96, 96, 135, 139, 157, 197, 205, 210, 214, 214, 256]


In [26]:
# Try to find repeat users
found2 = []
for i in range(len(new_training_frame)):
  for utter in new_training_frame['system_text'][i]:
    if re.search(r'chat again', utter):
      #print("Repeat at index " + str(i))
      found2.append(i)

In [27]:
# Show the first 15 recorded conversation indices for the second phrase
print(found2[:15])

[20, 36, 67, 73, 126, 135, 161, 171, 197, 256, 257, 288, 298, 335, 380]


In [28]:
# Compare the two searches. The first line prints the raw list lengths
# (repeated indices are counted each time); the second prints how many
# distinct conversation indices appear in both lists
print(len(found), len(found2))
print("Agree: {}".format(len(set(found).intersection(set(found2)))))

114 76
Agree: 28


In [29]:
# Inspect the system utterances of the conversation at index 90
new_training_frame['system_text'][90]

["Hi, this is an Alexa Prize Socialbot. How's it going.",
 "Okay, can't wait to get to know a thing or two about you. I'm not sure we've been acquainted. What do you go by?",
 "Okay, I see. It's great to make your acquaintance! Have you been doing things that you haven't been able to do during lockdown?",
 "That's totally understandable. If you could go out and safely do anything, is there something at the top of your list?",
 "Right. I'd love to know what you think. I've heard from a lot of people that the two things they're looking forward to most is going out to eat at restaurants and going out to see a movie. Which of those two are you most excited about?",
 "Oh, I see. I'm not sure what I would choose personally. Since I can't eat anything myself, I think I'd be more excited about going to the movies. Anyhow, that's just my opinion, let's move on. The world seems to have changed a lot over the past year. Are you one of the lucky people who get to work from home and stay out of har

### Midas testing

In [30]:
# For the first conversation, compare the number of user utterances with the
# number of midas label lists
print(len(new_training_frame['user_text'][0]), len(new_training_frame['midas'][0]))

36 36


In [31]:
# Print each user utterance of the first conversation next to the midas
# label list at the same position (zip stops at the shorter of the two lists)
for text, midas_list in zip(new_training_frame['user_text'][0], new_training_frame['midas'][0]):
  print(text, midas_list)

let's chat []
good horrible ['comment', 'back-channeling', 'opinion']
no ['neg_answer']
no ['neg_answer']
nothing ['other_answers']
no ['neg_answer']
none ['other_answers']
no ['neg_answer']
nothing ['other_answers']
yes ['pos_answer']
no ['neg_answer']
no ['neg_answer']
no ['neg_answer']
yes ['pos_answer']
no one ['other_answers', 'neg_answer', 'abandon']
i don't like board games ['opinion']
i hate dinosaurs ['opinion']
okay ['back-channeling']
yes rex ['opinion', 'pos_answer', 'opinion']
no ['neg_answer']
look up ['abandon']
that is so stupid ['comment']
turn on me ['command']
that sucks ['comment']
oh are dumb ['comment', 'hold', 'comment']
repeat ['command']
i think they do you were so dumb they fall of a cliff and they dying ['opinion', 'opinion', 'comment', 'statement', 'statement']
the last dangerous ['opinion']
that that's stupid why would they believe that they fell off a cliff ['comment', 'comment', 'open_question_opinion']
i love you wanna ['opinion']
no ['neg_answer']
there

### Midas distribution

In [32]:
# Count how often each midas label occurs across the whole frame:
# i walks the conversations, j the per-row label lists of a conversation,
# and k the individual labels inside one list
midas_dist = nltk.FreqDist()
for i in range(len(new_training_frame)):
  midas_list = new_training_frame['midas'][i]
  for j in range(len(midas_list)):
    for k in range(len(midas_list[j])):
      midas_dist[midas_list[j][k]] += 1

In [33]:
# List up to 1000 labels with their counts, most frequent first
midas_dist.most_common(1000)

[('statement', 13430),
 ('opinion', 8464),
 ('neg_answer', 5903),
 ('abandon', 4531),
 ('back-channeling', 3949),
 ('pos_answer', 3558),
 ('command', 3034),
 ('open_question_factual', 2891),
 ('closing', 2854),
 ('comment', 2498),
 ('yes_no_question', 2282),
 ('other_answers', 1816),
 ('hold', 1071),
 ('open_question_opinion', 876),
 ('complaint', 492),
 ('thanking', 282),
 ('apology', 16)]

### Combine two ratings together

In [34]:
def combine(df1, df2):
  """
  Stack two dataframes on top of each other.

  Params:
    df1 : the first dataframe you wish to combine (e.g. rating 1 dataframe)
    df2 : the second dataframe you wish to combine (e.g. rating 2 dataframe)
  Returns:
    A single dataframe holding the rows of df1 followed by the rows of df2.
    The original indices are discarded and the result is renumbered from 0.
  """
  frames = [df1, df2]
  combined = pd.concat(frames, ignore_index=True)
  return combined

# Text Preprocessing
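Precompute information that is needed for feature extraction but is slow to compute (tokens, token counts, LIWC scores). Doing this computation once and storing the results on the dataframe saves time during feature extraction.\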
\
Note: System texts now include stopwords and punctuation.

In [35]:
def tokenize_frame_texts(dataframe, normalize=True):
  """
  Tokenize the user and system texts of every conversation and store the
  tokens on the dataframe as the columns 'user_tokens' and 'system_tokens'.

  User text: when normalize is True each utterance is split on whitespace and
  only tokens that contain a word character and are not English stopwords are
  kept; when normalize is False nltk.word_tokenize is used and nothing is
  filtered out.
  System text: always tokenized with nltk.word_tokenize, keeping stopwords
  and punctuation regardless of normalize.

  Note: the dataframe is modified in place; nothing is returned.
  """
  stopwords = set(nltk.corpus.stopwords.words('english'))

  def tokenize_user_utterance(utterance):
    # Tokens of a single user utterance, filtered only when normalizing
    if not normalize:
      return nltk.word_tokenize(utterance)
    return [tok for tok in re.split(r"\s", utterance) if re.search(r'\w', tok) and not tok.lower() in stopwords]

  total_user_tokens = []
  total_system_tokens = []
  for row in range(len(dataframe)):
    user_utterances = dataframe['user_text'][row]
    system_utterances = dataframe['system_text'][row]
    # Flatten the per-utterance tokens into one list per conversation
    conversation_user_tokens = []
    for utterance in user_utterances:
      conversation_user_tokens.extend(tokenize_user_utterance(utterance))
    conversation_system_tokens = []
    for utterance in system_utterances:
      conversation_system_tokens.extend(nltk.word_tokenize(utterance))
    total_user_tokens.append(conversation_user_tokens)
    total_system_tokens.append(conversation_system_tokens)
  # Add to dataframe
  dataframe['user_tokens'] = total_user_tokens
  dataframe['system_tokens'] = total_system_tokens

def sum_frame_tokens(dataframe):
  """
  Add a 'user_token_count' column holding, for each conversation, the number
  of entries in its 'user_tokens' list.

  The dataframe must already have a 'user_tokens' column (as added by
  tokenize_frame_texts). It is modified in place; nothing is returned.
  """
  token_lists = dataframe['user_tokens']
  dataframe['user_token_count'] = [len(token_lists[row]) for row in range(len(dataframe))]

def sum_system_turns(dataframe, normalize=True):
  """
  Add a 'system_turn_lengths' column holding, for each conversation, a list
  with the token count of every system utterance.

  Each utterance in 'system_text' is tokenized with nltk.word_tokenize.
  If normalize is True, only tokens containing at least one word character
  are counted (this drops pure-punctuation tokens and is a more accurate
  word count). If normalize is False, every token is counted.

  The dataframe is modified in place; nothing is returned.
  """
  system_turn_lengths = []
  for i in range(len(dataframe)):
    turn_lengths = []
    for utter in dataframe['system_text'][i]:
      tokens = nltk.word_tokenize(utter)
      if normalize:
        # Keep only tokens that look like words
        tokens = [tok for tok in tokens if re.search(r'\w', tok)]
      turn_lengths.append(len(tokens))
    system_turn_lengths.append(turn_lengths)
  # Add to dataframe
  dataframe['system_turn_lengths'] = system_turn_lengths

def add_profanity_counts(dataframe):
  """
  Add a 'profanity_count' column to the dataframe.

  For each conversation, get_profanity_tagger is called on the list of user
  utterances and the values it returns are summed.
  (presumably it returns one profane/not-profane flag per utterance; verify
  against its definition)

  The dataframe is modified in place; nothing is returned.
  """
  dataframe['profanity_count'] = [
    sum(get_profanity_tagger(dataframe['user_text'][row]))
    for row in range(len(dataframe))
  ]

def get_liwc_scores(dataframe):
  """
  Add a 'LIWC' column holding the liwc scores of each conversation's user text.

  The user utterances of a conversation are joined with ". " into a single
  string, which is passed to score_text; whatever score_text returns is
  stored for that conversation.

  The dataframe is modified in place; nothing is returned.
  """
  dataframe['LIWC'] = [
    score_text(". ".join(dataframe['user_text'][row]))
    for row in range(len(dataframe))
  ]

In [36]:
# Run the preprocessing steps on the demo frame. Each call adds its
# column(s) to new_training_frame in place; the last line displays the result
tokenize_frame_texts(new_training_frame, normalize=True)
sum_frame_tokens(new_training_frame)
#sum_system_turns(new_training_frame)
get_liwc_scores(new_training_frame)
#add_profanity_counts(dataframe)
new_training_frame

Unnamed: 0,conversation_id,rating,date,conversation_length,user_text,system_text,topic_dist,rg_dist,profanities,midas,midas_dist,user_tokens,system_tokens,user_token_count,LIWC
0,560597369048bdb615f7d10490e47b44f8e64f310f847a...,1,2021-02-19,35,"[let's chat, good horrible, no, no, nothing, n...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 9, 'nature': 1, 'comic_books'...","{'INTRODUCTION': 9, 'DM_GENERATOR': 8, 'BOARD_...",6,"[[], [comment, back-channeling, opinion], [neg...","{'comment': 8, 'back-channeling': 2, 'opinion'...","[let's, chat, good, horrible, nothing, none, n...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",43,"{'Word Count': 89, 'All Punctuation': 39.32584..."
1,0a43a9222e69d77b17f0683796dcf5f31c1b96ee67fcd1...,1,2021-06-04,87,"[you can talk to me now, good, no i don't know...","[Hi, this is an Alexa Prize Socialbot. I hope ...","{'introduction': 19, 'nature': 10, 'video_game...","{'INTRODUCTION': 19, 'DM_GENERATOR': 2, 'NATUR...",2,"[[], [back-channeling], [other_answers, compla...","{'back-channeling': 11, 'other_answers': 9, 'c...","[talk, good, know, nothing, like, person, want...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",224,"{'Word Count': 435, 'Social Processes': 4.5977..."
2,ae40e36f348f78e747b624eea6b360182069fa7b81f993...,1,2021-06-05,13,"[let's chat, i'm exhausted, jack, that's corre...","[Hi, this is an Alexa Prize Socialbot. I hope ...","{'introduction': 9, 'animals': 4}","{'INTRODUCTION': 9, 'ANIMALS': 4}",0,"[[], [statement], [statement], [comment], [opi...","{'statement': 2, 'comment': 1, 'opinion': 2, '...","[let's, chat, i'm, exhausted, jack, that's, co...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",22,"{'Word Count': 38, 'All Punctuation': 34.21052..."
3,d5a4446987ad97a9679d2fe8ebb1c1ff2fe0f91425de36...,1,2021-03-05,8,"[can you talk to me, good, kindle, yes, i hate...","[Hi, this is an Alexa Prize Socialbot. How are...",{'introduction': 8},"{'INTRODUCTION': 7, 'EVI': 1}",1,"[[], [back-channeling], [statement], [pos_answ...","{'back-channeling': 1, 'statement': 2, 'pos_an...","[talk, good, kindle, yes, hate, working, remot...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",20,"{'Word Count': 39, 'Social Processes': 2.56410..."
4,5342e235fda33364c57908d55dc544d0e95363b00474c5...,1,2021-05-15,11,[what do you think about elon musk conquering ...,"[Hi, this is an Alexa Prize Socialbot. How are...",{'introduction': 11},{'INTRODUCTION': 11},0,"[[], [comment], [opinion], [back-channeling], ...","{'comment': 1, 'opinion': 3, 'back-channeling'...","[think, elon, musk, conquering, mass, good, i'...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",18,"{'Word Count': 32, 'Insight': 3.125, 'Common V..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1960,39d82fbf4cdb5677f783d7a6107bd1f789062150d27fb4...,1,2021-04-15,9,"[can we have a conversation, good how are you,...","[Hi, this is an Alexa Prize Socialbot. How are...",{'introduction': 9},"{'INTRODUCTION': 4, 'SB_INDEX': 5}",0,"[[], [open_question_factual, back-channeling, ...","{'open_question_factual': 6, 'back-channeling'...","[conversation, good, wanna, talk, free, time, ...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",13,"{'Word Count': 42, 'Six Letter Words': 2.38095..."
1961,58e2e357a39955f9a8fea499962931f8f951d90e5e5c11...,1,2021-04-25,28,"[have a conversation with me, horrible, green,...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 16, 'movies': 12}","{'INTRODUCTION': 16, 'DM_GENERATOR': 1, 'CENTE...",0,"[[], [opinion], [statement], [neg_answer], [ne...","{'opinion': 2, 'statement': 10, 'neg_answer': ...","[conversation, horrible, green, we're, already...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",43,"{'Word Count': 100, 'Six Letter Words': 10.0, ..."
1962,99762e7ed366b94e50ed6e60a01586cf40377644846389...,1,2021-02-09,32,"[can we have a conversation, good, no, illinoi...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 8, 'nature': 12, 'sports': 5,...","{'INTRODUCTION': 8, 'DM_GENERATOR': 2, 'EVI': ...",0,"[[], [back-channeling], [neg_answer], [stateme...","{'back-channeling': 9, 'neg_answer': 2, 'state...","[conversation, good, illinois, cousin, lives, ...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",83,"{'Word Count': 179, 'Six Letter Words': 10.614..."
1963,1b2ca9b559ca9ad1ebba75f8a6ea7f5f097acc0923e38b...,1,2021-02-13,12,"[let's chat, not good, no, i don't have a job,...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 7, 'food': 1, 'board_games': ...","{'INTRODUCTION': 7, 'DM_GENERATOR': 4, 'NATURE...",1,"[[], [comment], [neg_answer], [statement], [co...","{'comment': 2, 'neg_answer': 6, 'statement': 3...","[let's, chat, good, job, we've, already, talke...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",18,"{'Word Count': 40, 'All Punctuation': 30.0, 'O..."


# Feature Extraction
Given some dataframe compiling information about each conversation, construct features to be used in a machine learning algorithm.\
Possible features:
1. Conversation length
2. User utterances
3. System utterances
4. Topics that appear in the conversation
5. Date

In [37]:
# Feature extraction functions

def nearest_multiple(input, factor):
  """
  Return the multiple of factor that is nearest to the input.
  Useful for binning values.

  When the input lies exactly halfway between two multiples, the larger
  multiple is returned (e.g. 15 with factor 10 gives 20).

  Params:
    input : number to bin (may be negative)
    factor : positive number whose multiples form the bins
  Returns:
    factor multiplied by an integer
  Raises:
    ValueError : if factor is not positive
  """
  if factor <= 0:
    raise ValueError("factor must be positive")
  # Index of the largest multiple that does not exceed the input;
  # floor division gives it directly, for negative inputs as well
  lower_index = int(input // factor)
  lower = factor*lower_index
  upper = factor*(lower_index + 1)
  # Pick whichever neighbour is closer; ties go to the upper multiple
  if upper - input > input - lower:
    return lower
  else:
    return upper
  
def get_ngram_features(tokens, label, n=1, bin=3, lowercase=True):
  """
  Build count features for the n-grams of a token list.

  Params:
    tokens : list of string tokens
    label : string prefix for the feature names (e.g. 'USER' or 'SYSTEM')
    n : size of the n-grams
    bin : cap on the counts; any count that is not below bin is stored as bin
    lowercase : if True, tokens are lowercased before n-grams are formed
  Returns:
    dict mapping '<label>_<n>-GRAM_<tokens joined by "_">' to the capped count.
    An empty dict is returned when there are fewer than n tokens.
  """
  if lowercase:
    tokens = [tok.lower() for tok in tokens]
  # Not enough tokens to form a single n-gram
  if len(tokens) < n:
    return {}
  ngram_counts = nltk.FreqDist()
  for gram in nltk.ngrams(tokens, n=n):
    ngram_counts["_".join(gram)] += 1
  # Turn the counts into named features, capping each one at bin
  prefix = label +"_"+ str(n)+"-GRAM_"
  feature_vector = {}
  for joined_gram, occurrences in ngram_counts.items():
    feature_vector[prefix + joined_gram] = occurrences if occurrences < bin else bin
  return feature_vector

def get_topic_features(topic_dist, bin, freq=False, desired_topics=None, include_total=True):
  """
  Build features from the distribution of topics in a conversation.

  Params:
    topic_dist : frequency distribution of topics (must support .freq() and .B())
    bin : cap on the counts; any count that is not below bin is stored as bin
      (ignored when freq is True)
    freq : if True, emit 'TOPIC_FREQ_<topic>' with the topic's relative frequency
      rounded to 3 places; otherwise emit 'TOPIC_<topic>' with the capped count
    desired_topics : if given (and non-empty), only these topics produce features
    include_total : if True, also emit 'TOPIC_TOTAL' set to topic_dist.B()
  Returns:
    dict of feature name -> value
  """
  selected = [topic for topic in topic_dist if not desired_topics or topic in desired_topics]
  if freq:
    topic_vector = {"TOPIC_FREQ_" + topic: round(topic_dist.freq(topic), 3) for topic in selected}
  else:
    topic_vector = {}
    for topic in selected:
      occurrences = topic_dist[topic]
      if occurrences < bin:
        topic_vector["TOPIC_" + topic] = occurrences
      else:
        topic_vector["TOPIC_" + topic] = bin
  if include_total:
    topic_vector['TOPIC_TOTAL'] = topic_dist.B()
  return topic_vector

def get_topic_metrics(topic_dist, metrics=["mean", "std", "max", "min"]):
  """
  Summarize how many utterances fell within each topic of a conversation.

  Params:
    topic_dist : mapping of topic -> count
    metrics : which statistics of the counts to compute; any of
      "mean", "std", "max", "min" (other names are ignored)
  Returns:
    dict with the requested entries among 'TOPIC_DIST_MEAN', 'TOPIC_DIST_STD',
    'TOPIC_DIST_MAX' and 'TOPIC_DIST_MIN'
  """
  # metric name -> (feature name, numpy reduction)
  reducers = {
    'mean': ('TOPIC_DIST_MEAN', np.mean),
    'std': ('TOPIC_DIST_STD', np.std),
    'max': ('TOPIC_DIST_MAX', np.max),
    'min': ('TOPIC_DIST_MIN', np.min),
  }
  topic_values = list(topic_dist.values())
  feature_vector = {}
  for metric in metrics:
    if metric in reducers:
      feature_name, reduce_values = reducers[metric]
      feature_vector[feature_name] = reduce_values(topic_values)
  return feature_vector

def get_midas_features(midas_dist, bin, freq=False, desired_midas=None):
  """
  Build features from the distribution of midas tags in a conversation.

  Params:
    midas_dist : frequency distribution of midas tags (must support .freq())
    bin : cap on the counts; any count that is not below bin is stored as bin
      (ignored when freq is True)
    freq : if True, emit 'MIDAS_FREQ_<tag>' with the tag's relative frequency
      rounded to 3 places; otherwise emit 'MIDAS_<tag>' with the capped count
    desired_midas : if given (and non-empty), only these tags produce features
  Returns:
    dict of feature name -> value
  """
  feature_vector = {}
  for tag in midas_dist:
    if not desired_midas or tag in desired_midas:
      if freq:
        feature_vector["MIDAS_FREQ_" + tag] = round(midas_dist.freq(tag), 3)
      elif midas_dist[tag] < bin:
        feature_vector["MIDAS_" + tag] = midas_dist[tag]
      else:
        feature_vector["MIDAS_" + tag] = bin
  return feature_vector

def get_liwc_features(liwc_dict, categories=None):
  """
  Turn a dict of liwc scores into features.

  Params:
    liwc_dict : mapping of liwc category -> score
    categories : if given (and non-empty), only these categories are used;
      otherwise every category in liwc_dict is used
  Returns:
    dict mapping 'LIWC_<category>' to the score rounded to 1 decimal place
  """
  return {
    'LIWC_' + category: round(liwc_dict[category], 1)
    for category in liwc_dict
    if not categories or category in categories
  }

def get_rg_features(rg_dist, bin, freq=False, desired_rgs=None):
  """
  Build features from the distribution of response generators in a conversation.

  Params:
    rg_dist : frequency distribution of response generators (must support .freq())
    bin : cap on the counts; any count that is not below bin is stored as bin
      (ignored when freq is True)
    freq : if True, emit 'RG_FREQ_<rg>' with the generator's relative frequency
      rounded to 3 places; otherwise emit 'RG_<rg>' with the capped count
    desired_rgs : if None (or empty), all generators are used; otherwise only
      the generators which appear in desired_rgs
  Returns:
    dict of feature name -> value
  """
  if freq:
    prefix = "RG_FREQ_"
    value_of = lambda rg: round(rg_dist.freq(rg), 3)
  else:
    prefix = "RG_"
    value_of = lambda rg: rg_dist[rg] if rg_dist[rg] < bin else bin
  return {
    prefix + rg: value_of(rg)
    for rg in rg_dist
    if not desired_rgs or rg in desired_rgs
  }

def get_system_text_metrics(system_text, metrics=["mean", "std", "max", "min"]):
  """
  Summarize the lengths of the system utterances in a conversation.

  The length of an utterance is the number of its nltk.word_tokenize tokens
  that contain at least one word character.

  Params:
    system_text : list of system utterance strings
    metrics : which statistics of the lengths to compute; any of
      "mean", "std", "max", "min" (other names are ignored)
  Returns:
    dict with the requested entries among 'SYSTEM_UTTER_MEAN',
    'SYSTEM_UTTER_STD', 'SYSTEM_UTTER_MAX' and 'SYSTEM_UTTER_MIN'
  """
  # metric name -> (feature name, numpy reduction)
  reducers = {
    'mean': ('SYSTEM_UTTER_MEAN', np.mean),
    'std': ('SYSTEM_UTTER_STD', np.std),
    'max': ('SYSTEM_UTTER_MAX', np.max),
    'min': ('SYSTEM_UTTER_MIN', np.min),
  }
  utter_lengths = [
    len([tok for tok in nltk.word_tokenize(utter) if re.search(r'\w', tok)])
    for utter in system_text
  ]
  feature_vector = {}
  for metric in metrics:
    if metric in reducers:
      feature_name, reduce_lengths = reducers[metric]
      feature_vector[feature_name] = reduce_lengths(utter_lengths)
  return feature_vector

def get_user_text_metrics(user_text, metrics=["mean", "std", "max", "min"]):
  """
  Summarize the lengths of the user utterances in a conversation.

  The length of an utterance is its number of whitespace-separated words.
  Runs of whitespace and leading/trailing whitespace do not add words, and an
  empty utterance has length 0.

  Params:
    user_text : list of user utterance strings
    metrics : which statistics of the lengths to compute; any of
      "mean", "std", "max", "min" (other names are ignored)
  Returns:
    dict with the requested entries among 'USER_UTTER_MEAN', 'USER_UTTER_STD',
    'USER_UTTER_MAX' and 'USER_UTTER_MIN'
  """
  feature_vector = {}
  # str.split() with no separator never yields empty strings, unlike
  # splitting on every single whitespace character
  utter_lengths = [len(utter.split()) for utter in user_text]
  for metric in metrics:
    if metric == 'mean':
      feature_vector['USER_UTTER_MEAN'] = np.mean(utter_lengths)
    elif metric == 'std':
      feature_vector['USER_UTTER_STD'] = np.std(utter_lengths)
    elif metric == 'max':
      feature_vector['USER_UTTER_MAX'] = np.max(utter_lengths)
    elif metric == 'min':
      feature_vector['USER_UTTER_MIN'] = np.min(utter_lengths)
  return feature_vector

In [38]:
# Inspect the user utterances of the conversation at index 1
new_training_frame['user_text'][1]

['you can talk to me now',
 'good',
 "no i don't know",
 'nothing',
 'no',
 'there is like a person who wants to kidnap me in it will kidnap kids it will kill kids',
 'movie',
 'yes',
 "i don't work",
 'a soda',
 'i wanted a visit',
 "i've never been on a plane before",
 'not no',
 "where may we're going to disney world",
 'spin the night my name is',
 'yeah but i wish you were a human too',
 'no',
 'yeah',
 'ugh',
 'aye',
 "yeah but you need to be charged and there's no black places are charge or anything",
 'no never been nothing',
 "alexa but we're but we're gonna go to florida",
 'going on going to restaurants',
 'i go rock climbing',
 'ugh',
 'yes',
 'alexa do you even know my name',
 'another',
 'it would be playing outside because spring and flowers are so beautiful',
 'i like playing roblox',
 'yeah',
 'ulcer',
 "i can't play roblox because my phone ever ever keeps dying",
 "i like i like fashion famous on roblox because it's so much fun with dresses on their",
 'oh yeah',
 'ye

In [39]:
# Utterance-length statistics for the user utterances of the conversation at index 1
get_user_text_metrics(new_training_frame['user_text'][1])

{'USER_UTTER_MAX': 22,
 'USER_UTTER_MEAN': 4.943181818181818,
 'USER_UTTER_MIN': 1,
 'USER_UTTER_STD': 4.913705217749855}

In [40]:
def extract_features(dataframe, last_n_turns = False, n_turns = 0, CONV_LENGTH = False, USER_WORD_COUNT = False, user_unigrams = False, user_bigrams = False, user_trigrams = False, system_unigrams = False, 
                     system_bigrams = False, system_trigrams = False, profane_count = False, topic_counts = False, topic_freq=False, desired_topics=None, include_topic_total=False, topic_metrics=False, 
                     topic_metrics_to_use=["mean", "std", "max", "min"], midas_counts=False, midas_freq = False, desired_midas=None, liwc_scores=False, rg_counts=False, rg_freq=False, desired_rgs=None, 
                     system_utter_metrics=False, system_metrics_to_use=["mean", "std", "max", "min"], user_utter_metrics=False, user_metrics_to_use=["mean", "std", "max", "min"]):
  """
  Extract features from a dataframe of compiled conversations and return
  them as a list of feature vectors (dictionaries), one per conversation.

  The dataframe must contain the columns 'user_text', 'system_text',
  'user_tokens', 'system_tokens', 'topic_dist', 'midas_dist' and 'rg_dist';
  'conversation_length', 'user_token_count', 'profanities' and 'LIWC' are
  read only when the corresponding feature is switched on.

  Params (each feature group is off unless its flag is True):
    last_n_turns, n_turns : if last_n_turns is True, the n-gram features are
      built from the last n_turns user/system utterances only (re-tokenized
      with nltk.word_tokenize) instead of the precomputed tokens. If a
      conversation has fewer than n_turns utterances, all of them are used;
      n_turns <= 0 gives no tokens.
    CONV_LENGTH : add 'CONV_LENGTH', the conversation length
    USER_WORD_COUNT : add 'USER_WORD_COUNT', the user token count
    user_unigrams / user_bigrams / user_trigrams : user n-gram counts (capped at 3)
    system_unigrams / system_bigrams / system_trigrams : system n-gram counts (capped at 3)
    profane_count : add 'profanities', the profanity total divided by the
      conversation length, rounded to 3 places
    topic_counts / topic_freq : topic counts / relative frequencies,
      restricted to desired_topics if given; include_topic_total adds the total
    topic_metrics, topic_metrics_to_use : statistics of the topic counts
    midas_counts / midas_freq : midas tag counts / relative frequencies,
      restricted to desired_midas if given
    liwc_scores : add all LIWC scores
    rg_counts / rg_freq : response generator counts / relative frequencies,
      restricted to desired_rgs if given
    system_utter_metrics, system_metrics_to_use : statistics of system utterance lengths
    user_utter_metrics, user_metrics_to_use : statistics of user utterance lengths
  Returns:
    list of dicts mapping feature name -> value
  """
  feature_vectors = []
  for i in range(len(dataframe)):
    vector = {}

    # Gather the per-conversation inputs
    user_text = dataframe['user_text'][i]
    system_text = dataframe['system_text'][i]
    user_tokens = dataframe['user_tokens'][i]
    system_tokens = dataframe['system_tokens'][i]
    topic_dist = dataframe['topic_dist'][i]
    midas_dist = dataframe['midas_dist'][i]
    rg_dist = dataframe['rg_dist'][i]
    if last_n_turns:
      # Slice from the end with [-n_turns:] so that asking for more turns than
      # the conversation has yields the whole conversation. (Computing the
      # start as len - n_turns goes negative in that case and silently drops
      # utterances.) [-0:] would be the whole list, hence the explicit guard.
      recent_user_text = user_text[-n_turns:] if n_turns > 0 else []
      recent_system_text = system_text[-n_turns:] if n_turns > 0 else []
      user_tokens = nltk.word_tokenize(" ".join(recent_user_text))
      system_tokens = nltk.word_tokenize(" ".join(recent_system_text))

    if CONV_LENGTH:
      # NOTE(review): the previous code compared the length against 20 but
      # stored the same value on both branches, so the length is uncapped;
      # confirm whether a cap at 20 was intended
      vector["CONV_LENGTH"] = dataframe['conversation_length'][i]
    if USER_WORD_COUNT:
      vector["USER_WORD_COUNT"] = dataframe['user_token_count'][i]
    if user_unigrams:
      vector.update(get_ngram_features(user_tokens, label='USER', n=1, bin=3, lowercase=True))
    if user_bigrams:
      vector.update(get_ngram_features(user_tokens, label='USER', n=2, bin=3, lowercase=True))
    if user_trigrams:
      vector.update(get_ngram_features(user_tokens, label='USER', n=3, bin=3, lowercase=True))
    if system_unigrams:
      vector.update(get_ngram_features(system_tokens, label='SYSTEM', n=1, bin=3, lowercase=True))
    if system_bigrams:
      vector.update(get_ngram_features(system_tokens, label='SYSTEM', n=2, bin=3, lowercase=True))
    if system_trigrams:
      vector.update(get_ngram_features(system_tokens, label='SYSTEM', n=3, bin=3, lowercase=True))
    if profane_count:
      vector["profanities"] = round(dataframe["profanities"][i] / dataframe['conversation_length'][i], 3)
    # Count features pass bin=np.inf so that counts are never capped;
    # frequency features ignore bin
    if topic_counts:
      vector.update(get_topic_features(topic_dist, bin=np.inf, freq=False, desired_topics=desired_topics, include_total=include_topic_total))
    if topic_freq:
      vector.update(get_topic_features(topic_dist, bin=np.nan, freq=True, desired_topics=desired_topics, include_total=include_topic_total))
    if topic_metrics:
      vector.update(get_topic_metrics(topic_dist, metrics=topic_metrics_to_use))
    if midas_counts:
      vector.update(get_midas_features(midas_dist, bin=np.inf, freq=False, desired_midas=desired_midas))
    if midas_freq:
      vector.update(get_midas_features(midas_dist, bin=np.nan, freq=True, desired_midas=desired_midas))
    if liwc_scores:
      vector.update(get_liwc_features(dataframe['LIWC'][i]))
    if rg_counts:
      vector.update(get_rg_features(rg_dist, bin=np.inf, freq=False, desired_rgs=desired_rgs))
    if rg_freq:
      vector.update(get_rg_features(rg_dist, bin=np.nan, freq=True, desired_rgs=desired_rgs))
    if system_utter_metrics:
      vector.update(get_system_text_metrics(system_text, metrics=system_metrics_to_use))
    if user_utter_metrics:
      vector.update(get_user_text_metrics(user_text, metrics=user_metrics_to_use))

    # Append to feature_vectors
    feature_vectors.append(vector)

  return feature_vectors

In [41]:
# Demo of some of the new features
# Utterance-length statistics for both speakers plus response generator
# frequencies limited to the two generators named in desired_rgs
feature_vectors = extract_features(new_training_frame, system_utter_metrics=True, system_metrics_to_use=['mean', 'std', 'max', 'min'], user_utter_metrics=True, user_metrics_to_use=['mean', 'std', 'max', 'min'], rg_freq=True, desired_rgs=['RepeatGenerator', 'redquestion'])
print(feature_vectors[0])
print(feature_vectors[1])

{'RG_FREQ_RepeatGenerator': 0.029, 'SYSTEM_UTTER_MEAN': 33.114285714285714, 'SYSTEM_UTTER_STD': 14.828103488350237, 'SYSTEM_UTTER_MAX': 71, 'SYSTEM_UTTER_MIN': 4, 'USER_UTTER_MEAN': 2.4722222222222223, 'USER_UTTER_STD': 3.1134663703830454, 'USER_UTTER_MAX': 16, 'USER_UTTER_MIN': 1}
{'SYSTEM_UTTER_MEAN': 28.804597701149426, 'SYSTEM_UTTER_STD': 18.72231386160398, 'SYSTEM_UTTER_MAX': 120, 'SYSTEM_UTTER_MIN': 3, 'USER_UTTER_MEAN': 4.943181818181818, 'USER_UTTER_STD': 4.913705217749855, 'USER_UTTER_MAX': 22, 'USER_UTTER_MIN': 1}


In [42]:
# Topic-count statistics only (all four default metrics)
feature_vectors = extract_features(new_training_frame, topic_metrics=True)
print(feature_vectors[0])
print(feature_vectors[2])

{'TOPIC_DIST_MEAN': 3.7777777777777777, 'TOPIC_DIST_STD': 4.98392477600129, 'TOPIC_DIST_MAX': 16, 'TOPIC_DIST_MIN': 1}
{'TOPIC_DIST_MEAN': 6.5, 'TOPIC_DIST_STD': 2.5, 'TOPIC_DIST_MAX': 9, 'TOPIC_DIST_MIN': 4}


In [43]:
# Conversation length plus user and system bigrams taken from the last 3 turns
feature_vectors = extract_features(new_training_frame, last_n_turns=True, n_turns=3, CONV_LENGTH=True, user_bigrams=True, system_bigrams=True)
print(feature_vectors[0])
print(feature_vectors[1])

{'CONV_LENGTH': 35, 'USER_2-GRAM_no_end': 1, 'USER_2-GRAM_end_chat': 1, 'USER_2-GRAM_chat_stop': 1, 'SYSTEM_2-GRAM_so_here': 1, "SYSTEM_2-GRAM_here_'s": 1, "SYSTEM_2-GRAM_'s_an": 1, 'SYSTEM_2-GRAM_an_odd': 1, 'SYSTEM_2-GRAM_odd_topic': 1, 'SYSTEM_2-GRAM_topic_.': 1, 'SYSTEM_2-GRAM_._pirates': 1, 'SYSTEM_2-GRAM_pirates_!': 1, 'SYSTEM_2-GRAM_!_i': 1, 'SYSTEM_2-GRAM_i_can': 1, 'SYSTEM_2-GRAM_can_even': 1, 'SYSTEM_2-GRAM_even_talk': 1, 'SYSTEM_2-GRAM_talk_like': 1, 'SYSTEM_2-GRAM_like_a': 1, 'SYSTEM_2-GRAM_a_pirate': 1, 'SYSTEM_2-GRAM_pirate_!': 1, 'SYSTEM_2-GRAM_!_arr': 1, 'SYSTEM_2-GRAM_arr_my': 1, 'SYSTEM_2-GRAM_my_matey': 1, 'SYSTEM_2-GRAM_matey_.': 1, 'SYSTEM_2-GRAM_._i': 2, 'SYSTEM_2-GRAM_i_know': 1, 'SYSTEM_2-GRAM_know_a': 1, 'SYSTEM_2-GRAM_a_lot': 1, 'SYSTEM_2-GRAM_lot_of': 1, 'SYSTEM_2-GRAM_of_history': 1, 'SYSTEM_2-GRAM_history_and': 1, 'SYSTEM_2-GRAM_and_odd': 1, 'SYSTEM_2-GRAM_odd_facts': 1, 'SYSTEM_2-GRAM_facts_about': 1, 'SYSTEM_2-GRAM_about_pirates': 2, 'SYSTEM_2-GRAM_pirate

# Binary Classification
In this section, we create classifiers which distinguish between two of the rating groups.

In [44]:
def get_training_tuning_sets(train1, tune1, rating1, train2, tune2, rating2, min_length, shuffle=True, random_state=42):
  """
  Build the combined training and tuning frames for a two-group experiment.

  Every input dataframe is passed through compile_info together with its
  group's rating and min_length. The two compiled training frames are then
  concatenated (likewise the two tuning frames), and each combined frame is
  handed to tokenize_frame_texts, sum_frame_tokens and get_liwc_scores, which
  are called for their effect on the frame (their return values are unused).

  Params:
    train1 : dataframe of the training data of the first rating group
    tune1 : dataframe of the tuning data of the first rating group
    rating1 : label given to every conversation of the first group
    train2 : dataframe of the training data of the second rating group
    tune2 : dataframe of the tuning data of the second rating group
    rating2 : label given to every conversation of the second group
    min_length : forwarded unchanged to compile_info for every dataframe
    shuffle : bool, whether the rows of both results are shuffled before returning
    random_state : int, seed for the shuffle (only used if shuffle is True)
  Returns:
    training_frame : combined, preprocessed training dataframe
    tuning_frame : combined, preprocessed tuning dataframe
  """
  compiled_training = []
  compiled_tuning = []
  for train_df, tune_df, group_rating in ((train1, tune1, rating1), (train2, tune2, rating2)):
    # Label each group's conversations with that group's rating
    compiled_training.append(compile_info(train_df, rating=group_rating, min_length=min_length))
    compiled_tuning.append(compile_info(tune_df, rating=group_rating, min_length=min_length))

  # One frame per split, re-indexed from zero
  training_combo = pd.concat(compiled_training, ignore_index=True)
  tuning_combo = pd.concat(compiled_tuning, ignore_index=True)

  # Slow preprocessing is done once here so feature extraction can reuse it
  for preprocess in (tokenize_frame_texts, sum_frame_tokens, get_liwc_scores):
    preprocess(training_combo)
    preprocess(tuning_combo)

  if not shuffle:
    return training_combo, tuning_combo

  # Reproducible shuffle of both splits
  shuffled_training = training_combo.sample(frac=1, random_state=random_state).reset_index(drop=True)
  shuffled_tuning = tuning_combo.sample(frac=1, random_state=random_state).reset_index(drop=True)
  return shuffled_training, shuffled_tuning

In [None]:
def get_training_tuning_sets_reg(train1, tune1, rating1, train2, tune2, rating2, train3, tune3, rating3, train4, tune4, rating4, train5, tune5, rating5, min_length, shuffle=True, random_state=42):
  """
  Build the combined training and tuning frames from five rating groups.

  Five-group counterpart of get_training_tuning_sets. Every input dataframe is
  passed through compile_info together with its group's rating and min_length.
  The five compiled training frames are concatenated (likewise the five tuning
  frames), and each combined frame is handed to tokenize_frame_texts,
  sum_frame_tokens and get_liwc_scores, which are called for their effect on
  the frame (their return values are unused).

  Params:
    trainK : dataframe of the training data of the K-th rating group (K = 1..5)
    tuneK : dataframe of the tuning data of the K-th rating group (K = 1..5)
    ratingK : label given to every conversation of the K-th group (K = 1..5)
    min_length : forwarded unchanged to compile_info for every dataframe
    shuffle : bool, whether the rows of both results are shuffled before returning
    random_state : int, seed for the shuffle (only used if shuffle is True)
  Returns:
    training_frame : combined, preprocessed training dataframe
    tuning_frame : combined, preprocessed tuning dataframe
  """
  rating_groups = (
      (train1, tune1, rating1),
      (train2, tune2, rating2),
      (train3, tune3, rating3),
      (train4, tune4, rating4),
      (train5, tune5, rating5),
  )

  # Compile info with rating for each dataframe (group order is preserved)
  compiled_training = []
  compiled_tuning = []
  for train_df, tune_df, group_rating in rating_groups:
    compiled_training.append(compile_info(train_df, rating=group_rating, min_length=min_length))
    compiled_tuning.append(compile_info(tune_df, rating=group_rating, min_length=min_length))

  # Combine training dataframes and tuning dataframes
  training_combo = pd.concat(compiled_training, ignore_index=True)
  tuning_combo = pd.concat(compiled_tuning, ignore_index=True)

  # Preprocess information (slow, but saves time in feature extraction by doing all the hard work here)
  for preprocess in (tokenize_frame_texts, sum_frame_tokens, get_liwc_scores):
    preprocess(training_combo)
    preprocess(tuning_combo)

  # Optional shuffle
  if shuffle:
    training_combo = training_combo.sample(frac=1, random_state=random_state).reset_index(drop=True)
    tuning_combo = tuning_combo.sample(frac=1, random_state=random_state).reset_index(drop=True)
  return training_combo, tuning_combo

In [45]:
# Load the profanity-tagged training/tuning partitions.
# Note the numbering: *1 is rating 1, *2 is rating 5 and *3 is rating 2.
train1 = open_file(DATA_DIR + "/Training Data/rating-1-training-set-profanity-tagged.tsv")
tune1 = open_file(DATA_DIR + "/Tuning Data/rating-1-tuning-set-profanity-tagged.tsv")
train2 = open_file(DATA_DIR + "/Training Data/rating-5-training-set-profanity-tagged.tsv")
tune2 = open_file(DATA_DIR + "/Tuning Data/rating-5-tuning-set-profanity-tagged.tsv")
train3 = open_file(DATA_DIR + "/Training Data/rating-2-training-set-profanity-tagged.tsv")
tune3 = open_file(DATA_DIR + "/Tuning Data/rating-2-tuning-set-profanity-tagged.tsv")
# Ratings 1 and 2 are pooled into a single group via combine (defined earlier in the notebook)
trainBad = combine(train1, train3)
tuneBad = combine(tune1, tune3)

In [46]:
# Binary task: the pooled rating-1/rating-2 frames are labelled "bad" and the
# rating-5 frames are labelled "good"; min_length=7 is forwarded to compile_info.
training_set, tuning_set = get_training_tuning_sets(train1=trainBad,
                                                    tune1=tuneBad, 
                                                    rating1="bad", 
                                                    train2=train2, 
                                                    tune2=tune2, 
                                                    rating2="good",
                                                    min_length = 7, 
                                                    shuffle=True, 
                                                    random_state=42)

In [47]:
print(training_set.shape, tuning_set.shape)
training_set

(13331, 15) (1679, 15)


Unnamed: 0,conversation_id,rating,date,conversation_length,user_text,system_text,topic_dist,rg_dist,profanities,midas,midas_dist,user_tokens,system_tokens,user_token_count,LIWC
0,1ba8bc33b275aeada15869c1e3805de85bd6d2cb4dca2b...,good,2021-06-07,59,"[let's chat, good, caleb, yeah, yeah, no, go o...","[Hi, this is an Alexa Prize Socialbot. I hope ...","{'introduction': 18, 'movies': 24, 'neutral': ...","{'INTRODUCTION': 18, 'DM_GENERATOR': 1, 'CENTE...",1,"[[], [comment], [statement], [back-channeling]...","{'comment': 2, 'statement': 8, 'back-channelin...","[let's, chat, good, caleb, yeah, yeah, go, vac...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",90,"{'Word Count': 141, 'All Punctuation': 43.9716..."
1,124a3f8842221a21dc505129d9645c95bd19cfcd8f66b3...,good,2021-03-19,12,"[talk to me, good, tanya, yes, repeat, i didn'...","[Hi, this is an Alexa Prize Socialbot. How are...",{'introduction': 11},"{'INTRODUCTION': 8, 'RepeatGenerator': 3, 'GOO...",0,"[[], [back-channeling], [statement], [pos_answ...","{'back-channeling': 3, 'statement': 4, 'pos_an...","[talk, good, tanya, yes, repeat, wear, green, ...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",19,"{'Word Count': 35, 'Social Processes': 2.85714..."
2,428e47f2eab42fc4ce68e596b4f670571c85a96157a5c5...,bad,2021-02-02,10,"[talk to me, good, yes, nothing, restaurants, ...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 9, 'nutrition': 1}","{'INTRODUCTION': 9, 'DM_GENERATOR': 1}",0,"[[], [back-channeling], [pos_answer], [other_a...","{'back-channeling': 1, 'pos_answer': 2, 'other...","[talk, good, yes, nothing, restaurants, amazin...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",16,"{'Word Count': 28, 'Social Processes': 7.14285..."
3,fcdadfd90c4439c26e04b84fac7195fdaf7340cfbae549...,bad,2021-03-28,10,"[let's chat, good real good, steve, yes, yes, ...","[Hi, this is an Alexa Prize Socialbot. How's i...",{'introduction': 10},"{'INTRODUCTION': 9, 'RepeatGenerator': 1}",0,"[[], [comment, back-channeling, comment], [sta...","{'comment': 2, 'back-channeling': 2, 'statemen...","[let's, chat, good, real, good, steve, yes, ye...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",19,"{'Word Count': 25, 'All Punctuation': 40.0, 'O..."
4,09660507d5cc13c713e64c1e7f6285a33b3bfc0472242e...,good,2021-06-02,12,"[let's chat, good, amelia, yes, yeah, no, uh-o...","[Hi, this is an Alexa Prize Socialbot. I hope ...",{'introduction': 12},{'INTRODUCTION': 12},1,"[[], [comment], [statement], [pos_answer], [po...","{'comment': 1, 'statement': 5, 'pos_answer': 2...","[let's, chat, good, amelia, yes, yeah, uh-oh, ...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",13,"{'Word Count': 24, 'All Punctuation': 54.16666..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13326,0eaf5be3b8f18a5b8301ba016976b0ab547cffa68a5331...,good,2021-03-19,20,"[let's chat, it's going great this what are ne...","[Hi, this is an Alexa Prize Socialbot. How's i...","{'introduction': 12, 'harry_potter': 6, 'menu_...","{'INTRODUCTION': 12, 'DM_GENERATOR': 1, 'HARRY...",1,"[[], [statement, comment, abandon, open_questi...","{'statement': 11, 'comment': 5, 'abandon': 3, ...","[let's, chat, going, great, new, students, com...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",73,"{'Word Count': 117, 'All Punctuation': 17.0940..."
13327,f4e0ff68bc826bdf2e25d5109a3088c91359782da9e6d7...,good,2021-04-14,15,"[let's chat, good how's it going to you, thank...","[Hi, this is an Alexa Prize Socialbot. How's i...","{'introduction': 13, 'neutral': 2}","{'INTRODUCTION': 11, 'SB_INDEX': 3, 'RepeatGen...",0,"[[], [open_question_factual, back-channeling, ...","{'open_question_factual': 3, 'back-channeling'...","[let's, chat, good, how's, going, thanks, cath...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",35,"{'Word Count': 59, 'All Punctuation': 35.59322..."
13328,f8cc0427f2aa6b1ab7099d097e65c673071578dd58b476...,good,2021-04-06,10,"[let's chat, i'm great, big fat so, no i can't...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 8, 'nature': 1, 'video_games'...","{'INTRODUCTION': 8, 'DM_GENERATOR': 1, 'VIDEO_...",2,"[[], [statement], [abandon], [statement, neg_a...","{'statement': 5, 'abandon': 1, 'neg_answer': 1...","[let's, chat, i'm, great, big, fat, can't, wal...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",27,"{'Word Count': 44, 'All Punctuation': 22.72727..."
13329,3ba1ed533d34bd2f986a7f1d2b83ff3776f9bf606df972...,bad,2021-03-18,31,"[have a conversation, i'm okay how are you, ne...","[Hi, this is an Alexa Prize Socialbot. How are...","{'introduction': 11, 'nature': 9, 'food': 8, '...","{'INTRODUCTION': 10, 'RepeatGenerator': 2, 'DM...",1,"[[], [open_question_factual, back-channeling, ...","{'open_question_factual': 6, 'back-channeling'...","[conversation, i'm, okay, next, wanna, know, u...","[Hi, ,, this, is, an, Alexa, Prize, Socialbot,...",56,"{'Word Count': 118, 'Six Letter Words': 12.711..."


In [48]:
# Baseline accuracy: the fraction of tuning conversations labelled 'good',
# i.e. the accuracy obtained by always predicting 'good'
print("Baseline Accuracy: {}".format(np.sum(tuning_set['rating']=='good')/len(tuning_set)))

Baseline Accuracy: 0.6521739130434783


Now we can extract features from the training set, and train some algorithms.

In [52]:
# Extract features and pair with rating

#feature_vectors = extract_features(training_set, user_unigrams=False, user_bigrams= True, profane_count= False, USER_WORD_COUNT=False, topic_counts=False, user_trigrams=True, CONV_LENGTH=True)
#feature_vectors = extract_features(training_set, last_n_turns=True, n_turns=3, CONV_LENGTH=True, user_bigrams=True, system_bigrams=True)
#topic_counts=True, desired_topics=['none'], include_topic_total=True, topic_metrics=True
#rg_counts=True, desired_rgs=['redquestion', 'RepeatGenerator'], midas_counts=True, desired_midas=['neg_answer', 'complaint'], user_unigrams=True
"""feature_vectors = extract_features(training_set, system_utter_metrics=True, system_metrics_to_use=['mean', 'std', 'max', 'min'], user_utter_metrics=True, user_metrics_to_use=['mean', 'std', 'max', 'min'], 
                                   rg_counts=True, desired_rgs=['RepeatGenerator', 'redquestion'], topic_counts=True, include_topic_total=True, topic_metrics=True, midas_counts=True, desired_midas=['neg_answer', 'complaint'],
                                   CONV_LENGTH=True, profane_count=True)"""
# Active configuration: user unigram features only (every other listed flag is off)
feature_vectors = extract_features(training_set, user_unigrams=True, user_bigrams=False, user_trigrams=False, profane_count=False, topic_freq=False, CONV_LENGTH=False, liwc_scores=False, midas_freq=False)
# Pair each feature dictionary with its conversation's rating label: (features, label)
training_features = list(zip(feature_vectors, training_set['rating']))
print(training_features[0])
print(training_features[1])

({"USER_1-GRAM_let's": 2, 'USER_1-GRAM_chat': 1, 'USER_1-GRAM_good': 1, 'USER_1-GRAM_caleb': 1, 'USER_1-GRAM_yeah': 3, 'USER_1-GRAM_go': 2, 'USER_1-GRAM_vacation': 1, 'USER_1-GRAM_definitely': 2, 'USER_1-GRAM_wearing': 1, 'USER_1-GRAM_masks': 1, 'USER_1-GRAM_movie': 2, 'USER_1-GRAM_yes': 3, 'USER_1-GRAM_probably': 3, 'USER_1-GRAM_pet': 1, "USER_1-GRAM_i'm": 1, 'USER_1-GRAM_new': 2, 'USER_1-GRAM_york': 1, 'USER_1-GRAM_need': 1, 'USER_1-GRAM_repeat': 1, 'USER_1-GRAM_may': 1, 'USER_1-GRAM_school': 2, 'USER_1-GRAM_nah': 1, 'USER_1-GRAM_maybe': 1, 'USER_1-GRAM_try': 1, 'USER_1-GRAM_stuff': 1, "USER_1-GRAM_what's": 1, 'USER_1-GRAM_favorite': 1, 'USER_1-GRAM_color': 1, 'USER_1-GRAM_wyatt': 1, 'USER_1-GRAM_wow': 2, 'USER_1-GRAM_hmm': 3, 'USER_1-GRAM_ha-ha': 1, 'USER_1-GRAM_seen': 2, 'USER_1-GRAM_high': 1, 'USER_1-GRAM_musical': 1, 'USER_1-GRAM_two': 1, 'USER_1-GRAM_love': 1, 'USER_1-GRAM_knew': 1, 'USER_1-GRAM_today': 1, 'USER_1-GRAM_aye': 2, 'USER_1-GRAM_ooh': 1, 'USER_1-GRAM_indian': 1, 'USE

## Train a Model

In [53]:
# Here we train a model on the feature list we extracted
# NaiveBayesClassifer from nltk
#classifier = NaiveBayesClassifier.train(training_features)
# sklearn BernoulliNB classifier
#classifier = SklearnClassifier(BernoulliNB()).train(training_features)
# Decision tree
#classifier = SklearnClassifier(DecisionTreeClassifier()).train(training_features)
# SVM
#classifier = SklearnClassifier(SVC()).train(training_features)
# MLP
#classifier = SklearnClassifier(MLPClassifier(max_iter=400)).train(training_features)
# Logistic Regression
# (LinearRegression is a regressor: it predicts real numbers, which
# SklearnClassifier cannot map back to the 'bad'/'good' labels, so the linear
# model used for classification here is LogisticRegression. max_iter is raised
# because the default iteration budget is often too small for large sparse
# n-gram feature sets.)
classifier = SklearnClassifier(linear_model.LogisticRegression(max_iter=1000)).train(training_features)

## Evaluate

In [None]:
from numpy.core.numeric import False_
# Function to get accuracy, precision, recall, f_measure, and confusion matrix
def evaluate_model(model, data, mistakes=False, last_n_turns = False, n_turns = 0, CONV_LENGTH = False, USER_WORD_COUNT = False, user_unigrams = False, user_bigrams = False, user_trigrams = False, system_unigrams = False, 
                   system_bigrams = False, system_trigrams = False, profane_count = False, topic_counts = False, topic_freq=False, desired_topics=None, include_topic_total=False, topic_metrics=False, topic_metrics_to_use=["mean", "std", "max", "min"],
                   midas_counts=False, midas_freq=False, desired_midas=None, liwc_scores=False, rg_counts=False, rg_freq=False, desired_rgs=None, system_utter_metrics=False, system_metrics_to_use=["mean", "std", "max", "min"], 
                   user_utter_metrics=False, user_metrics_to_use=["mean", "std", "max", "min"]):
  """
  Evaluate a trained binary classifier on a labelled dataframe.

  Features are extracted from data with extract_features using the feature
  flags given here (they should match the flags used to train the model),
  the model classifies them, and the predictions are scored against
  data['rating'].

  Params:
    model : trained classifier exposing classify_many(list_of_feature_dicts)
    data : frame with a 'rating' column, in the form expected by extract_features
    mistakes : bool, if True also return the positions of misclassified rows
    remaining keyword arguments : feature-selection options forwarded unchanged
      to extract_features (including profane_count)
  Returns:
    A 9-tuple:
      accuracy, balanced_acc, precision, recall, f_score : rounded to 3 decimals
      matrix : sklearn confusion matrix (rows = true labels, columns = predictions)
      nltk_matrix : nltk ConfusionMatrix built from the same labels and predictions
      mistake_indices : list of misclassified positions, or None if mistakes is False
      most_informative : model.most_informative_features(100) for an nltk
        NaiveBayesClassifier, otherwise an empty list
  Notes:
    The positive class is index 1 of the confusion matrix. sklearn orders the
    labels by sorting them, so for 'bad'/'good' labels the positive class is
    'good'. Precision, recall and f_score are nan when their denominator is zero.
  """
  # Collect the feature options once so that none of them can be forgotten
  # when forwarding (profane_count used to be dropped here).
  feature_options = dict(
      last_n_turns=last_n_turns, n_turns=n_turns, CONV_LENGTH=CONV_LENGTH, USER_WORD_COUNT=USER_WORD_COUNT,
      user_unigrams=user_unigrams, user_bigrams=user_bigrams, user_trigrams=user_trigrams,
      system_unigrams=system_unigrams, system_bigrams=system_bigrams, system_trigrams=system_trigrams,
      profane_count=profane_count, topic_counts=topic_counts, topic_freq=topic_freq, desired_topics=desired_topics,
      include_topic_total=include_topic_total, topic_metrics=topic_metrics, topic_metrics_to_use=topic_metrics_to_use,
      midas_counts=midas_counts, midas_freq=midas_freq, desired_midas=desired_midas, liwc_scores=liwc_scores,
      rg_counts=rg_counts, rg_freq=rg_freq, desired_rgs=desired_rgs,
      system_utter_metrics=system_utter_metrics, system_metrics_to_use=system_metrics_to_use,
      user_utter_metrics=user_utter_metrics, user_metrics_to_use=user_metrics_to_use)

  # Get feature vectors and gold labels for the data
  test_vectors = extract_features(data, **feature_options)
  test_labels = list(data['rating'])
  # Make predictions (once; every metric below reuses them)
  predictions = model.classify_many(test_vectors)

  # Get the misclassification indices
  mistake_indices = None
  if mistakes:
    mistake_indices = [i for i, (predicted, gold) in enumerate(zip(predictions, test_labels)) if not predicted == gold]

  # Plain accuracy, computed from the predictions already made rather than
  # classifying the whole dataset a second time
  correct = [gold == predicted for gold, predicted in zip(test_labels, predictions)]
  accuracy = round(sum(correct)/len(correct), 3) if correct else 0

  matrix = confusion_matrix(test_labels, predictions)
  TP = matrix[1, 1]
  TN = matrix[0, 0]
  FP = matrix[0, 1]
  FN = matrix[1, 0]
  # Balanced accuracy: mean of the per-class recalls, robust to class imbalance
  TPR = TP/np.sum(matrix[1])
  TNR = TN/np.sum(matrix[0])
  balanced_acc = round((TPR+TNR)/2, 3)
  nltk_matrix = ConfusionMatrix(test_labels, predictions)
  precision = round(TP/(TP+FP), 3)
  recall = round(TP/(TP+FN), 3)
  f_score = round((2*precision*recall)/(precision + recall), 3)

  # Only nltk's Naive Bayes model can report its most informative features
  most_informative = []
  if isinstance(model, NaiveBayesClassifier):
    most_informative = model.most_informative_features(100)
  return accuracy, balanced_acc, precision, recall, f_score, matrix, nltk_matrix, mistake_indices, most_informative

In [None]:
# Evaluate the trained model on the tuning set
# Note: Balanced Accuracy is a way of measuring the accuracy while taking into account
# an imbalance in the number of samples in each category
#acc, balanced_acc, prec, recall, f_score, matrix, nltk_matrix, mistakes, most_informative = evaluate_model(classifier, tuning_set, user_unigrams=False, user_bigrams = True, profane_count= True, USER_WORD_COUNT=False, topic_counts=False, user_trigrams= True, CONV_LENGTH = True)
#acc, balanced_acc, prec, recall, f_score, matrix, nltk_matrix, mistakes, most_informative = evaluate_model(classifier, tuning_set, user_unigrams=True, profane_count=False, topic_freq=False, user_bigrams=False, user_trigrams=False, CONV_LENGTH=False, liwc_scores=False, midas_freq=False)
# NOTE(review): the feature flags passed here (unigrams, bigrams, trigrams,
# midas_freq) differ from the flags used when training_features was built
# (user_unigrams only). Evaluation flags should normally mirror the training
# flags; confirm which configuration is intended.
acc, balanced_acc, prec, recall, f_score, matrix, nltk_matrix, mistakes, most_informative = evaluate_model(classifier, tuning_set, user_unigrams=True, user_bigrams=True, user_trigrams=True, midas_freq=True)
print("Accuracy: {}".format(acc))
print("Balanced Accuracy: {}".format(balanced_acc))
print("Precision: {}".format(prec))
print("Recall: {}".format(recall))
print("f_measure: {}".format(f_score))
print("Confusion matrix:\n{!s}".format(nltk_matrix))

In [None]:
print(most_informative)

In [None]:
# show_most_informative_features is provided by nltk's own classifiers (e.g.
# NaiveBayesClassifier) but not by SklearnClassifier wrappers, so only call it
# when the current model actually has it.
if hasattr(classifier, "show_most_informative_features"):
  classifier.show_most_informative_features(100)
else:
  print("{} does not report most informative features.".format(type(classifier).__name__))

# Feature Selection

In [None]:
# Run feature selection with most informative features from Naive Bayes
def run_feature_selection(train_df, test_df, model_obj, start, end, step, last_n_turns = False, n_turns = 0, CONV_LENGTH = False, USER_WORD_COUNT = False, user_unigrams = False, user_bigrams = False, user_trigrams = False, 
                          system_unigrams = False, system_bigrams = False, system_trigrams = False, profane_count = False, topic_counts = False, topic_freq=False, desired_topics=None, include_topic_total=False, 
                          topic_metrics=False, topic_metrics_to_use=["mean", "std", "max", "min"], midas_counts=False, midas_freq=False, desired_midas=None, liwc_scores=False, rg_counts=False, rg_freq=False, 
                          desired_rgs=None, system_utter_metrics=False, system_metrics_to_use=["mean", "std", "max", "min"], user_utter_metrics=False, user_metrics_to_use=["mean", "std", "max", "min"]):
  """
  Select features using the most informative features of a Naive Bayes model.

  All requested features are extracted from train_df and used to train an nltk
  NaiveBayesClassifier, whose most_informative_features(end) supplies a ranked
  feature list. For every size i in range(start, end, step) (end is exclusive)
  the training vectors are reduced to the first i ranked features, model_obj is
  trained on them, and the result is scored on test_df with evaluate_model.

  Params:
    train_df : training frame with a 'rating' column
    test_df : evaluation frame with a 'rating' column
    model_obj : object whose train(labelled_features) returns a trained model
    start, end, step : feature-subset sizes to try, as in range(start, end, step)
    remaining keyword arguments : feature-selection options forwarded unchanged
      to both extract_features and evaluate_model (including profane_count)
  Returns:
    (best f_measure, number of features used, labelled feature list) for the
    subset size with the highest f_measure. If no subset produced an f_measure
    greater than -inf (empty range, or every f_measure was nan) this is
    (-inf, 0, None).
  """
  # Single source of truth for the feature options, shared by training-feature
  # extraction and evaluation so the two can never drift apart (profane_count
  # used to be dropped from both calls).
  feature_options = dict(
      last_n_turns=last_n_turns, n_turns=n_turns, CONV_LENGTH=CONV_LENGTH, USER_WORD_COUNT=USER_WORD_COUNT,
      user_unigrams=user_unigrams, user_bigrams=user_bigrams, user_trigrams=user_trigrams,
      system_unigrams=system_unigrams, system_bigrams=system_bigrams, system_trigrams=system_trigrams,
      profane_count=profane_count, topic_counts=topic_counts, topic_freq=topic_freq, desired_topics=desired_topics,
      include_topic_total=include_topic_total, topic_metrics=topic_metrics, topic_metrics_to_use=topic_metrics_to_use,
      midas_counts=midas_counts, midas_freq=midas_freq, desired_midas=desired_midas, liwc_scores=liwc_scores,
      rg_counts=rg_counts, rg_freq=rg_freq, desired_rgs=desired_rgs,
      system_utter_metrics=system_utter_metrics, system_metrics_to_use=system_metrics_to_use,
      user_utter_metrics=user_utter_metrics, user_metrics_to_use=user_metrics_to_use)

  # Rank features with a Naive Bayes model trained on everything
  feature_vectors = extract_features(train_df, **feature_options)
  training_features = list(zip(feature_vectors, train_df['rating']))
  ranking_model = NaiveBayesClassifier.train(training_features)
  informative_features = [feat for feat, val in ranking_model.most_informative_features(end)]
  print(len(informative_features))

  best = (-1*np.inf, 0, None)
  for i in range(start, end, step):
    # Keep only the i highest-ranked features in every training vector
    subset = set(informative_features[:i])
    new_features = [({key: value for key, value in vector.items() if key in subset}, label)
                    for vector, label in training_features]
    # We now have a new set of features to train on
    model = model_obj.train(new_features)
    accuracy, balanced_acc, precision, recall, f_measure, matrix, nltk_matrix, mistakes, informative = evaluate_model(
        model, test_df, mistakes=False, **feature_options)
    print("Accuracy {} and f_measure {} with {} features.".format(accuracy, f_measure, i))
    if f_measure > best[0]:
      best = (f_measure, i, new_features)
  # Return the best f_measure, the number of features associated and the features themselves
  return best


In [None]:
# Naive Bayes
#best_acc, best_num, best_features = run_feature_selection(training_set, tuning_set, NaiveBayesClassifier, 125000, 135000, 1000, user_unigrams=True, user_bigrams=True, user_trigrams=True, midas_freq=True)
#best_acc, best_num, best_features = run_feature_selection(training_set, tuning_set, NaiveBayesClassifier, 20, 200, 20, user_unigrams=True)
#print(best_acc, best_num)
#best_model = NaiveBayesClassifier.train(best_features)
#NBModel = best_model

# SVM
# run_feature_selection returns (best f_measure, number of features, feature list),
# so despite its name best_acc holds the best f-measure, not an accuracy.
best_acc, best_num, best_features = run_feature_selection(training_set, tuning_set, SklearnClassifier(SVC()), 1000, 100000, 10000, user_unigrams=True, user_bigrams=True, user_trigrams=True, midas_freq=True)
#best_acc, best_num, best_features = run_feature_selection(training_set, tuning_set, SklearnClassifier(SVC()), 20, 200, 20, user_unigrams=True)
#print(best_acc, best_num)
# Retrain a fresh SVM on the best-scoring feature subset
best_model = SklearnClassifier(SVC()).train(best_features)
SVMModel = best_model

In [None]:
# Score the retrained best model on the tuning set with the same feature flags
# that were passed to run_feature_selection. Balanced accuracy is computed
# but not printed here.
acc, balanced_acc, prec, recall, f_score, matrix, nltk_matrix, mistakes, informative = evaluate_model(best_model, tuning_set, mistakes=False, user_unigrams=True, user_bigrams=True, user_trigrams=True, midas_freq=True)
print("Accuracy: {}".format(acc))
print("Precision: {}".format(prec))
print("Recall: {}".format(recall))
print("f_measure: {}".format(f_score))
print("Confusion matrix:\n{!s}".format(nltk_matrix))

## Adding experiments to a file
This is a csv file containing results from experiments for binary classification. As one performs experiments, one can add rows to the binary_experiments dataframe containing information about the experiment. Then, when one is ready, one can update the csv file with the experiments contained in the dataframe.\
**TODO:** Implement a way of adding free-form notes to each experiment.

In [None]:
# Place filepath of folder containing experiment files here
TEMP_DIR = '/content/drive/MyDrive/Research/WalkerResearch/Experiments'

In [None]:
# Create binary_experiments frame from first experiment
# Uses the results of the most recent evaluate_model call; the confusion-matrix
# cells follow evaluate_model's convention (TP = matrix[1, 1], FP = matrix[0, 1],
# TN = matrix[0, 0], FN = matrix[1, 0]).
desc = "Baseline"
binary_experiments = pd.DataFrame([[desc, matrix[1, 1], matrix[0, 1], matrix[0, 0], matrix[1, 0], acc, balanced_acc, recall, prec, f_score, most_informative]], 
                                  columns=["Description", "TP", "FP", "TN", "FN", "Accuracy", "Balanced Accuracy", "Recall", "Precision", "F Measure", "Most informative"])
binary_experiments

In [None]:
# Update frame with new experiment
# (DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add rows and produces the same re-indexed result.)
desc = "user unigrams(bins of 3), user bigrams(bins of 3), midas freq, user trigrams(bins of 3)"
new_experiment = pd.DataFrame([[desc, matrix[1, 1], matrix[0, 1], matrix[0, 0], matrix[1, 0], acc, balanced_acc, recall, prec, f_score, most_informative]],
                              columns=["Description", "TP", "FP", "TN", "FN", "Accuracy", "Balanced Accuracy", "Recall", "Precision", "F Measure", "Most informative"])
binary_experiments = pd.concat([binary_experiments, new_experiment], ignore_index=True)
binary_experiments

In [None]:
def update_experiment_file(filepath, experiments):
  """
  Append every row of an experiments dataframe to a csv file.

  If the file does not exist yet or is empty, a title row is written first.
  The title row is taken from the dataframe's own column names so that it
  always has the same number of fields as the data rows (the previous
  hard-coded title row lacked the "Most informative" column).

  Params:
    filepath : path of the csv file to append to (created if missing)
    experiments : pandas DataFrame, one experiment per row
  """
  # Decide about the title row before opening, so the file is only ever appended to
  needs_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
  # newline='' lets the csv module control line endings itself
  with open(filepath, 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    if needs_header:
      csv_writer.writerow(list(experiments.columns))
    # Append dataframe rows to file
    for _, row in experiments.iterrows():
      csv_writer.writerow(list(row))

def print_experiment_file(filepath):
  """
  Print the contents of an experiments csv file.

  "Empty File" is printed when the file has no content. Otherwise the first
  row (the title row) is printed as a list, and every later row is printed as
  its description (first field) followed by the remaining fields on an
  indented line.

  Params:
    filepath : path of the csv file to print
  """
  with open(filepath, 'r') as csv_file:
    # Peek at the first character to detect an empty file, then rewind
    has_content = bool(csv_file.read(1))
    csv_file.seek(0)
    if not has_content:
      print("Empty File")
    for row_number, row in enumerate(csv.reader(csv_file, delimiter=',')):
      if row_number == 0:
        print(row)
        continue
      print("Description: " + row[0])
      print("\t" + str(row[1:]))

In [None]:
# Update file
update_experiment_file("/content/drive/Shareddrives/Alexa Prize 4 (2020 21)/Data/Rating Analysis/Rating-wise grouped conversations/Experiments/BinaryExperimentsSVM_RatingBadRating5_min7.csv", binary_experiments)

In [None]:
# Print file
print_experiment_file("/content/drive/Shareddrives/Alexa Prize 4 (2020 21)/Data/Rating Analysis/Rating-wise grouped conversations/Experiments/BinaryExperimentsSVM_RatingBadRating5_min7.csv")

# Looking at Ngrams
We calculate ngrams across all files and look at the distribution. The idea is to remove ngrams which have a very low occurrence.

In [None]:
all_files = ['rating-1-logs-2021-01-01-2021-06-16-min-len-3.tsv',
            'rating-2-logs-2021-01-01-2021-06-16-min-len-3.tsv',
            'rating-3-logs-2021-01-01-2021-06-16-min-len-3.tsv',
            'rating-4-logs-2021-01-01-2021-06-16-min-len-3.tsv',
            'rating-5-logs-2021-01-01-2021-06-16-min-len-3.tsv']

In [None]:
# Extract and compile text in each conversation
def compile_texts(dataframe, rating, min_length):
  """
  Iterate through the dataframe and compile information about each conversation.
  This information includes:
    The conversation ID 
    The length of the conversation
    The date at which the conversation took place
    The user utterances
    The system utterances

  Rows are read by position, so the frame may carry any index. Consecutive rows
  sharing a conversation_id form one conversation. The conversation length is
  the number of its rows whose turn_count is not NaN, and the date is taken
  from the conversation's last row. An empty input yields an empty result.
  
  Params:
    dataframe : pandas DataFrame of a logfile with the columns conversation_id,
      date, turn_count, text and response
    rating : int, the rating of the conversations which appear in the dataframe
    min_length : int, the minimum number of turns allowed
  Returns:
    A new pandas DataFrame containing the above information. Each row is one conversation.
  """
  columns = ['conversation_id', 'rating', 'date', 'conversation_length', 'user_text', 'system_text']
  # Nothing to compile: return an empty frame instead of failing on row 0
  if len(dataframe) == 0:
    return pd.DataFrame([], columns=columns)

  # Look the columns up once; .iloc below reads by position, not index label
  conv_ids = dataframe['conversation_id']
  dates = dataframe['date']
  turn_counts = dataframe['turn_count']
  user_utterances = dataframe['text']
  system_responses = dataframe['response']

  # compiled_info is a list of rows, each in the form of a list
  compiled_info = []
  current_conv_id = conv_ids.iloc[0]
  # Objects to hold information about the current conversation
  user_text = []
  system_text = []
  conv_len = 0
  for i in range(len(dataframe)):
    conv_id = conv_ids.iloc[i]
    # If the conversation id changes, we have a new conversation beginning
    if not conv_id == current_conv_id:
      # If the finished conversation is long enough, record it
      # (row i-1 is its last row, which supplies the date)
      if conv_len >= min_length:
        compiled_info.append([current_conv_id, rating, dates.iloc[i-1], conv_len, user_text, system_text])
      # Reset the conversation
      user_text = []
      system_text = []
      conv_len = 0
      current_conv_id = conv_id
    # If we have a non-terminal row, add one to the conversation length
    if not np.isnan(turn_counts.iloc[i]):
      conv_len += 1
    # If the user had a valid utterance, append it (missing values are not str)
    user_utter = user_utterances.iloc[i]
    if type(user_utter) is str:
      user_text.append(user_utter)
    # If the system had a valid utterance, append the extracted text
    system_utter = system_responses.iloc[i]
    if type(system_utter) is str:
      system_text.append(extract_system_text(system_utter))
    #else: The box is empty (end of a conversation)
  # Add info for last conversation (current id doesn't change at the end of the file, but we still want the conversation info)
  if conv_len >= min_length:
    compiled_info.append([current_conv_id, rating, dates.iloc[-1], conv_len, user_text, system_text])

  # Construct new dataframe from the info (each list in compiled_info becomes a row)
  return pd.DataFrame(compiled_info, columns=columns)

In [None]:
# Compile one frame of conversation texts per rating logfile. all_files is
# ordered rating 1..5, so position index in the list corresponds to rating
# index+1. Only conversations with at least 5 turns are kept.
data = []
index = 0
for logfile in all_files:
  data.append(compile_texts(open_file(GROUP_FOLDER + "/Processed Logs/" + logfile), rating=index+1, min_length=5))
  # Tokenize the frame that was just appended
  tokenize_frame_texts(data[-1])
  index += 1
# Display the rating-1 frame as a sanity check
data[0]

In [None]:
def gather_ngrams(dataframe_list, n):
  """Count lower-cased user-token n-grams across several conversation frames.

  Args:
    dataframe_list: iterable of DataFrames, each with a 'user_tokens' column
      whose cells are sequences of string tokens.
    n: number of tokens per n-gram.

  Returns:
    An nltk.FreqDist keyed by the n-gram's tokens joined with single spaces,
    mapping each n-gram to the number of times it was seen.
  """
  ngramFDist = nltk.FreqDist()
  for dataframe in dataframe_list:
    # Walk the column values directly. Looking rows up as
    # dataframe['user_tokens'][i] is a label lookup and only works when the
    # index is exactly 0..len-1 (it fails on filtered or concatenated frames).
    for row_tokens in dataframe['user_tokens']:
      tokens = [tok.lower() for tok in row_tokens]
      # Rows with fewer than n tokens cannot contribute any n-gram
      if len(tokens) >= n:
        for seq in nltk.ngrams(tokens, n=n):
          ngramFDist[" ".join(seq)] += 1
  return ngramFDist

In [None]:
# Count user-token trigrams (n=3) over every frame collected in `data`
ngram_dist = gather_ngrams(data, 3)

In [None]:
# B() is the FreqDist bin count, reported here as the number of distinct n-grams
print("There were {} unique ngrams".format(ngram_dist.B()))

In [None]:
# Display the 50 most frequent n-grams together with their counts
ngram_dist.most_common(50)

In [None]:
# Plot the counts of the 50 most frequent n-grams
ngram_dist.plot(50)

In [None]:
# Count how many distinct n-grams occur at least as often as the mean n-gram count
values = list(ngram_dist.values())
ngram_mean = np.mean(values)
above_mean = sum(1 for count in values if count >= ngram_mean)

In [None]:
# Report the percentage (relative to the number of distinct n-grams) and the
# absolute number of n-grams whose count is at least the mean count
print("{:.3f}% of the ngrams are above the mean. This amounts to {} unique ngrams.".format(above_mean*100/ngram_dist.B(), above_mean))

# TensorFlow Experiments

In [None]:
# Imports
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Functions to plot loss and accuracy history
def plot_losses(hist):
    """Show the training and validation loss curves, one point per epoch.

    hist: object exposing a `history` mapping with 'loss' and 'val_loss'
    entries (e.g. the value returned by model.fit).
    """
    for curve_key in ('loss', 'val_loss'):
        plt.plot(hist.history[curve_key])
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    # Legend entries follow the order in which the curves were drawn
    plt.legend(['Train', 'Val'])
    plt.show()
def plot_accuracies(hist):
    """Show the training and validation accuracy curves, one point per epoch.

    hist: object exposing a `history` mapping with 'accuracy' and
    'val_accuracy' entries (e.g. the value returned by model.fit).
    """
    for curve_key in ('accuracy', 'val_accuracy'):
        plt.plot(hist.history[curve_key])
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    # Legend entries follow the order in which the curves were drawn
    plt.legend(['Train', 'Val'])
    plt.show()

In [None]:
# Get training frame (or use training_set from above)
# Reads the rating-1 and rating-5 training logs, passes each through
# compile_info with its rating, and merges the two results with combine.
# compile_info and combine are defined elsewhere in the notebook; later cells
# rely on the result having 'rating', 'user_text' and 'system_text' columns.
rating_1_training_raw = open_file(DATA_DIR+"/Training Data/rating-1-training-set-profanity-tagged.tsv")
rating_1_compiled = compile_info(rating_1_training_raw, rating=1)
rating_5_training_raw = open_file(DATA_DIR+"/Training Data/rating-5-training-set-profanity-tagged.tsv")
rating_5_compiled = compile_info(rating_5_training_raw, rating=5)
training_data = combine(rating_1_compiled, rating_5_compiled)
#training_data = training_set
training_data

In [None]:
# Binary target column: rating 1 -> label 0 (bad); any other rating (here 5) -> label 1 (good)
training_data['class_label'] = (training_data['rating'] != 1).astype(int)
training_data

In [None]:
# Shuffle dataframe so the validation set isn't all one class
# frac=1 keeps every row; random_state=42 makes the permutation repeatable, and
# reset_index(drop=True) renumbers the rows 0..len-1 for the positional loops below.
# NOTE(review): this rebinds training_data, so re-running the cell without
# re-running the cells above permutes the already-shuffled frame again.
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)
training_data

In [None]:
# Get tuning frame
# Same pipeline as the training frame, applied to the tuning partition:
# read each rating's log, pass it through compile_info with its rating, then
# merge with combine (both helpers are defined elsewhere in the notebook).
rating_1_tuning_raw = open_file(DATA_DIR+"/Tuning Data/rating-1-tuning-set-profanity-tagged.tsv")
rating_1_tuning_compiled = compile_info(rating_1_tuning_raw, rating=1)
rating_5_tuning_raw = open_file(DATA_DIR+"/Tuning Data/rating-5-tuning-set-profanity-tagged.tsv")
rating_5_tuning_compiled = compile_info(rating_5_tuning_raw, rating=5)
tuning_data = combine(rating_1_tuning_compiled, rating_5_tuning_compiled)
tuning_data

In [None]:
# Binary target column: rating 1 -> label 0 (bad); any other rating (here 5) -> label 1 (good)
tuning_data['class_label'] = (tuning_data['rating'] != 1).astype(int)
tuning_data

In [None]:
# One-row frame holding the first training conversation's user and system
# utterances, used to try out the stitching helper on a single example
test = pd.DataFrame({'user_text':[training_data['user_text'][0]], 'system_text':[training_data['system_text'][0]]})
test

In [None]:
# Try the stitching helper on the single-conversation frame.
# NOTE(review): the definition of stitch_conversations_and_return visible in
# this part of the notebook sits in a later cell; unless it is also defined
# earlier, this call raises NameError on Restart & Run All.
stitch_conversations_and_return(test)

In [None]:
# Instead of using just the user texts, let's stitch together the conversation turn by turn
def stitch_conversations_and_return(dataframe):
  """Interleave each conversation's user and system utterances into one string.

  Utterances alternate user, system, user, system, ... for as long as both
  sides have one left. Whichever side has extra utterances contributes them,
  in order, after the paired turns. Every utterance is followed by ". ".

  Args:
    dataframe: DataFrame with 'user_text' and 'system_text' columns, each cell
      holding a list of utterance strings for one conversation.

  Returns:
    A list with one stitched string per row, in row order.
  """
  conversations = []
  # Iterate rows positionally so the result does not depend on the index labels
  for user_utters, system_utters in zip(dataframe['user_text'], dataframe['system_text']):
    paired_turns = min(len(user_utters), len(system_utters))
    ordered = []
    for user_turn, system_turn in zip(user_utters[:paired_turns], system_utters[:paired_turns]):
      ordered.append(user_turn)
      ordered.append(system_turn)
    # At most one of these tails is non-empty. The system tail used to be read
    # from user_utters, which raised IndexError whenever the system spoke more
    # often than the user.
    ordered.extend(user_utters[paired_turns:])
    ordered.extend(system_utters[paired_turns:])
    conversations.append("".join(utterance + ". " for utterance in ordered))
  return conversations


In [None]:
# Set up text vectorization
# max_features caps the vocabulary size and is also the Embedding input
# dimension in the model cells; embedding_dim is the Embedding output size;
# sequence_length is the fixed token length of every vectorized text.
max_features = 20000
embedding_dim = 128
sequence_length = 500
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_sequence_length=sequence_length
)
# Build the training texts: each conversation is stitched turn by turn from its
# user and system utterances (the user-only variant is kept commented out below)
#texts_ds = [". ".join(text_list) for text_list in training_data['user_text']]
texts_ds = stitch_conversations_and_return(training_data)
print(texts_ds[:2])
# Fit the vectorizer's vocabulary on the training texts only
vectorize_layer.adapt(texts_ds)

In [None]:
# Training inputs are the raw stitched strings (vectorization happens inside
# the model); targets are the 0/1 class labels in the same row order
x_train = np.array(texts_ds)
y_train = training_data['class_label'].to_numpy()
print(x_train.shape, y_train.shape)
print(x_train[0], y_train[0])

In [None]:
# Tuning inputs/targets, built the same way as the training arrays
# (the user-only variant is kept commented out below)
#x_tuning = np.array([". ".join(text_list) for text_list in tuning_data['user_text']])
x_tuning = np.array(stitch_conversations_and_return(tuning_data))
y_tuning = tuning_data['class_label'].to_numpy()
print(x_tuning.shape, y_tuning.shape)
print(x_tuning[0], y_tuning[0])

In [None]:
# Set up model structure to take raw strings, vectorize, embed, and set up output layers
# Convolutional variant. NOTE(review): the RNN cell that follows also binds
# `model`, so when both cells are run in order it is the RNN that gets compiled
# and trained; run only one of the two model cells for a given experiment.
tf.keras.backend.clear_session()
# One raw string per example; the adapted vectorizer turns it into token ids
text_input = Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(text_input)
x = Embedding(max_features, embedding_dim, mask_zero=True)(x)
x = Dropout(0.5)(x)
# Two strided 1D convolutions over the token sequence, then max over time
x = Conv1D(filters=128, kernel_size=7, strides=3, activation='relu')(x)
x = Conv1D(filters=128, kernel_size=7, strides=3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
# Single sigmoid unit: output is compared against the 0/1 class_label targets
predictions = Dense(1, activation='sigmoid')(x)
model = Model(inputs=text_input, outputs=predictions)
model.summary()

In [None]:
# Set up another model structure, this time using an RNN
# NOTE(review): this rebinds `model`, replacing whatever the previous model
# cell built; the compile/fit cell below trains whichever was defined last.
tf.keras.backend.clear_session()
# One raw string per example; the adapted vectorizer turns it into token ids
text_input = Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(text_input)
x = Embedding(max_features, embedding_dim, mask_zero=True)(x)
# Bidirectional LSTM summarises the whole token sequence into one vector
x = Bidirectional(LSTM(units=64))(x)
x = Dense(64, activation='relu')(x)
# Single sigmoid unit: output is compared against the 0/1 class_label targets
predictions = Dense(1, activation='sigmoid')(x)
model = Model(inputs=text_input, outputs=predictions)
model.summary()

In [None]:
# Folder where the fit cell writes model weight checkpoints.
# NOTE(review): hard-coded personal Drive path, unlike DATA_DIR/GROUP_FOLDER
# which point at the shared drive; adjust before running under another account.
CHECKPOINT_DIR = '/content/drive/MyDrive/Research/WalkerResearch/Model Checkpoints'

In [None]:
# Fraction of tuning conversations labelled 1, i.e. the accuracy of always
# predicting class 1.
# NOTE(review): this equals the majority-class baseline only when class 1 is
# the larger class in the tuning set.
print("Baseline Accuracy: {}".format(np.sum(tuning_data['class_label'])/len(tuning_data)))

In [None]:
# Compile and fit
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# so the checkpoint callback can monitor 'val_accuracy'.
opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# Keep only the weights of the epoch with the highest validation accuracy.
# The checkpoint name "test3" is fixed, so every run overwrites the same files.
checkpoint_filepath = CHECKPOINT_DIR + "/test3"
model_checkpoint = ModelCheckpoint(
    filepath = checkpoint_filepath,
    monitor='val_accuracy',
    save_best_only = True,
    save_weights_only=True,
    mode = 'max'
)

# 10% of the training rows are held out for validation during fitting;
# the returned History object feeds the plotting helpers.
history = model.fit(x_train, y_train,
          epochs=4,
          verbose=1,
          batch_size=128,
          validation_split=0.1,
          callbacks=[model_checkpoint])

In [None]:
# Restore the weights written by the checkpoint callback (the epoch with the
# best val_accuracy) instead of keeping the weights from the final epoch
model.load_weights(checkpoint_filepath)

In [None]:
# Training vs. validation curves for the fit above
plot_accuracies(history)
plot_losses(history)

In [None]:
# Test on the tuning set
# Reports the compiled loss (binary cross-entropy) and accuracy on data the
# model did not see during fitting
model.evaluate(x_tuning, y_tuning, batch_size=64, verbose=1)

In [None]:
# Get predictions
# Threshold each sigmoid output at 0.5: below -> class 0, otherwise -> class 1
pred_prob = model.predict(x_tuning)
pred_class = np.array([0 if prob < 0.5 else 1 for prob in pred_prob])
print(pred_prob[0], pred_class[0])

In [None]:
# Confusion matrix of true tuning labels (reference) vs. thresholded
# predictions, indexed as matrix[reference_label, predicted_label].
# Class 1 (rating 5) is treated as the positive class.
matrix = ConfusionMatrix(list(y_tuning), list(pred_class))
TP = matrix[1, 1]
TN = matrix[0, 0]
FP = matrix[0, 1]
FN = matrix[1, 0]

def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  return numerator / denominator if denominator else 0.0

# A model that never predicts class 1 (TP + FP == 0), or a label set without
# class 1 (TP + FN == 0), would otherwise stop the cell with ZeroDivisionError;
# those undefined metrics are reported as 0.0 instead.
_precision = _safe_ratio(TP, TP + FP)
_recall = _safe_ratio(TP, TP + FN)
acc = round(_safe_ratio(TP + TN, TP + TN + FP + FN), 3)
prec = round(_precision, 3)
recall = round(_recall, 3)
# The F-measure is computed from the unrounded precision/recall so rounding
# happens only once, at the end.
f_score = round(_safe_ratio(2 * _precision * _recall, _precision + _recall), 3)
print("Accuracy: {}".format(acc))
print("Precision: {}".format(prec))
print("Recall: {}".format(recall))
print("f_measure: {}".format(f_score))
print("Confusion matrix:\n{!s}".format(matrix))

# Automatic Evaluation
Here, we use regression models to predict the overall rating of the conversations.