# Clean Text
* Import Transcripts
* Extract Coach Text
* Remove Time Labels
* Replace avatar names with keywords?
* Create transcript dataframe

In [1]:
import docx
import os
import nltk
import re
from nltk import sent_tokenize, word_tokenize
import pandas as pd

from nltk import WordNetLemmatizer 
import operator

import fnmatch

import statistics
lemmatizer = WordNetLemmatizer() 

from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from sklearn import metrics

  from collections import Iterable


In [2]:
# Absolute folders holding the raw coaching transcripts, one per semester,
# listed in chronological order. Every path ends with a trailing slash.
fall2017_filepath = "/Users/kylieleblancKylie/domino/docsim/data/fall_2017/coaching/"
spring2018_filepath = "/Users/kylieleblancKylie/domino/docsim/data/spring_2018/coaching/"
fall2018_filepath = "/Users/kylieleblancKylie/domino/docsim/data/fall_2018/coaching/"
spring2019_filepath = "/Users/kylieleblancKylie/domino/docsim/data/spring_2019/coaching/"
fall2019_filepath = "/Users/kylieleblancKylie/domino/docsim/data/fall_2019_TAP/coaching/"

# Folder that receives the cleaned output.
clean_filepath = "/Users/kylieleblancKylie/domino/docsim/data/clean/"

# Extract Text from Documents

In [3]:
def make_text_dict(filepath, pattern):
    """Collect the coach text of every matching transcript in a folder.

    Args:
        filepath: Directory that holds the transcript files (with or without
            a trailing path separator).
        pattern: Shell-style wildcard (``fnmatch`` syntax) that a file name
            must match to be included.

    Returns:
        dict mapping each matching file name to the string returned by
        ``getCoachText`` for that file, in sorted file-name order.
    """
    # List the folder directly rather than os.chdir()-ing into it, so the
    # process-wide working directory is left untouched for the caller.
    # os.listdir() returns names in arbitrary order; sorting makes the
    # resulting dict (and anything built from it) reproducible.
    matching = sorted(fnmatch.filter(os.listdir(filepath), pattern))
    # os.path.join works whether or not `filepath` ends with a separator.
    return {name: getCoachText(os.path.join(filepath, name)) for name in matching}

def getCoachText(filename):
    """Return the coach's side of a .docx transcript as one string.

    Paragraphs of five characters or fewer are ignored. For the rest, the
    alternative coach labels are first normalised to ``'Coach:'``; a
    paragraph is kept only when ``'Coach:'`` appears at the very start or at
    character offset 11 (presumably right after a leading time label such as
    ``'[00:00:11] '`` -- confirm against the transcripts). Kept paragraphs
    have their labels dropped and typos fixed, and are joined with single
    spaces.

    Args:
        filename: Path of the .docx file to read.

    Returns:
        str with the cleaned coach paragraphs, space-separated.
    """
    coach_turns = []
    for paragraph in docx.Document(filename).paragraphs:
        if len(paragraph.text) <= 5:
            continue
        line = replace_alt_words_for_coach(paragraph.text)
        is_coach_turn = line.startswith('Coach:') or line.startswith('Coach:', 11)
        if not is_coach_turn:
            continue
        coach_turns.append(fix_typos(drop_labels(line)))
    return ' '.join(coach_turns)

def replace_alt_words_for_coach(string):
    """Rewrite every known alternative coach label to ``'Coach:'``.

    Matching is exact and case-sensitive; the aliases are applied one after
    another in the order listed below.

    Args:
        string: Text of one transcript paragraph.

    Returns:
        str with each listed alias replaced by ``'Coach:'``.
    """
    coach_aliases = (
        'Instructor:',
        'Tutor:',
        'Interviewer:',
        'COACH:',
        'Announcer:',
        'Female Speaker:',
        'Male Speaker:',
        'Arielle:',
        'Mike Grille:',
        'Rosalie:',
        'Moderator:',
        'Anna Myers:',
    )
    relabelled = string
    for alias in coach_aliases:
        relabelled = relabelled.replace(alias, 'Coach:')
    return relabelled

def drop_labels(string):
    """Strip escape residue, bracketed labels and speaker labels from text.

    Three removals are applied in order:
      1. the literal two-character sequence backslash + ``n`` (not a real
         newline character);
      2. anything enclosed in square brackets, brackets included;
      3. any run of ASCII letters immediately followed by a colon.

    Args:
        string: Text of one transcript paragraph.

    Returns:
        str with the above pieces removed; surrounding whitespace is kept.
    """
    bracketed = re.compile(r'\[(.*?)\]')
    word_colon = re.compile(r'([a-zA-Z]+)\:')
    stripped = string.replace('\\n', '')
    stripped = bracketed.sub('', stripped)
    return word_colon.sub('', stripped)

def fix_typos(string):
    """Apply the hand-picked transcript corrections.

    Removes the literal fragment ``'00:00:11]'`` (presumably a time label
    that lost its opening bracket -- confirm against the transcripts) and
    replaces every occurrence of ``'Dave'`` with ``'Dev'``.

    Args:
        string: Text of one transcript paragraph.

    Returns:
        str with the corrections applied.
    """
    corrections = (
        ('00:00:11]', ''),
        ('Dave', 'Dev'),
    )
    corrected = string
    for wrong, right in corrections:
        corrected = corrected.replace(wrong, right)
    return corrected

In [4]:
# Build one {file name: coach text} dict per semester folder. Each wildcard
# pattern selects the transcript files inside that folder.
fall2017_dict = make_text_dict(
    filepath=fall2017_filepath, pattern='*Transcript*'
)
spring2018_dict = make_text_dict(
    filepath=spring2018_filepath, pattern='*docx'
)
fall2018_dict = make_text_dict(
    filepath=fall2018_filepath, pattern='2018*docx'
)
spring2019_dict = make_text_dict(
    filepath=spring2019_filepath, pattern='2019*'
)
fall2019_dict = make_text_dict(
    filepath=fall2019_filepath, pattern='*Transcript.docx'
)

In [5]:
# Spot-check: display the extracted coach text of one fall 2019 transcript.
fall2019_dict['01_1920_05_031_22c_Transcript.docx']

'   I have to set a little timer for us here.  Okay, we’re going to work on some strategies for the simulation for the next five minutes.  This is Mike   coaching session for William Sweet.  So, let’s start.  What did you think?  How do you feel about it?      Right.  What was challenging about it?     Yeah, that’s tough. .    So, well, what you did a good job of was that you are able to like you identified the behaviors quickly and you were talking to the students about what they should have been doing, right?  So as soon as their misbehavior began you would address it you would talk to Dev or Ethan, and you moved to redirect them, which is the important first step.  But this time what I want us to focus on is making your redirections a little bit more specific, right?  So, let me give you an example of what I mean by that.  So, for example, when you were talking to Ethan about beatboxing, right?  And this is just for that simulation, so Ethan, that’s not very respectful, we want to s

In [6]:
# Spot-check: display the extracted coach text of one fall 2018 transcript.
fall2018_dict['2018_112_3C_Transcript.docx']

'   But I’d love to hear from you how felt the simulation went.     Let’s put aside the peer interaction right now, and focus on what you mentioned before about responding to their answers and specifically what I want you to focus on--first thing I want to say actually is that you did really well and you especially did well making sure that they were having to answer with text evidence in mind and that you give praise when a student did use some evidence. So that’s awesome. That’s like the first thing that we want to make sure that we can do in responding to student answers.  So, keep doing that.     And what I want you to work on is actually using that ability where you’re already pushing students to get text evident to help make sure that students who maybe don’t have a very strong answers, give stronger answers.      Specifically in this case there were two instances where the student answers were not really correct, and so at the end they kind of walked away those student might not

## Create dataframe

In [7]:
def create_df(textdict, year, semester, scenario):
    """Turn a {file name: text} dict into a labelled DataFrame.

    Args:
        textdict: dict mapping a document name to its text.
        year: Value stored in the ``year`` column of every row.
        semester: Value stored in the ``semester`` column of every row.
        scenario: Value stored in the ``scenario`` column of every row.

    Returns:
        pandas.DataFrame indexed by ``doc`` (the dict keys) with the columns
        ``text``, ``year``, ``semester`` and ``scenario``.
    """
    frame = (
        pd.DataFrame.from_dict(data=textdict, orient='index')
        .reset_index()
        .rename(columns={'index': 'doc', 0: 'text'})
    )
    # The same constant labels are attached to every row of this frame.
    constant_columns = (
        ('year', year),
        ('semester', semester),
        ('scenario', scenario),
    )
    for column, value in constant_columns:
        frame[column] = value
    return frame.set_index('doc')
# One labelled frame per semester: academic year, term, and the simulation
# scenario label assigned to that semester.
fall2017 = create_df(
    textdict=fall2017_dict, year='2017-18', semester='fall', scenario='feedback'
)
spring2018 = create_df(
    textdict=spring2018_dict, year='2017-18', semester='spring', scenario='behavior'
)
fall2018 = create_df(
    textdict=fall2018_dict, year='2018-19', semester='fall', scenario='feedback'
)
spring2019 = create_df(
    textdict=spring2019_dict, year='2018-19', semester='spring', scenario='behavior'
)
fall2019 = create_df(
    textdict=fall2019_dict, year='2019-20', semester='fall', scenario='behavior'
)

In [8]:
# Spot-check: display one randomly chosen row of the spring 2019 frame.
spring2019.sample()

Unnamed: 0_level_0,text,year,semester,scenario
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019_114_5C_Transcript.docx,"How do you feel about, are you feeling abou...",2018-19,spring,behavior


In [9]:
# Stack the five per-semester frames into a single corpus, in chronological
# order. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and, like append, keeps each frame's
# existing index labels (ignore_index defaults to False).
corpus_df = pd.concat([fall2017, spring2018, fall2018, spring2019, fall2019])

# Spot-check: display ten randomly chosen rows of the combined corpus.
corpus_df.sample(10)

Unnamed: 0_level_0,text,year,semester,scenario
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
122_c_Transcript.docx,"Yeah, and don’t be afraid, seriously, if lik...",2017-18,fall,feedback
56_c_Transcript.docx,So what are some things that you think went...,2017-18,fall,feedback
2018_109_3C_Transcript.docx,How do you think that went? Went well for y...,2018-19,fall,feedback
42_c_Transcript.docx,"All right. As it relates to feedback, what...",2017-18,fall,feedback
52-2C.docx,"All right. So Danielle, how are you feeling ...",2017-18,spring,behavior
2019_40_5C_Transcript.docx,"They’re going to turn again to you. Oh,...",2018-19,spring,behavior
111_c_Transcript.docx,What do you think went well as it relates t...,2017-18,fall,feedback
2018_75_3C_Transcript.docx,"All right. So, how do you feel about it? ...",2018-19,fall,feedback
2018_3_3C_Transcript.docx,All right. How did you think the first sim...,2018-19,fall,feedback
122-2C.docx,"First of all, how do you feel? How do you th...",2017-18,spring,behavior


# Send to CSV

In [10]:
# Write the combined corpus (index included) into the clean-data folder.
corpus_csv_path = clean_filepath + 'text_transcripts.csv'
corpus_df.to_csv(corpus_csv_path)