In [68]:
import similarity
import operator
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from statistics import stdev
import seaborn as sns
from nltk.stem import PorterStemmer

In [69]:
spring2018_filepath = '/Users/kylieleblancKylie/domino/docsim/data/spring_2018/coaching/'
spring2019_filepath = '/Users/kylieleblancKylie/domino/docsim/data/spring_2019/coaching/'
results_filepath = "/Users/kylieleblancKylie/domino/docsim/results/"

# Extract text from documents

In [70]:
spring2018_dict = similarity.make_text_dict(spring2018_filepath, '*docx')
spring2019_dict = similarity.make_text_dict(spring2019_filepath, '2019*')
def create_df(textdict, year, semester):
    df = pd.DataFrame.from_dict(data = textdict, orient = 'index').reset_index()
    df = df.rename({'index': 'doc', 0: 'text'}, axis = 'columns')
    df['year'] = year
    df['semester'] = semester
    return df
spring2018 = create_df(spring2018_dict, 2018, 'spring')
spring2019 = create_df(spring2019_dict, 2019, 'spring')

In [71]:
df = spring2018.append(spring2019)
df = df.reset_index(drop = True)
df.sample(10)

Unnamed: 0,doc,text,year,semester
81,2019_40_5C_Transcript.docx,"Coach: [0:00:05] Oh, do you have – no, okay, ...",2019,spring
78,2019_35_5C_Transcript.docx,"Coach: [00:00:00] First time, yes. [00:00:01...",2019,spring
52,11-2C.docx,[00:00:01] Coach: How do you feel?\n[00:00:04]...,2018,spring
47,115-2C.docx,"[00:00:00] Coach: And so, how do you feel abou...",2018,spring
55,37-2C.docx,"[00:00:00] Coach: So, how do you feel about th...",2018,spring
26,17-2C.docx,[00:00:00] Coach: How do you think that went? ...,2018,spring
70,2019_58_5C_Transcript.docx,Coach: [00:00:07] Alright. Nice job. Like s...,2019,spring
41,60-2C.docx,[00:00:00] Coach: How do you feel about that f...,2018,spring
96,2019_71_5C_Transcript.docx,Coach: [00:00:00] Okay so how are you feeling...,2019,spring
34,69-2C.docx,"[00:00:00] Coach: Um, so how did you feel abou...",2018,spring


In [72]:
len(df)

113

# Create document term matrix for full copus

In [73]:
corpus = list(df.text)
stemmer = PorterStemmer()
analyzer = CountVectorizer(token_pattern=r'\b[^\d\W]+\b').build_analyzer()
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

vectors = stem_vectorizer.fit_transform(corpus)
print("Words extracted from corpus: ")
for word in stem_vectorizer.get_feature_names()[0:200]:
    print(word, end = ", ")
count_vect_df = pd.DataFrame(vectors.todense(), columns=stem_vectorizer.get_feature_names())
word_freq = count_vect_df.values.tolist()
df['term_freq'] = word_freq
#df = df.reset_index()
df.sample(5)

Words extracted from corpus: 
a, abbi, abil, abl, about, abras, abruptli, absolut, absorb, academ, accept, acclim, accomplish, acknowledg, across, act, action, activ, actual, ad, add, addit, address, adjust, admir, advantag, advers, advic, advoc, affect, afraid, after, afterward, again, against, aggress, agit, agre, ah, ahead, aim, air, al, alarm, alex, alic, align, alittl, all, allow, allright, almost, along, alreadi, alright, alrighti, also, alter, altern, although, alway, am, amaz, ambigu, amic, amount, an, anatoli, and, anger, angl, angri, ani, announc, annoy, anoth, answer, anxieti, anxiou, anymor, anyon, anyth, anytim, anyway, appear, appli, applic, appreci, approach, appropri, approxim, are, area, aren, argu, argument, ariel, aris, arm, around, as, asid, ask, aspect, assert, associ, assum, assumpt, at, atmospher, attach, attack, attempt, attend, attent, attitud, audio, ava, avail, avatar, avoid, awar, away, awesom, awhil, awkward, b, ba, back, background, backpack, bad, bag, bag

Unnamed: 0,doc,text,year,semester,term_freq
53,107-2C.docx,[00:00:01] Coach: How are you feeling?\n[00:00...,2018,spring,"[8, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
50,57-2C.docx,"[00:00:00] Coach: Uh, how do you feel about th...",2018,spring,"[16, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,..."
94,2019_62_5C_Transcript.docx,Coach: [00:00:04] Yes. Hello. [00:00:05]\nCo...,2019,spring,"[9, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
37,46-2C.docx,"[00:00:00] Coach: So, how you feeling? How you...",2018,spring,"[19, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
112,2019_54_5C_Transcript.docx,"Coach: [00:00:10] So, you probably heard me c...",2019,spring,"[11, 0, 0, 0, 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,..."


In [74]:
df['within_study'] = np.nan
def within_study_mean(df, semester, year):
    for maindoc in df[(df.semester == semester) & (df.year == year)].index:
        pairwise_sim = []
        for doc in df[(df.semester == semester) & (df.year == year)].index:
            sim = 1 - spatial.distance.cosine(df.term_freq.loc[maindoc], df.term_freq.loc[doc])
            pairwise_sim.append(sim)
        average = (sum(pairwise_sim) - 1)/(len(pairwise_sim) - 1) # don't include relationship with self
        df.at[maindoc, 'within_study'] = average
    return df
df = within_study_mean(df, 'spring', 2018)
df = within_study_mean(df, 'spring', 2019)
df.sample(5)

Unnamed: 0,doc,text,year,semester,term_freq,within_study
49,78-2C.docx,"[00:00:03] Coach: There's, like, a lot of clic...",2018,spring,"[7, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0.799693
99,2019_22_5C_Transcript.docx,Coach: [00:00:00] Just go ahead and take a se...,2019,spring,"[9, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0.877744
78,2019_35_5C_Transcript.docx,"Coach: [00:00:00] First time, yes. [00:00:01...",2019,spring,"[8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.816146
112,2019_54_5C_Transcript.docx,"Coach: [00:00:10] So, you probably heard me c...",2019,spring,"[11, 0, 0, 0, 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,...",0.816194
46,105-2C.docx,[00:00:00] Coach: Alrighty. So how do you thin...,2018,spring,"[18, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.850183


In [75]:
df['corpus_distance'] = np.nan
def corpus_mean(df):
    for maindoc in df.index:
        pairwise_sim = []
        for doc in df.index:
            sim = 1 - spatial.distance.cosine(df.term_freq.loc[maindoc], df.term_freq.loc[doc])
            pairwise_sim.append(sim)
        average = (sum(pairwise_sim) - 1)/(len(pairwise_sim) - 1) # don't include relationship with self
        df.at[maindoc, 'corpus_distance'] = average
    return df
df = corpus_mean(df)
df.sample(5)

Unnamed: 0,doc,text,year,semester,term_freq,within_study,corpus_distance
99,2019_22_5C_Transcript.docx,Coach: [00:00:00] Just go ahead and take a se...,2019,spring,"[9, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0.877744,0.852012
101,2019_87_5C_Transcript.docx,Coach: [00:00:01] What do you think you did w...,2019,spring,"[13, 3, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.850719,0.813331
67,96-2C.docx,[00:00:01] Coach: Okay\n[00:00:07] Coach: I'm ...,2018,spring,"[10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.843533,0.834981
5,32-2C.docx,[00:00:00] Coach: And how are you feeling abou...,2018,spring,"[5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ...",0.794401,0.797608
63,72-2C.docx,[00:00:00] Coach: So how do you feel about tha...,2018,spring,"[13, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.833635,0.815239


In [76]:
df['replication'] = np.nan
def across_study_mean(df):
    for maindoc in df.index:
        pairwise_sim = []
        for doc in df[(df.year != int(df[df.index == 0].year))].index:
            sim = 1 - spatial.distance.cosine(df.term_freq.loc[maindoc], df.term_freq.loc[doc])
            pairwise_sim.append(sim)
        average = (sum(pairwise_sim))/(len(pairwise_sim))
        df.at[maindoc, 'replication'] = average
    return df
df = across_study_mean(df)
df.sample(10)

Unnamed: 0,doc,text,year,semester,term_freq,within_study,corpus_distance,replication
97,2019_61_5C_Transcript.docx,Coach: [00:00:00] Go ahead and tell me your th...,2019,spring,"[10, 0, 0, 0, 8, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",0.87276,0.848734,0.875588
87,2019_20_5C_Transcript.docx,"Coach: [00:00:04] All right, so come on over....",2019,spring,"[15, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...",0.827789,0.807323,0.831616
50,57-2C.docx,"[00:00:00] Coach: Uh, how do you feel about th...",2018,spring,"[16, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,...",0.815322,0.80619,0.792593
11,112-2C.docx,[00:00:00] Coach: You can come and have a seat...,2018,spring,"[10, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...",0.853348,0.842443,0.826207
17,89-2C.docx,"[00:00:00] Coach: All righty. So, first of all...",2018,spring,"[19, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...",0.823074,0.819649,0.81455
77,2019_38_5C_Transcript.docx,Coach: [00:00:11] How are you feeling about t...,2019,spring,"[5, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.84934,0.833972,0.852688
31,53-2C.docx,"[00:00:00] Coach: So, how do you feel about th...",2018,spring,"[11, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.844786,0.818968,0.780527
1,103-2C.docx,[00:00:03] Coach: All right. Why don't you hav...,2018,spring,"[5, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0.78421,0.782493,0.779936
68,2019_75_5C_Transcript.docx,"Coach: [00:00:02] Alright. So, how do you f...",2019,spring,"[8, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.821573,0.82323,0.825538
48,123-2C.docx,"[00:00:00] Coach: Um, so how do you feel about...",2018,spring,"[6, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.78761,0.777824,0.763254


In [77]:
# Results - Corpus

In [78]:
print(df.corpus_distance.mean())
print(df.corpus_distance.std())

0.824088391736315
0.020034033919407918


In [79]:
print(df.replication.mean())
print(df.replication.std())

0.8248845177470735
0.03139696142995132


In [80]:
from scipy import stats
stats.ttest_ind(df.within_study, df.replication)

Ttest_indResult(statistic=4.056699817956279, pvalue=6.874217018760697e-05)

In [81]:
len(df)

113

In [82]:
## Define 2 random distributions
#Sample Size
N = len(df)

a = df.within_study
b = df.replication


## Calculate the Standard Deviation
#Calculate the variance to get the standard deviation

#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)

#std deviation
s = np.sqrt((var_a + var_b)/2)
s



## Calculate the t-statistics
t = (a.mean() - b.mean())/(s*np.sqrt(2/N))



## Compare with the critical t-value
#Degrees of freedom
degrees = 2*N - 2

#p-value after comparison with the t 
p = 1 - stats.t.cdf(t,df=degrees)


print("t = " + str(t))
print("p = " + str(2*p))
### You can see that after comparing the t statistic with the critical t value (computed internally) we get a good p value of 0.0005 and thus we reject the null hypothesis and thus it proves that the mean of the two distributions are different and statistically significant.


t = 4.056699817956279
p = 6.874217018748574e-05


# Results - Replication where context varies

In [83]:
df.corpus_distance.mean()

0.824088391736315

In [84]:
df.corpus_distance.std()

0.020034033919407918

In [88]:
df.within_study.mean()

0.8395017828973319

In [96]:
print(df[df.year == 2019].within_study.mean())
print(len(df[df.year == 2019].within_study))

0.8459020736713982
45


In [97]:
print(df[df.year == 2018].within_study.mean())
print(len(df[df.year == 2018].within_study))

0.8352662963556701
68


In [98]:
df.replication.mean()

0.8248845177470735

In [89]:
df.replication.mean()

0.8248845177470735

In [90]:
df.within_study.std()

0.021939655521082575

In [91]:
df.replication.std()

0.03139696142995132

In [94]:
se = df.corpus_distance.std()/np.sqrt(len(df))
se

0.001884643378589186