In [3]:
import pandas as pd
from scipy import spatial
from openpyxl import load_workbook
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from library import start

In [4]:
# Path prefix taken from the project's `library.start` module. Later cells build
# file paths as `clean_filepath + <file name>`, so it presumably already ends with
# a path separator -- TODO confirm.
clean_filepath = start.clean_filepath

In [5]:
# Load the per-transcript table and key it by document name (the 'doc' column).
# Its `year` and `semester` columns are used further down to select sessions.
transcripts_csv = clean_filepath + 'text_transcripts.csv'
docs = pd.read_csv(transcripts_csv).set_index('doc')

In [6]:
# File-name suffixes, presumably one per available pre-processing variant
# (e.g. '_lsa_stop' matches 'matrix_transcripts_lsa_stop.csv') -- TODO confirm.
techniques = [
    '_stop',
    '_stop_wgt',
    '_stem',
    '_stem_stop',
    '_stem_stop_wgt',
    '_lsa',
    '_lsa_stop',
    '_lsa_wgt_stop',
]


## Set the pre-processing technique of interest and import its matrices

In [234]:
# Pre-processing variant to analyse. The three input file names below are all
# derived from this single suffix, so they cannot drift out of sync when the
# variant is changed (previously 'lsa_stop' was typed out three times).
technique_of_interest = '_lsa_stop'

# Document-by-component matrices for the transcripts and for the scripts.
matrix_transcripts_file = 'matrix_transcripts' + technique_of_interest + '.csv'
matrix_scripts_file = 'matrix_scripts' + technique_of_interest + '.csv'

# The topics file uses the variant name as a prefix, without the leading underscore.
# NOTE(review): a topics file presumably only exists for the LSA variants --
# confirm before switching to a non-LSA technique.
topics_file = technique_of_interest.lstrip('_') + '_topics.csv'

In [235]:
# Read the transcript matrix (one row per document, one column per component)
# and index it by document name; then preview the first rows.
transcripts_matrix_path = clean_filepath + matrix_transcripts_file
matrix_transcripts = pd.read_csv(transcripts_matrix_path).set_index('doc')
matrix_transcripts.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68_c_Transcript.docx,0.628542,-0.350509,0.083802,-0.318494,-0.16303,0.038318,0.082333,0.252457,-0.029988,-0.131897,...,-0.032332,-0.01134,0.055192,0.008693,0.010974,-0.003002,-0.04392,-0.032214,-0.071731,-0.016648
90_c_Transcript.docx,0.669009,-0.33423,0.116053,-0.23379,-0.041996,0.033117,-0.067874,-0.046766,-0.099285,0.062094,...,0.052852,0.073113,0.050349,0.040452,0.008791,-0.011361,0.013806,-0.003666,-0.038813,-0.032981
122_c_Transcript.docx,0.593927,-0.373292,0.286311,-0.126166,-0.30224,-0.019503,0.004667,0.153041,0.008615,-0.055725,...,0.082353,0.011091,0.015251,-0.103821,-0.021088,0.014867,0.078008,0.041382,0.042136,-0.027652
12_c_Transcript.docx,0.590151,-0.293959,0.070318,-0.073973,-0.173875,-0.014849,-0.022558,0.085463,-0.094699,-0.032562,...,0.068643,0.056804,-0.074281,0.049042,-0.078821,0.042459,0.006594,0.012062,0.019293,0.046948
113_c_Transcript.docx,0.642546,-0.359431,0.222182,-0.219779,0.074334,0.168001,-0.101566,0.023903,-0.103592,-0.032868,...,0.02227,0.051272,0.012543,-0.069186,0.028054,0.045708,0.01835,-0.052213,-0.002908,-0.016798


In [236]:
# Read the script matrix (one row per document, one column per component)
# and index it by document name; then preview the first rows.
scripts_matrix_path = clean_filepath + matrix_scripts_file
matrix_scripts = pd.read_csv(scripts_matrix_path).set_index('doc')
matrix_scripts.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Feedback Model C22.docx,0.512281,-0.290224,-0.152412,0.243548,-0.238556,-0.038534,-0.101865,0.01088,0.055679,-0.040573,...,-0.019766,-0.009136,0.024823,-0.017092,-0.031201,0.007197,0.006651,-0.006528,0.019026,0.010656
Feedback Model D21.docx,0.436118,-0.495929,-0.248477,0.271789,-0.157235,-0.030228,0.077344,0.18067,0.042368,-0.057505,...,-0.026362,-0.005423,-0.028882,0.009753,0.034002,0.023909,0.013158,-0.011543,0.007245,-0.003051
Feedback Model A22.docx,0.493254,-0.476017,-0.344168,0.300932,-0.034799,-0.010202,0.090161,0.158146,0.075198,-0.038615,...,0.032167,0.03695,0.060578,-0.018571,-0.050843,-0.002922,-0.010184,-0.000999,0.035424,0.020685
Feedback Model B11.docx,0.444534,-0.52321,-0.396202,0.357982,0.045669,-0.047634,0.15557,0.131209,0.035382,-0.048386,...,-0.019427,0.006651,0.000179,0.001871,-0.005615,-0.023467,0.023844,-0.000942,-0.00677,0.033466
Feedback Model C12.docx,0.520611,-0.280116,-0.155644,0.244462,-0.236097,-0.036916,-0.111999,0.022172,0.052813,-0.048076,...,-0.020142,-0.006981,0.024795,-0.01175,-0.032257,0.007124,0.007268,-0.008886,0.018046,0.014084


# What to include in comparison dataframe

# Scripts

In [237]:
# Centroid (column-wise mean vector) of every script whose name contains 'Management'.
is_management = matrix_scripts.index.str.contains('Management')
docs_of_interest = list(matrix_scripts.loc[is_management].index)
subset = matrix_scripts.loc[matrix_scripts.index.isin(docs_of_interest)]
centroid = subset.mean()

# Rank the columns once, highest mean first, and keep the top 25.
top_components = centroid.sort_values(ascending=False).iloc[:25]
# NOTE(review): despite its name, `main_words` holds column labels of the matrix
# (component ids), not vocabulary words.
main_words = list(top_components.index)
top_components

0     0.534107
3     0.281866
1     0.142961
5     0.085712
15    0.074387
9     0.050855
16    0.046443
23    0.021229
13    0.018147
37    0.017594
51    0.016298
14    0.012467
18    0.010925
22    0.009963
78    0.009081
77    0.008419
11    0.007875
83    0.007649
21    0.007636
44    0.007581
95    0.007415
48    0.007145
62    0.006733
50    0.006304
53    0.005746
dtype: float64

# Session Similarity

In [238]:
# Centroid of the transcripts from the spring semester of 2017-18.
in_session = (docs['year'] == '2017-18') & (docs['semester'] == 'spring')
docs_of_interest = list(docs.loc[in_session].index)
subset = matrix_transcripts.loc[matrix_transcripts.index.isin(docs_of_interest)]
centroid = subset.mean()

# Show the 25 columns with the highest mean value.
centroid.sort_values(ascending=False).iloc[:25]

0     0.694130
1     0.186201
2     0.154583
5     0.118259
3     0.077101
7     0.034407
9     0.033888
4     0.032798
10    0.030913
38    0.014756
6     0.012886
15    0.010850
25    0.009252
58    0.008538
21    0.007919
35    0.007161
29    0.007092
30    0.006550
47    0.006295
64    0.006045
90    0.004889
41    0.004868
65    0.004674
31    0.004662
39    0.003903
dtype: float64

In [239]:
# Centroid of the transcripts from the spring semester of 2018-19.
in_session = (docs['year'] == '2018-19') & (docs['semester'] == 'spring')
docs_of_interest = list(docs.loc[in_session].index)
subset = matrix_transcripts.loc[matrix_transcripts.index.isin(docs_of_interest)]
centroid = subset.mean()

# Show the 25 columns with the highest mean value.
centroid.sort_values(ascending=False).iloc[:25]

0     0.694434
1     0.215121
8     0.072670
13    0.038896
20    0.031524
17    0.029531
32    0.027449
10    0.022609
28    0.022545
26    0.019460
37    0.015097
38    0.013114
43    0.012226
16    0.011560
36    0.010097
42    0.008914
65    0.008470
39    0.008346
29    0.007680
82    0.007296
67    0.006527
24    0.006452
85    0.006238
60    0.006234
59    0.005811
dtype: float64

In [240]:
# Centroid of the transcripts from the fall semester of 2019-20.
in_session = (docs['year'] == '2019-20') & (docs['semester'] == 'fall')
docs_of_interest = list(docs.loc[in_session].index)
subset = matrix_transcripts.loc[matrix_transcripts.index.isin(docs_of_interest)]
centroid = subset.mean()

# Show the 25 columns with the highest mean value.
centroid.sort_values(ascending=False).iloc[:25]

0     0.682255
1     0.168602
6     0.070664
15    0.047864
14    0.042876
11    0.021274
21    0.019872
24    0.018566
34    0.017365
30    0.015586
12    0.013086
33    0.012814
31    0.011715
17    0.011547
23    0.010193
62    0.010120
71    0.007899
22    0.007553
66    0.007256
46    0.006153
74    0.006113
52    0.005912
70    0.005653
53    0.005408
27    0.004943
dtype: float64

# LSA Topics

In [241]:
# Load the topics table, using the file's first column as the row index.
topics_path = clean_filepath + topics_file
topics = pd.read_csv(topics_path, index_col=0)

In [242]:
# Rows of the topics table with the 25 largest values in column '0'.
topics.nlargest(n=25, columns='0')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
think,0.290303,-0.288726,0.307304,-0.260755,-0.298365,0.170081,-0.018481,0.418044,-0.081922,-0.270389,...,0.006533,-0.009785,-0.005955,0.009738,-0.011316,-0.010695,-0.004343,0.004058,-0.031055,0.008738
want,0.269294,0.14514,-0.423291,-0.246695,-0.224892,0.042348,0.224921,-0.18602,-0.04424,0.003403,...,0.012079,0.013866,0.002262,-0.003907,0.030467,0.012039,0.01009,-0.073639,-0.031721,0.001122
right,0.26816,0.182677,0.010543,-0.072876,0.470688,-0.258122,-0.025698,0.370433,0.234561,0.259105,...,-0.0331,0.000781,-0.004865,0.024009,-0.006461,0.021118,0.010036,0.010811,-0.005159,-0.025566
know,0.244968,0.110254,0.405732,0.23498,0.026845,-0.036548,0.332665,-0.376392,0.069307,-0.213931,...,-0.00657,0.017775,0.014442,0.00582,-0.007384,-0.007785,0.012328,-0.007707,-0.034674,-0.009993
really,0.234931,-0.087126,0.064694,-0.031339,0.162888,0.45722,0.059545,-0.208647,0.064031,0.32533,...,0.008947,-0.011647,0.050035,0.011772,-0.024591,0.003857,0.020148,-0.044815,0.03812,0.031309
say,0.176294,0.103234,-0.038536,-0.037306,0.020705,-0.216633,-0.053297,0.050031,-0.199346,-0.021919,...,0.025072,-0.054338,0.01273,0.010771,0.026288,0.00753,0.046693,0.012943,0.04758,0.019807
ethan,0.165417,0.169069,-0.033515,0.042657,0.079517,-0.012874,-0.210166,0.192683,-0.067492,-0.147154,...,-0.032653,-0.008822,0.002169,-0.014436,-0.00475,0.018297,-0.053273,0.002901,-0.005846,0.022502
specific,0.155369,0.234052,0.072805,0.209065,-0.027028,0.141003,0.070653,0.107701,-0.169765,0.032382,...,0.039726,-0.026298,0.002765,0.04609,-0.042654,-0.036247,-0.066905,0.053194,0.000442,-0.037511
one,0.151805,0.000934,0.054222,0.115979,-0.066228,0.032573,0.041871,-0.011178,0.014196,0.049639,...,0.009339,0.010015,0.000934,0.039429,0.004543,-0.049234,0.088548,0.053501,0.006756,-0.027164
could,0.143928,-0.028645,0.065376,0.1105,-0.045838,-0.199751,-0.200667,-0.070368,-0.021276,-0.018876,...,-0.001292,0.062015,-0.025956,0.024775,-0.012652,0.007508,0.003386,0.01317,-0.059775,0.001325
