In [27]:
#Import dataset
import pandas as pd
import nltk
import re
# Tab-separated event log to analyse.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
data_path = r"C:\Users\NCiaccia\Documents\Data Science Academy\Week 5\Synthetic Dataset\PSTRE_syntheticData.csv"
df = pd.read_csv(data_path, sep='\t')
df.head()

Unnamed: 0,TestTakerID,Timestamp,Coding
0,1001042,0,START
1,1001042,44985,SS_Se
2,1001042,55852,SS_Se_OK
3,1001042,55950,SS_Type_2
4,1001042,66546,E


**Create two new variables: action sequence variable and time interval sequence for each participant**

In [33]:
# Build one space-separated action string per participant by joining that
# participant's 'Coding' values in row order, then name the column action_sequence.
df2 = (
    df.groupby('TestTakerID')['Coding']
    .apply(' '.join)
    .reset_index()
    .rename(columns={"Coding": "action_sequence"})
)
df2.head()

Unnamed: 0,TestTakerID,action_sequence
0,1001042,START SS_Se SS_Se_OK SS_Type_2 E SS E Next Nex...
1,1001079,START SS_So SS_So_1B SS_So_OK SS E SS E SS E S...
2,1001103,START SS_Se SS_Se_OK SS_Type_2 E SS E SS E Nex...
3,1001112,START E SS E Next Next_OK END
4,1002087,START SS_Se_OK SS_Type_200 SS_So_1B SS_So_OK E...


In [34]:
# Time interval sequence: difference between each event's Timestamp and the
# previous event's Timestamp for the same participant.
# The first event of every participant has no predecessor, so its interval is NaN.
timestamps_by_taker = df.groupby('TestTakerID')['Timestamp']
df['time_interval'] = timestamps_by_taker.diff()
df.head(20)

Unnamed: 0,TestTakerID,Timestamp,Coding,time_interval
0,1001042,0,START,
1,1001042,44985,SS_Se,44985.0
2,1001042,55852,SS_Se_OK,10867.0
3,1001042,55950,SS_Type_2,98.0
4,1001042,66546,E,10596.0
5,1001042,90189,SS,23643.0
6,1001042,96096,E,5907.0
7,1001042,101813,Next,5717.0
8,1001042,105379,Next_OK,3566.0
9,1001042,105382,END,3.0


**Extract time for the first action for each participant**

In [35]:
# Number the events of each participant in row order, starting at 0
events_by_taker = df.groupby('TestTakerID')
df['event_num'] = events_by_taker.cumcount()
df.head()

Unnamed: 0,TestTakerID,Timestamp,Coding,time_interval,event_num
0,1001042,0,START,,0
1,1001042,44985,SS_Se,44985.0,1
2,1001042,55852,SS_Se_OK,10867.0,2
3,1001042,55950,SS_Type_2,98.0,3
4,1001042,66546,E,10596.0,4


In [36]:
# Keep only the first action of each participant. Numbering starts at 0, and in the
# rows displayed above event 0 is the START marker, so the first action is event 1.
is_first_action = df['event_num'].eq(1)
first_event = df[is_first_action]
first_event.head()

Unnamed: 0,TestTakerID,Timestamp,Coding,time_interval,event_num
1,1001042,44985,SS_Se,44985.0,1
11,1001079,21234,SS_So,21234.0,1
27,1001103,39002,SS_Se,39002.0,1
39,1001112,54895,E,54895.0,1
46,1002087,158089,SS_Se_OK,158089.0,1


In [37]:
# Extract time_interval: drop the columns that are not needed alongside it
unwanted_columns = ['Timestamp', 'Coding', 'event_num']
first_event_time = first_event.drop(unwanted_columns, axis='columns')
first_event_time.head()

Unnamed: 0,TestTakerID,time_interval
1,1001042,44985.0
11,1001079,21234.0
27,1001103,39002.0
39,1001112,54895.0
46,1002087,158089.0


**Represent each action sequence by n-grams (n=2)**

In [38]:
# Collect every participant's action string into one plain Python list
action_list = df2['action_sequence'].tolist()
action_list[:5]

['START SS_Se SS_Se_OK SS_Type_2 E SS E Next Next_OK END',
 'START SS_So SS_So_1B SS_So_OK SS E SS E SS E SS E E Next Next_OK END',
 'START SS_Se SS_Se_OK SS_Type_2 E SS E SS E Next Next_OK END',
 'START E SS E Next Next_OK END',
 'START SS_Se_OK SS_Type_200 SS_So_1B SS_So_OK E SS E Next Next_OK END']

In [39]:
# Split each action string into its individual actions, so the result is a list of lists.
# The split is done directly on whitespace rather than through a placeholder
# delimiter, so an action code may contain any non-whitespace characters.
# The raw string keeps \s a regex escape instead of an invalid string escape,
# which newer Python versions warn about.
# As with a plain per-character split, every single whitespace character is a cut point.
action_list2 = [re.split(r"\s", x) for x in action_list]

action_list2[:5]

[['START',
  'SS_Se',
  'SS_Se_OK',
  'SS_Type_2',
  'E',
  'SS',
  'E',
  'Next',
  'Next_OK',
  'END'],
 ['START',
  'SS_So',
  'SS_So_1B',
  'SS_So_OK',
  'SS',
  'E',
  'SS',
  'E',
  'SS',
  'E',
  'SS',
  'E',
  'E',
  'Next',
  'Next_OK',
  'END'],
 ['START',
  'SS_Se',
  'SS_Se_OK',
  'SS_Type_2',
  'E',
  'SS',
  'E',
  'SS',
  'E',
  'Next',
  'Next_OK',
  'END'],
 ['START', 'E', 'SS', 'E', 'Next', 'Next_OK', 'END'],
 ['START',
  'SS_Se_OK',
  'SS_Type_200',
  'SS_So_1B',
  'SS_So_OK',
  'E',
  'SS',
  'E',
  'Next',
  'Next_OK',
  'END']]

In [40]:
# Create bigrams: for each participant, a pandas Series holding the 2-grams
# that nltk.ngrams produces from that participant's action list.
bigram_list = [pd.Series(nltk.ngrams(actions, 2)) for actions in action_list2]
bigram_list[:5]

[0           (START, SS_Se)
 1        (SS_Se, SS_Se_OK)
 2    (SS_Se_OK, SS_Type_2)
 3           (SS_Type_2, E)
 4                  (E, SS)
 5                  (SS, E)
 6                (E, Next)
 7          (Next, Next_OK)
 8           (Next_OK, END)
 dtype: object,
 0           (START, SS_So)
 1        (SS_So, SS_So_1B)
 2     (SS_So_1B, SS_So_OK)
 3           (SS_So_OK, SS)
 4                  (SS, E)
 5                  (E, SS)
 6                  (SS, E)
 7                  (E, SS)
 8                  (SS, E)
 9                  (E, SS)
 10                 (SS, E)
 11                  (E, E)
 12               (E, Next)
 13         (Next, Next_OK)
 14          (Next_OK, END)
 dtype: object,
 0            (START, SS_Se)
 1         (SS_Se, SS_Se_OK)
 2     (SS_Se_OK, SS_Type_2)
 3            (SS_Type_2, E)
 4                   (E, SS)
 5                   (SS, E)
 6                   (E, SS)
 7                   (SS, E)
 8                 (E, Next)
 9           (Next, Next_OK)
 10   

In [None]:
# Parse out all of the bigrams: pool every participant's bigrams into one flat list
bigram_flattened = []
for users_events in bigram_list:
    bigram_flattened.extend(users_events)
bigram_flattened

**Extra: Do something with the bigrams** \
*Answering the question: What are the 10 most common bigrams from this dataset and their frequencies?*

In [56]:
# Count how often each bigram occurs, then order the (bigram, count) pairs
# from most to least frequent.
frequency = {}
for element in bigram_flattened:
    frequency[element] = frequency.get(element, 0) + 1

sorted_freqs = sorted(frequency.items(), key=lambda pair: pair[1], reverse=True)
sorted_freqs

[(('Next', 'Next_OK'), 1076),
 (('Next_OK', 'END'), 1076),
 (('SS', 'E'), 784),
 (('E', 'SS'), 769),
 (('E_S', 'Next'), 575),
 (('E', 'E_S'), 558),
 (('START', 'E'), 553),
 (('E', 'Next'), 520),
 (('Next', 'Next_C'), 326),
 (('E', 'E'), 316),
 (('SS_Se_OK', 'SS_Type_0'), 225),
 (('SS_So_1B', 'SS_So_OK'), 192),
 (('SS_Se_OK', 'SS_Type_2'), 190),
 (('SS_So_OK', 'E'), 177),
 (('SS_Se', 'SS_Se_OK'), 171),
 (('Next_C', 'Next'), 166),
 (('SS_Type_0', 'SS_Se_OK'), 163),
 (('SS_So', 'SS_So_1B'), 126),
 (('START', 'SS_Se'), 124),
 (('SS_Type_2', 'E'), 117),
 (('START', 'Next'), 101),
 (('SS', 'SS'), 94),
 (('START', 'SS'), 89),
 (('START', 'SS_So'), 78),
 (('SS_Type_2', 'SS_Se_OK'), 57),
 (('Next_C', 'E_S'), 56),
 (('Next_C', 'E'), 51),
 (('Next_C', 'SS'), 50),
 (('E_S', 'E_S'), 49),
 (('SS_So_OK', 'SS_So'), 49),
 (('START', 'SS_So_1B'), 48),
 (('SS', 'SS_So'), 40),
 (('SS_Save', 'E'), 37),
 (('SS_Se_OK', 'SS_Type_1'), 37),
 (('START', 'SS_Se_OK'), 36),
 (('SS', 'SS_Se'), 35),
 (('SS_So', 'SS_S

In [57]:
# Keep the 10 most frequent bigrams (sorted_freqs is already ordered by descending count)
most_popular = sorted_freqs[0:10]
most_popular

[(('Next', 'Next_OK'), 1076),
 (('Next_OK', 'END'), 1076),
 (('SS', 'E'), 784),
 (('E', 'SS'), 769),
 (('E_S', 'Next'), 575),
 (('E', 'E_S'), 558),
 (('START', 'E'), 553),
 (('E', 'Next'), 520),
 (('Next', 'Next_C'), 326),
 (('E', 'E'), 316)]

In [None]:
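#Alternative solution from Oren: build the bigrams over the whole 'Coding' column in one pass,
#then drop ('END', 'START'), because that pair only appears where one participant's rows end and the next participant's rows begin.
#[x for x in nltk.bigrams(df['Coding']) if x != ('END', 'START')]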

In [55]:
#Alternative solution from Oren that keeps the ('END', 'START') bigrams
#list(nltk.bigrams(df['Coding']))