# Stepic ML Contest

### Churn from 'Statistics in R' prediction

In [1382]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1383]:
events_df = pd.read_csv("event_data_train.csv")
events_df.head()

Unnamed: 0,step_id,timestamp,action,user_id
0,32815,1434340848,viewed,17632
1,32815,1434340848,passed,17632
2,32815,1434340848,discovered,17632
3,32811,1434340895,discovered,17632
4,32811,1434340895,viewed,17632


In [1384]:
subm_df = pd.read_csv("submissions_data_train.csv")
subm_df.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id
0,31971,1434349275,correct,15853
1,31972,1434348300,correct,15853
2,31972,1478852149,wrong,15853
3,31972,1478852164,correct,15853
4,31976,1434348123,wrong,15853


In [1385]:
events_df.action.unique()

array(['viewed', 'passed', 'discovered', 'started_attempt'], dtype=object)

In [1386]:
events_df['date'] = pd.to_datetime(events_df.timestamp, unit='s')
subm_df['date'] = pd.to_datetime(subm_df.timestamp, unit='s')

In [1387]:
events_df.date.agg(['min', 'max'])

min   2015-06-15 04:00:48
max   2018-05-19 23:33:31
Name: date, dtype: datetime64[ns]

In [1388]:
events_df.shape

(3480703, 5)

In [1389]:
subm_df.shape

(509104, 5)

In [1390]:
events_df['day'] = events_df.date.dt.date
subm_df['day'] = subm_df.date.dt.date

In [1391]:
events_df.head()

Unnamed: 0,step_id,timestamp,action,user_id,date,day
0,32815,1434340848,viewed,17632,2015-06-15 04:00:48,2015-06-15
1,32815,1434340848,passed,17632,2015-06-15 04:00:48,2015-06-15
2,32815,1434340848,discovered,17632,2015-06-15 04:00:48,2015-06-15
3,32811,1434340895,discovered,17632,2015-06-15 04:01:35,2015-06-15
4,32811,1434340895,viewed,17632,2015-06-15 04:01:35,2015-06-15


In [1392]:
# Number of users during each day
events_df.groupby('day').user_id.nunique().head()

day
2015-06-15    705
2015-06-16    526
2015-06-17    440
2015-06-18    411
2015-06-19    377
Name: user_id, dtype: int64

In [1393]:
sns.set(rc={'figure.figsize':(9,6)})
#events_df.groupby('day').user_id.nunique().plot();

In [1394]:
# Number of passed steps per user:
""" 1)Choose only rows with action 'passed'
    2)Group them by user id, get rid from multiindex with as_index=False
    3)Apply 'count' function to step_id column
    4)Rename 'step_id' column to 'passed_steps' and show first 5 entries"""

events_df[events_df.action == 'passed'] \
    .groupby('user_id', as_index=False) \
    .agg({'step_id':'count'}) \
    .rename(columns={'step_id':'passed_steps'}).head()

Unnamed: 0,user_id,passed_steps
0,2,9
1,3,87
2,5,11
3,7,1
4,8,84


In [1395]:
#BAD SOLUTION. Some data lost - we don't count those, who didn't pass any steps
# events_df[events_df.action == 'passed'] \
#     .groupby('user_id', as_index=False) \
#     .agg({'step_id':'count'}) \
#     .rename(columns={'step_id':'passed_steps'}) \
#     .passed_steps.hist();

In [1396]:
# Pivot our data! 
# For every user (as index) and every type of action (as columns) we count number of step_id's
# Also fill NaN's with 0 and get rid from multiindex ('action' becomes simple enumeration index)
events_df.pivot_table(index='user_id', columns='action', values='step_id', \
               aggfunc='count', fill_value=0).reset_index().head()
# Note that 'action' now is just a name of index column, nothing more than ordinal number.

action,user_id,discovered,passed,started_attempt,viewed
0,1,1,0,0,1
1,2,9,9,2,10
2,3,91,87,30,192
3,5,11,11,4,12
4,7,1,1,0,1


In [1397]:
# GOOD. Now we count every user, include that ones who didn't pass any steps
#events_df.pivot_table(index='user_id', columns='action', values='step_id', \
#               aggfunc='count', fill_value=0).reset_index().passed.hist();

In [1398]:
# Now we pivot task_submissions df, make user scores df
# For every user as index and subm. status as columns we count number of correct and wrong submissions.
users_scores = subm_df.pivot_table(index='user_id', columns='submission_status', values='step_id', \
               aggfunc='count', fill_value=0).reset_index()
# Note that 'submission_status' now is just a name of index column, nothing more than ordinal number.
users_scores.head()

submission_status,user_id,correct,wrong
0,2,2,0
1,3,29,23
2,5,2,2
3,8,9,21
4,14,0,1


### Let's find average break in the course

In [1399]:
# Get rid from user_id and day duplicates:
# Show 3 columns, drop duplicates in 'user_id' and 'day'
events_df[['user_id','day','timestamp']].drop_duplicates(subset=['user_id','day']).head()

Unnamed: 0,user_id,day,timestamp
0,17632,2015-06-15,1434340848
36,12494,2015-06-15,1434341931
59,442,2015-06-15,1434342029
62,22254,2015-06-15,1434342042
94,6646,2015-06-15,1434342230


In [1400]:
# Make lists of timestamp of every event for each unique user
events_df[['user_id','day','timestamp']] \
    .drop_duplicates(subset=['user_id','day']) \
    .groupby('user_id')['timestamp'].apply(list).head()

user_id
1                                         [1472827464]
2                             [1514383364, 1519226966]
3    [1434358476, 1441257725, 1441440209, 144153391...
5                             [1466156809, 1499859621]
7                                         [1521634660]
Name: timestamp, dtype: object

In [1401]:
# np.diff returns a list of differences between _each_ neighbor element in the original list
# e.g. from list with 4 timestamps we get 3 differences (between 1st and 2nd, 2nd and 3rd, 3rd and 4th)
# Thus we get amount of time between actions of the course
events_df[['user_id','day','timestamp']] \
    .drop_duplicates(subset=['user_id','day']) \
    .groupby('user_id')['timestamp'].apply(list) \
    .apply(np.diff).head()
# Note that we get zero diff (empty list) from 1-timestamp sequence

user_id
1                                                  []
2                                           [4843602]
3    [6899249, 182484, 93710, 2768870, 171400, 78712]
5                                          [33702812]
7                                                  []
Name: timestamp, dtype: object

In [1402]:
# Put these amounts of time between actions (gaps) in new numpy array
gap_data = events_df[['user_id','day','timestamp']] \
    .drop_duplicates(subset=['user_id','day']) \
    .groupby('user_id')['timestamp'].apply(list) \
    .apply(np.diff).to_numpy()  # 'to_numpy()' equal to '.values'
gap_data

array([array([], dtype=int32), array([4843602]),
       array([6899249,  182484,   93710, 2768870,  171400,   78712]), ...,
       array([   86128,   136759,    94899,  1422583, 14347289,   505061,
          98252,    57019]),
       array([1567990]), array([], dtype=int32)], dtype=object)

In [1403]:
# Turn numpy array(list of lists) into one big list and transform it into pandas series
gap_data = pd.Series(np.concatenate(gap_data, axis=0))
gap_data.head()

0    4843602
1    6899249
2     182484
3      93710
4    2768870
dtype: int32

In [1404]:
# Difference in days
gap_data = gap_data / (60 * 60 * 24)
gap_data.tail()

97867    166.056586
97868      5.845613
97869      1.137176
97870      0.659942
97871     18.148032
dtype: float64

In [1405]:
#gap_data[gap_data < 200].hist();
# The biggest part of users have gap in 0.001 day between actions

In [1406]:
gap_data.quantile(.95)
# It means that 95% of users have a gap under ~60 days
# Thus only 5% of users comes back to course after ~60 day gap

59.760440972222156

In [1407]:
gap_data.quantile(.90)
# It means that 90% of users have a gap under ~18 days

18.325995370370403

#### Lesson 1.11, find Anatoly Karpov's id

In [1408]:
# my solution... totally nightmare :(
step_1_11_df = events_df[events_df['date'].dt.year == 2016]
step_1_11_df.insert(6,'month',step_1_11_df.date.dt.month)
moths_id_s = step_1_11_df.groupby('month')['user_id'].unique()
result = {}
for month, m_value in enumerate(moths_id_s):
    for id_ in m_value:
        result[id_] = 0
        for el in moths_id_s:
            if id_ in el:
                result[id_]+=1
for el in result:
    if result[el]>11 :
        print(el)  # Lesson 1.11 solution

1046


In [1409]:
# More simple solution
# max of started_attempt value for user
events_df.pivot_table(index='user_id',columns='action', values='step_id', \
                       aggfunc='count', fill_value=0).reset_index(). \
                        sort_values(['started_attempt'],ascending=False).iloc[0,0]

1046

In [1410]:
events_df.tail(1) #see last day's timestamp in this dataset

Unnamed: 0,step_id,timestamp,action,user_id,date,day
3480702,33892,1526772811,viewed,18526,2018-05-19 23:33:31,2018-05-19


In [1411]:
# timestamp of the latest action for every user_id
users_data = events_df.groupby('user_id', as_index=False) \
                .agg({'timestamp':'max'}).rename(columns={'timestamp':'last_timestamp'})
users_data.head()

Unnamed: 0,user_id,last_timestamp
0,1,1472827464
1,2,1519226966
2,3,1444581588
3,5,1499859939
4,7,1521634660


In [1412]:
# Timestamp constants
NOW = 1526772811  # last day's timestamp
DROP_OUT_THRESHOLD = 30 *24 * 60 * 60  # Amount of seconds in month

In [1413]:
users_data['user_is_gone'] = (NOW - users_data.last_timestamp) > DROP_OUT_THRESHOLD
users_data.head()

Unnamed: 0,user_id,last_timestamp,user_is_gone
0,1,1472827464,True
1,2,1519226966,True
2,3,1444581588,True
3,5,1499859939,True
4,7,1521634660,True


In [1414]:
users_scores.head()

submission_status,user_id,correct,wrong
0,2,2,0
1,3,29,23
2,5,2,2
3,8,9,21
4,14,0,1


In [1415]:
# Merge last timestamp with scrores df
users_data = users_data.merge(users_scores, on='user_id', how='outer').fillna(0)
# arg 'outer' - union; default arg 'inner' - intersection
users_data.head()

Unnamed: 0,user_id,last_timestamp,user_is_gone,correct,wrong
0,1,1472827464,True,0.0,0.0
1,2,1519226966,True,2.0,0.0
2,3,1444581588,True,29.0,23.0
3,5,1499859939,True,2.0,2.0
4,7,1521634660,True,0.0,0.0


In [1416]:
# create df with data about actions for every user id
users_events_data = events_df.pivot_table(index='user_id', columns='action', values='step_id', \
               aggfunc='count', fill_value=0).reset_index()
# yes, we already did this pivot table before, but didn't assign this to variable
users_events_data.head()  

action,user_id,discovered,passed,started_attempt,viewed
0,1,1,0,0,1
1,2,9,9,2,10
2,3,91,87,30,192
3,5,11,11,4,12
4,7,1,1,0,1


In [1417]:
# Merge again, now with data about activities
users_data = users_data.merge(users_events_data, on='user_id', how='outer').fillna(0)
users_data.head()

Unnamed: 0,user_id,last_timestamp,user_is_gone,correct,wrong,discovered,passed,started_attempt,viewed
0,1,1472827464,True,0.0,0.0,1,0,0,1
1,2,1519226966,True,2.0,0.0,9,9,2,10
2,3,1444581588,True,29.0,23.0,91,87,30,192
3,5,1499859939,True,2.0,2.0,11,11,4,12
4,7,1521634660,True,0.0,0.0,1,1,0,1


In [1418]:
# number of unique learning days
users_days = events_df.groupby('user_id').day.nunique().to_frame().reset_index()
users_days.head()

Unnamed: 0,user_id,day
0,1,1
1,2,2
2,3,7
3,5,2
4,7,1


In [1419]:
# Add this feature about number of learning days to 'users_data' df 
users_data = users_data.merge(users_days, on='user_id', how='outer').rename(columns={"day": "learning_days"})

In [1420]:
users_data.head()

Unnamed: 0,user_id,last_timestamp,user_is_gone,correct,wrong,discovered,passed,started_attempt,viewed,learning_days
0,1,1472827464,True,0.0,0.0,1,0,0,1,1
1,2,1519226966,True,2.0,0.0,9,9,2,10,2
2,3,1444581588,True,29.0,23.0,91,87,30,192,7
3,5,1499859939,True,2.0,2.0,11,11,4,12,2
4,7,1521634660,True,0.0,0.0,1,1,0,1,1


In [1421]:
# check that no one is lost
# number of unique users in raw df must be equal to the same in 'feature engineered' df
print(f"{events_df.user_id.nunique()}  {users_data.user_id.nunique()}")

19234  19234


In [1422]:
# Add new feature - fact of passing course based on passed steps
users_data['course_passed'] = users_data.passed > 170

In [1423]:
users_data.head()

Unnamed: 0,user_id,last_timestamp,user_is_gone,correct,wrong,discovered,passed,started_attempt,viewed,learning_days,course_passed
0,1,1472827464,True,0.0,0.0,1,0,0,1,1,False
1,2,1519226966,True,2.0,0.0,9,9,2,10,2,False
2,3,1444581588,True,29.0,23.0,91,87,30,192,7,False
3,5,1499859939,True,2.0,2.0,11,11,4,12,2,False
4,7,1521634660,True,0.0,0.0,1,1,0,1,1,False


In [1424]:
users_data.groupby('course_passed').count()

Unnamed: 0_level_0,user_id,last_timestamp,user_is_gone,correct,wrong,discovered,passed,started_attempt,viewed,learning_days
course_passed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,17809,17809,17809,17809,17809,17809,17809,17809,17809,17809
True,1425,1425,1425,1425,1425,1425,1425,1425,1425,1425


In [1425]:
print(f"passed ratio = {round(1425/17809 *100)}%")

passed ratio = 8%


## Will user drop the course after n days?

In [1426]:
# What is median for days in which users complete the course?
# i.e. what is number of days in which biggest part of users completes the course?
users_data.query('course_passed == True').learning_days.median()
# also works users_data[users_data.course_passed].learning_days.median()

# Hmm, 20? ==> lets consider n = 3

20.0

In [1427]:
#users_data.query('course_passed == True').learning_days.hist();

In [1428]:
# Create new feature - datetime of starting the course:
user_min_time = events_df.groupby('user_id', as_index=False)\
                .timestamp.min() \
                .rename(columns={'timestamp':'min_timestamp'})
user_min_time.head()
# user_min_time = events_df.groupby('user_id', as_index=False) \
#                 .agg({'timestamp':'min'}).rename(columns={'timestamp':'min_timestamp'})  # also works

Unnamed: 0,user_id,min_timestamp
0,1,1472827464
1,2,1514383364
2,3,1434358476
3,5,1466156809
4,7,1521634660


In [1429]:
# Merge 'min_timestamp' feature with our amazing users_data df
users_data = users_data.merge(user_min_time, how='outer')
users_data.head()

Unnamed: 0,user_id,last_timestamp,user_is_gone,correct,wrong,discovered,passed,started_attempt,viewed,learning_days,course_passed,min_timestamp
0,1,1472827464,True,0.0,0.0,1,0,0,1,1,False,1472827464
1,2,1519226966,True,2.0,0.0,9,9,2,10,2,False,1514383364
2,3,1444581588,True,29.0,23.0,91,87,30,192,7,False,1434358476
3,5,1499859939,True,2.0,2.0,11,11,4,12,2,False,1466156809
4,7,1521634660,True,0.0,0.0,1,1,0,1,1,False,1521634660


#### Select actions in first three days after starting a course

In [1430]:
# !!! magic using of map !!!
# concatenate two columns in new one
events_df['user_time'] = events_df.user_id.map(str) + '_' + events_df.timestamp.map(str)
# user_id-underscore-time(of every action)

In [1431]:
events_df.head()

Unnamed: 0,step_id,timestamp,action,user_id,date,day,user_time
0,32815,1434340848,viewed,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848
1,32815,1434340848,passed,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848
2,32815,1434340848,discovered,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848
3,32811,1434340895,discovered,17632,2015-06-15 04:01:35,2015-06-15,17632_1434340895
4,32811,1434340895,viewed,17632,2015-06-15 04:01:35,2015-06-15,17632_1434340895


In [1432]:
learning_time_threshold = 3 * 24 * 60 * 60  # 3 days in seconds

In [1433]:
# 3rd day for every user (start course time + threshold)
user_learning_time_threshold = user_min_time.user_id.map(str) + '_' + \
                                (user_min_time.min_timestamp + learning_time_threshold).map(str)

In [1434]:
user_learning_time_threshold.head()

0    1_1473086664
1    2_1514642564
2    3_1434617676
3    5_1466416009
4    7_1521893860
dtype: object

In [1435]:
# alternative way from stepic's comments:
# (without creating string columns with concatenation)
# users_data3 = users_data.copy()
# users_data3['min_plus_3days'] = users_data.min_timestamp + learning_time_threshold
# users_data3 = users_data3[ ['user_id', 'min_plus_3days'] ]

# events_df3 = events_df.merge(users_data3, on = 'user_id', how='outer')
# events_df3 = events_df3[events_df3.timestamp < events_df3.min_plus_3days]

In [1436]:
user_min_time['user_learning_time_threshold'] = user_learning_time_threshold

In [1437]:
events_df.shape

(3480703, 7)

In [1438]:
events_df = events_df.merge(user_min_time[['user_id', 'user_learning_time_threshold']],\
                           how='outer')
# it's good to control shape before and right after this merge. Only number of columns must change to +1!

In [1439]:
events_df.shape

(3480703, 8)

In [1440]:
# Compare every action's time with our '+3days' threshold 
# and add to new df only those actions which is less
events_df_train = events_df[events_df.user_time <= events_df.user_learning_time_threshold]
events_df_train.head()

Unnamed: 0,step_id,timestamp,action,user_id,date,day,user_time,user_learning_time_threshold
0,32815,1434340848,viewed,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848,17632_1434600048
1,32815,1434340848,passed,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848,17632_1434600048
2,32815,1434340848,discovered,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848,17632_1434600048
3,32811,1434340895,discovered,17632,2015-06-15 04:01:35,2015-06-15,17632_1434340895,17632_1434600048
4,32811,1434340895,viewed,17632,2015-06-15 04:01:35,2015-06-15,17632_1434340895,17632_1434600048


#### Lesson 2.9 Find the hardest step

In [1441]:
subm_df.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id,date,day
0,31971,1434349275,correct,15853,2015-06-15 06:21:15,2015-06-15
1,31972,1434348300,correct,15853,2015-06-15 06:05:00,2015-06-15
2,31972,1478852149,wrong,15853,2016-11-11 08:15:49,2016-11-11
3,31972,1478852164,correct,15853,2016-11-11 08:16:04,2016-11-11
4,31976,1434348123,wrong,15853,2015-06-15 06:02:03,2015-06-15


In [1442]:
subm_df.query('submission_status == "wrong"') \
        .groupby('step_id', as_index=False) \
        .agg({'submission_status':'count'}) \
        .sort_values(by='submission_status', ascending=False) \
        .head()

Unnamed: 0,step_id,submission_status
4,31978,16084
11,32031,13373
19,32202,13002
44,33481,10300
12,32075,10003


##### lets move on

In [1443]:
# Max number of unique days for user in events_df_train
# Previously we've filtered only events, that happened on interval start day + 3 days
# So, there can't be more than 4 days max (when user started to learn at midday and we plus 3 days to another midday,
# technically there'll be 4 calendar days)
events_df_train.groupby('user_id').day.nunique().max()

4

In [1444]:
# same for submissions df
subm_df['user_time'] = subm_df.user_id.map(str) + '_' + subm_df.timestamp.map(str)

In [1445]:
subm_df.shape

(509104, 7)

In [1446]:
subm_df = subm_df.merge(user_min_time[['user_id', 'user_learning_time_threshold']], how='outer')
subm_df.shape # rows increased because of users with no submits

(518398, 8)

In [1447]:
subm_df_train = subm_df[subm_df.user_time <= subm_df.user_learning_time_threshold]
# Max number of unique days for user in events_df_train
# Previously we've filtered only events, that happened on interval start day + 3 days
# So, there can't be more than 4 days max
subm_df_train.groupby('user_id').day.nunique().max()

4

In [1448]:
X = subm_df_train.groupby('user_id') \
    .day.nunique() \
    .to_frame() \
    .reset_index() \
    .rename(columns={'day': 'days'})

In [1449]:
steps_tried = subm_df_train.groupby('user_id') \
    .step_id.nunique() \
    .to_frame() \
    .reset_index() \
    .rename(columns={'step_id': 'steps_tried'})
steps_tried.head() # number of trials for every user

Unnamed: 0,user_id,steps_tried
0,2,2
1,3,4
2,8,11
3,14,1
4,16,23


In [1450]:
X = X.merge(steps_tried, on='user_id', how='outer')
X.shape

(8913, 3)

In [1451]:
X.head()

Unnamed: 0,user_id,days,steps_tried
0,2,1,2
1,3,1,4
2,8,1,11
3,14,1,1
4,16,3,23


In [1452]:
# merge with correct/wrong
X = X.merge(subm_df_train.pivot_table(index='user_id', \
                          columns='submission_status', \
                          values='step_id', \
                          aggfunc='count', fill_value=0) \
                          .reset_index())

In [1453]:
X.head()

Unnamed: 0,user_id,days,steps_tried,correct,wrong
0,2,1,2,2,0
1,3,1,4,4,4
2,8,1,11,9,21
3,14,1,1,0,1
4,16,3,23,23,27


In [1454]:
# To help the models, we make new feature with ratio
X['corr_ratio'] = X.correct / (X.correct + X.wrong)
X.head()

Unnamed: 0,user_id,days,steps_tried,correct,wrong,corr_ratio
0,2,1,2,2,0,1.0
1,3,1,4,4,4,0.5
2,8,1,11,9,21,0.3
3,14,1,1,0,1,0.0
4,16,3,23,23,27,0.46


In [1455]:
X = X.merge(events_df_train.pivot_table(index='user_id', \
                                       columns='action', \
                                       values='step_id', \
                                       aggfunc='count', \
                                       fill_value=0) \
            .reset_index()[['user_id', 'viewed']], how='outer')

In [1456]:
X.head()

Unnamed: 0,user_id,days,steps_tried,correct,wrong,corr_ratio,viewed
0,2,1.0,2.0,2.0,0.0,1.0,9
1,3,1.0,4.0,4.0,4.0,0.5,20
2,8,1.0,11.0,9.0,21.0,0.3,154
3,14,1.0,1.0,0.0,1.0,0.0,9
4,16,3.0,23.0,23.0,27.0,0.46,132


In [1457]:
X = X.fillna(0)

In [1458]:
# Lets temporary expand df X, add some important features
X = X.merge(users_data[['user_id', 'course_passed', 'user_is_gone']], how='outer')

In [1459]:
# Filter users who start course, didn't finish it yet and didn't gone
X = X[~((X.user_is_gone == False) & (X.course_passed == False))]
# ~ equals NOT
# our condition means that user either didn't pass the course or didn't churn
# so we drop users which didn't pass course and didn't churn

In [1460]:
X.groupby(['course_passed', 'user_is_gone']).user_id.count()
# Note that here we have only churned users frome those who didn't pass the course
# And for those, who passed - we have both churned and those who left on the course (that means
# they've passed not long time ago)

course_passed  user_is_gone
False          True            16560
True           False             141
               True             1284
Name: user_id, dtype: int64

#### Now create final X and y frames

In [1461]:
y = X.course_passed.map(int)
X = X.drop(columns=['user_is_gone', 'course_passed'])
# Also we make user id the index of X df - we hide it from model but save it from us
X = set_index(X.user_id).drop(columns='user_id')

In [None]:
# TODO: save X and Y

In [1462]:
X.head()

Unnamed: 0_level_0,index,days,steps_tried,correct,wrong,corr_ratio,viewed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,0,1.0,2.0,2.0,0.0,1.0,9
3,1,1.0,4.0,4.0,4.0,0.5,20
8,2,1.0,11.0,9.0,21.0,0.3,154
14,3,1.0,1.0,0.0,1.0,0.0,9
16,4,3.0,23.0,23.0,27.0,0.46,132
