## 影片長度與觀看倍數、影片完成度、暫停次數

In [1]:
from matplotlib import pyplot as plt
%matplotlib inline
# 設定圖形大小; DPI越大圖越大
plt.rcParams["figure.dpi"] = 150

import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as smm
import statsmodels.stats.libqsturng as ssl
import statsmodels.stats.outliers_influence as sso
import math

pd.options.display.max_rows = 999
np.set_printoptions(suppress=True)

### 平均暫停次數

In [2]:
def cntPause(result, raw, grades): # arguments: a dataframe with a column of ids, the raw data, grades data
    pause_cnt = np.zeros(result['video_id'].shape[0])
    pause_cnt[:] = np.nan
    k = 0
    for i in result['video_id']:
        video_i = raw[raw['video_id'] == int(i)]
        paused_i = np.zeros(grades['student_id'].shape[0])
        m = 0
        for j in grades['student_id']:
            student_j = video_i[video_i['student_id'] == int(j)]
            paused = student_j[(student_j['end'] > student_j['start']) & (student_j['end'] - student_j['start'] >= 5) & (student_j['playback_rate'] == 0)]
            paused_i[m] = paused.shape[0]
            m = m + 1
        pause_cnt[k] = sum(paused_i) / paused_i.shape[0]
        k = k + 1
    result['avg_pause_cnt'] = pause_cnt

### 觀看倍數與影片完成度

In [3]:
def cntFreq(result, raw, grades): # arguments: a dataframe with a column of ids and a column of lens to store results, the raw data, grades data
    freq = np.ones(result['video_id'].shape[0])
    playback_rate = np.zeros(result['video_id'].shape[0])
    m = 0
    seq = [4581]
    for i in result['video_id']:
        video_i = raw[raw['video_id'] == int(i)]
        vid_len_i = result[result['video_id'] == int(i)]['len']
        vid_len = np.ones(grades['student_id'].shape[0])
        total_spent_time = np.ones(grades['student_id'].shape[0]) # 真正所花的時間
        total_video_time = np.ones(grades['student_id'].shape[0]) # 影片時間
        k = 0
        seq1 = [64]
        for j in grades['student_id']:
            student_j = video_i[video_i['student_id'] == int(j)]
            real_watch_j = student_j[(student_j['end'] > student_j['start']) & (student_j['playback_rate'] != 0)]
            
            video_time_j = real_watch_j['end'] - real_watch_j['start']
            total_video_time[k] = sum(video_time_j)
            total_spent_time[k] = sum(video_time_j/real_watch_j['playback_rate'])
            vid_len[k] = vid_len_i
            k = k + 1
        total_spent_time = [x if x != 0 else 1 for x in total_spent_time]
        avg_playback_rate = [i/j for (i, j) in zip(total_video_time, total_spent_time)]
        student_freq = np.array(total_video_time) / np.array(vid_len )
        playback_rate[m] = sum(total_video_time) / sum(total_spent_time)
        freq[m] = sum(student_freq) / student_freq.shape[0]
        m = m + 1
    result['avg_freq'] = freq
    result['avg_playback_rate'] = playback_rate

### 資料處理

#### DSAP107

In [14]:
dsap107_org = pd.read_excel('dsap107_video_raw.xlsx')
dsap107_grades = pd.read_csv('/Users/chihsin1/Documents/stats2_finalProject_data/DSAP107 grades.csv')
dsap107_video_info = pd.read_excel('/Users/chihsin1/Documents/stats2_finalProject_data/DSAP107 videos_sep.xlsx')
dsap107_org['created_at'] = pd.to_datetime(dsap107_org['created_at'])
dsap107_org = dsap107_org[~(dsap107_org['created_at'] > '2019-06-30 00:00:00 UTC')]
display(dsap107_org.tail())

Unnamed: 0,id,start,end,student_id,course_id,video_id,created_at,updated_at,playback_rate
98112,3416775,40,45,89,224,4605,2019-06-17 13:33:39,2019-06-17 13:33:39 UTC,0.0
98113,3416779,46,51,89,224,4605,2019-06-17 13:33:40,2019-06-17 13:33:40 UTC,0.0
98114,3416758,0,2,89,224,4605,2019-06-17 13:33:34,2019-06-17 13:33:34 UTC,1.0
98115,3416762,13,18,89,224,4605,2019-06-17 13:33:36,2019-06-17 13:33:36 UTC,0.0
98116,3416765,24,29,89,224,4605,2019-06-17 13:33:37,2019-06-17 13:33:37 UTC,0.0


In [15]:
dsap107_video = pd.DataFrame(data = dsap107_video_info['video_id'])
len = np.zeros(dsap107_video['video_id'].shape[0])
for i in range(dsap107_video['video_id'].shape[0]):
    len[i] = ''.join(c for c in dsap107_video_info['meta'][i] if c.isdigit())
dsap107_video['len'] = len
display(dsap107_video.head())

Unnamed: 0,video_id,len
0,4607,945.0
1,4581,743.0
2,4582,728.0
3,4585,1061.0
4,4586,920.0


In [17]:
dsap107_grades = dsap107_grades.iloc[1:]

In [18]:
cntPause(dsap107_video, dsap107_org, dsap107_grades)
cntFreq(dsap107_video, dsap107_org, dsap107_grades)
display(dsap107_video.head())

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
0,4607,945.0,5.32381,0.96641,1.494205
1,4581,743.0,3.019048,0.906159,1.536652
2,4582,728.0,3.647619,0.935309,1.546589
3,4585,1061.0,18.72381,1.097114,1.465569
4,4586,920.0,11.371429,1.159451,1.446708


In [19]:
dsap107_video.describe()

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
count,56.0,56.0,56.0,56.0,56.0
mean,5001.910714,630.089286,4.070748,0.791577,1.466973
std,471.811992,200.909503,3.797857,0.31969,0.287323
min,4581.0,114.0,0.0,0.0,0.0
25%,4599.75,485.5,1.395238,0.516878,1.497342
50%,4618.5,602.0,2.709524,0.902423,1.523935
75%,5450.25,787.5,6.019048,1.079594,1.544682
max,5964.0,1061.0,18.72381,1.252233,1.599311


In [20]:
#dsap107_video.to_excel('dsap107_videoLenVs.xlsx', index = False)
dsap107_video.to_excel('dsap107_videoLenVs_0614.xlsx', index = False)

#### DSAP108

In [21]:
dsap108_org = pd.read_excel('dsap108_video_raw.xlsx')
dsap108_grades = pd.read_csv('/Users/chihsin1/Documents/stats2_finalProject_data/DSAP108 grades.csv')
dsap108_video_info = pd.read_excel('/Users/chihsin1/Documents/stats2_finalProject_data/DSAP108 videos_sep.xlsx')
dsap108_org['created_at'] = pd.to_datetime(dsap108_org['created_at'])
dsap108_org = dsap108_org[~(dsap108_org['created_at'] > '2020-06-30 00:00:00 UTC')]
display(dsap108_org.tail())

Unnamed: 0,id,start,end,student_id,course_id,video_id,created_at,updated_at,playback_rate
113619,8114687,0,511,65,765,4598,2020-03-19 17:16:08,2020-03-19 17:16:08 UTC,1.5
113620,8114713,0,5,65,765,4599,2020-03-19 17:16:25,2020-03-19 17:16:25 UTC,1.0
113621,8115322,6,803,65,765,4599,2020-03-19 17:25:18,2020-03-19 17:25:18 UTC,1.5
113622,8115681,0,481,65,765,4602,2020-03-19 17:31:12,2020-03-19 17:31:12 UTC,1.5
113623,8115702,0,4,65,765,4603,2020-03-19 17:31:26,2020-03-19 17:31:26 UTC,1.0


In [22]:
dsap108_video = pd.DataFrame(data = dsap108_video_info['video_id'])
len = dsap108_video_info['meta']
dsap108_video['len'] = len
display(dsap108_video.head())

Unnamed: 0,video_id,len
0,29509,2431
1,25668,4010
2,25657,1081
3,29511,4302
4,26022,2508


In [32]:
dsap108_video2 = pd.DataFrame(data = dsap107_video_info['video_id'])
len = np.zeros(dsap108_video2['video_id'].shape[0])
for i in range(dsap108_video2['video_id'].shape[0]):
    len[i] = ''.join(c for c in dsap107_video_info['meta'][i] if c.isdigit())
dsap108_video2['len'] = len
display(dsap108_video2.head())

Unnamed: 0,video_id,len
0,4607,945.0
1,4581,743.0
2,4582,728.0
3,4585,1061.0
4,4586,920.0


In [23]:
dsap108_grades = dsap108_grades.iloc[1:]

In [33]:
cntPause(dsap108_video2, dsap108_org, dsap108_grades)
cntFreq(dsap108_video2, dsap108_org, dsap108_grades)
display(dsap108_video2.head())

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
0,4607,945.0,3.880734,0.814242,1.53363
1,4581,743.0,4.513761,0.829676,1.563145
2,4582,728.0,4.678899,0.775796,1.547804
3,4585,1061.0,8.981651,0.90523,1.455019
4,4586,920.0,4.678899,0.949103,1.447785


In [24]:
cntPause(dsap108_video, dsap108_org, dsap108_grades)
cntFreq(dsap108_video, dsap108_org, dsap108_grades)
display(dsap108_video.head())

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
0,29509,2431,0.0,0.0,0.0
1,25668,4010,26.12844,0.776302,1.549803
2,25657,1081,5.311927,0.811804,1.451234
3,29511,4302,0.0,0.0,0.0
4,26022,2508,7.504587,0.583465,1.56483


In [39]:
dsap108_video = pd.concat([dsap108_video, dsap108_video2], axis=0)
display(dsap108_video.head())
display(dsap108_video.tail())

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
0,29509,2431.0,0.0,0.0,0.0
1,25668,4010.0,26.12844,0.776302,1.549803
2,25657,1081.0,5.311927,0.811804,1.451234
3,29511,4302.0,0.0,0.0,0.0
4,26022,2508.0,7.504587,0.583465,1.56483


Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
51,5594,800.0,6.192661,0.993085,1.431365
52,5595,374.0,1.633028,0.770888,1.414099
53,5596,526.0,2.155963,0.823839,1.451685
54,5597,343.0,0.798165,0.583839,1.404398
55,5964,1012.0,0.0,0.0,0.0


In [42]:
dsap108_video.describe()

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
count,110.0,110.0,110.0,110.0,110.0
mean,14817.118182,1000.845455,1.902919,0.487996,1.13912
std,10177.179481,802.7724,3.084311,0.397627,0.657026
min,4581.0,114.0,0.0,0.0,0.0
25%,4618.25,510.25,0.041284,0.003133,0.776089
50%,5780.5,734.0,0.949541,0.583652,1.468279
75%,23559.25,1108.0,2.444954,0.851427,1.535852
max,30967.0,4302.0,26.12844,1.159671,1.880576


In [43]:
#dsap108_video.to_excel('dsap108_videoLenVs.xlsx', index = False)
dsap108_video.to_excel('dsap108_videoLenVs_0614.xlsx', index = False)

#### OR107

In [44]:
or107_org = pd.read_excel('or107_video_raw.xlsx')
or107_grades = pd.read_csv('/Users/chihsin1/Documents/stats2_finalProject_data/OR107 grades.csv')
or107_video_info = pd.read_excel('/Users/chihsin1/Documents/stats2_finalProject_data/OR107 videos_sep.xlsx')
or107_org['created_at'] = pd.to_datetime(or107_org['created_at'])
or107_org = or107_org[~(or107_org['created_at'] > '2019-06-30 00:00:00 UTC')]
display(or107_org.tail())

Unnamed: 0,id,start,end,student_id,course_id,video_id,created_at,updated_at,playback_rate
81641,2805996,172,173,147,223,5956,2019-04-28 08:56:54,2019-04-28 08:56:54 UTC,1.5
81642,2814417,0,2,158,223,5956,2019-04-28 17:23:06,2019-04-28 17:23:06 UTC,1.0
81643,2957491,0,4,139,223,5956,2019-05-10 03:56:50,2019-05-10 03:56:50 UTC,1.0
81644,3106907,8,10,150,223,5956,2019-05-21 15:46:01,2019-05-21 15:46:01 UTC,1.25
81645,3106904,0,7,150,223,5956,2019-05-21 15:45:59,2019-05-21 15:45:59 UTC,1.0


In [45]:
or107_grades = or107_grades.iloc[1:]

In [46]:
or107_video = pd.DataFrame(data = or107_video_info['video_id'])
len = np.zeros(or107_video['video_id'].shape[0])
for i in range(or107_video['video_id'].shape[0]):
    len[i] = ''.join(c for c in or107_video_info['meta'][i] if c.isdigit())
or107_video['len'] = len
display(or107_video.head())

Unnamed: 0,video_id,len
0,4526,914.0
1,4530,712.0
2,4503,784.0
3,4511,710.0
4,4501,651.0


In [47]:
cntPause(or107_video, or107_org, or107_grades)
cntFreq(or107_video, or107_org, or107_grades)
display(or107_video.head())

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
0,4526,914.0,5.915663,0.780378,1.420191
1,4530,712.0,2.771084,0.249425,1.307856
2,4503,784.0,0.228916,0.023574,1.610781
3,4511,710.0,1.662651,0.477295,1.523039
4,4501,651.0,5.0,0.584421,1.465204


In [48]:
or107_video.describe()

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
count,80.0,80.0,80.0,80.0,80.0
mean,4609.675,864.3625,2.561446,0.408526,1.329527
std,311.253561,284.355111,2.967047,0.289946,0.441938
min,4501.0,187.0,0.0,0.0,0.0
25%,4520.75,691.75,0.0,0.027939,1.370395
50%,4540.5,890.5,1.951807,0.517928,1.469354
75%,4560.25,1063.5,3.954819,0.618396,1.500878
max,5956.0,1418.0,14.638554,0.919319,1.81778


In [49]:
or107_video.to_excel('or107_videoLenVs_0614.xlsx', index = False)

#### OR108

In [50]:
or108_org = pd.read_excel('or108_video_raw.xlsx')
or108_grades = pd.read_csv('/Users/chihsin1/Documents/stats2_finalProject_data/OR108 grades.csv')
or108_video_info = pd.read_excel('/Users/chihsin1/Documents/stats2_finalProject_data/OR107 videos_sep.xlsx')
or108_org['created_at'] = pd.to_datetime(or107_org['created_at'])
or108_org = or108_org[~(or108_org['created_at'] > '2020-06-30 00:00:00 UTC')]
display(or108_org.tail())

Unnamed: 0,id,start,end,student_id,course_id,video_id,created_at,updated_at,playback_rate
92674,8093962,17,501,66,772,4549,NaT,2020-03-19 14:47:00 UTC,1.5
92675,8093993,475,481,66,772,4549,NaT,2020-03-19 14:47:09 UTC,1.5
92676,8093967,501,502,66,772,4549,NaT,2020-03-19 14:47:02 UTC,1.5
92677,8094286,447,578,66,772,4549,NaT,2020-03-19 14:48:38 UTC,1.5
92678,8094978,588,853,66,772,4549,NaT,2020-03-19 14:51:35 UTC,1.5


In [51]:
or108_grades = or108_grades.iloc[1:]

In [52]:
or108_video = pd.DataFrame(data = or108_video_info['video_id'])
len = np.zeros(or108_video['video_id'].shape[0])
for i in range(or108_video['video_id'].shape[0]):
    len[i] = ''.join(c for c in or108_video_info['meta'][i] if c.isdigit())
or108_video['len'] = len
display(or108_video.head())

Unnamed: 0,video_id,len
0,4526,914.0
1,4530,712.0
2,4503,784.0
3,4511,710.0
4,4501,651.0


In [53]:
cntPause(or108_video, or108_org, or108_grades)
cntFreq(or108_video, or108_org, or108_grades)
display(or108_video.head())

Unnamed: 0,video_id,len,avg_pause_cnt,avg_freq,avg_playback_rate
0,4526,914.0,8.915094,0.888444,1.395388
1,4530,712.0,0.783019,0.329619,1.24498
2,4503,784.0,0.160377,0.008183,1.52809
3,4511,710.0,1.09434,0.597515,1.446069
4,4501,651.0,10.386792,0.701142,1.443833


In [54]:
or108_video.to_excel('or108_videoLenVs_0614.xlsx', index = False)