In [83]:
import numpy as np
import scipy.stats as stats
import pandas as pd

import matplotlib .pyplot as plt


### Data prep...

In [84]:
data = "./11-3_AllData.csv"
df = pd.read_csv(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 56 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   StartDate              145 non-null    object
 1   EndDate                145 non-null    object
 2   Status                 145 non-null    object
 3   IPAddress              144 non-null    object
 4   Progress               145 non-null    object
 5   Duration (in seconds)  145 non-null    object
 6   Finished               145 non-null    object
 7   RecordedDate           145 non-null    object
 8   ResponseId             145 non-null    object
 9   RecipientLastName      2 non-null      object
 10  RecipientFirstName     2 non-null      object
 11  RecipientEmail         2 non-null      object
 12  ExternalReference      2 non-null      object
 13  LocationLatitude       105 non-null    object
 14  LocationLongitude      105 non-null    object
 15  DistributionChannel    

In [85]:
# Get rid of useless columns/rows
df = df.drop(
        columns=['StartDate', 'EndDate', 'IPAddress', 'RecordedDate', 
                'RecipientLastName', 'RecipientFirstName', 'RecipientLastName', 
                'RecipientEmail', 'ExternalReference','DistributionChannel'],
        index=[0, 1]
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 2 to 144
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Status                 143 non-null    object
 1   Progress               143 non-null    object
 2   Duration (in seconds)  143 non-null    object
 3   Finished               143 non-null    object
 4   ResponseId             143 non-null    object
 5   LocationLatitude       103 non-null    object
 6   LocationLongitude      103 non-null    object
 7   UserLanguage           143 non-null    object
 8   Q_RecaptchaScore       140 non-null    object
 9   Q1                     129 non-null    object
 10  Q1_10_TEXT             1 non-null      object
 11  Q2                     120 non-null    object
 12  Q2_20_TEXT             19 non-null     object
 13  Q2_28_TEXT             1 non-null      object
 14  Q2_36_TEXT             7 non-null      object
 15  Q2_45_TEXT             

In [86]:
# Set the ResponseIds as the index value for each response
df.set_index("ResponseId", inplace=True)
df.head()

Unnamed: 0_level_0,Status,Progress,Duration (in seconds),Finished,LocationLatitude,LocationLongitude,UserLanguage,Q_RecaptchaScore,Q1,Q1_10_TEXT,...,Q10_6_TEXT,Q11,Q11_9_TEXT,Q12,Q12_5_TEXT,Q13,Q13_9_TEXT,Q14,Q15,Q15_1_TEXT
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_239ymaOz9IwzVY6,Survey Preview,100,12,True,37.7794,-122.2178,EN,,,,...,,,,,,,,,,
R_1i8WxOAA269cEk1,IP Address,100,55,True,37.8657,-122.2571,EN,,,,...,,Homework answers,,Small student-made class group chats,,Quora,,,,
R_1H60HxqCquIDS0n,IP Address,100,18592,True,37.8922,-122.2729,EN,0.8999999761581421,,,...,,,,,,"Twitter,Other (please specify)",,Neither agree nor disagree,,
R_1ghf8xJrVXbbkuJ,IP Address,100,458,True,22.2908,114.1501,EN,1.0,"Master's student,PhD student",,...,,Notes you created,,Small student-made class group chats,,Figma Community,,Strongly disagree,"Yes, if so please enter your email:",mubarak@goodnotesapp.com
R_1mJIzOkTM39eIoR,IP Address,100,1166,True,37.6609,-122.0491,EN,1.0,Undergraduate student,,...,,,,Student clubs and organizations,,"Instagram,Slack",,,"Yes, if so please enter your email:",juliacunningham@berkeley.edu


In [87]:
# Filter spam and incomplete responses
drop_indexes = []
for rID in df.index:
    response = df.loc[rID]
    if response['Finished'] == 'False' or response['Status'] == 'Spam':
        drop_indexes.append(rID)    
df = df.drop(index=drop_indexes)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103 entries, R_239ymaOz9IwzVY6 to R_3HwDJeWJA2Nuujs
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Status                 103 non-null    object
 1   Progress               103 non-null    object
 2   Duration (in seconds)  103 non-null    object
 3   Finished               103 non-null    object
 4   LocationLatitude       103 non-null    object
 5   LocationLongitude      103 non-null    object
 6   UserLanguage           103 non-null    object
 7   Q_RecaptchaScore       100 non-null    object
 8   Q1                     100 non-null    object
 9   Q1_10_TEXT             0 non-null      object
 10  Q2                     101 non-null    object
 11  Q2_20_TEXT             16 non-null     object
 12  Q2_28_TEXT             0 non-null      object
 13  Q2_36_TEXT             5 non-null      object
 14  Q2_45_TEXT             5 non-null      object
 15

In [88]:
# Define major groupings:
major_groups = {
    'stem':  ['Biology', 'Chemistry', 'Computer Science', 'Earth Science', 'Electrical Engineering', 'Engineering & Technology', 'Geography & Enviornment', 'Mathematics', 'Mechanical Engineering', 'Physics'],
    'h_m': ['Biology', 'Biomedicine $ Pharmacy', 'Dentistry', 'Medicine & Health', 'Nursing', 'Sports & Rehabilitation', 'Veterinary'],
    'ss': ['Anthropology', 'Economics', 'Law', 'Political science', 'Psychology', 'Social Work', 'Sociology'],
    'a_h':  ['Architecture', 'Arts, Media & Design', 'History', 'Journalism & Communication', 'Language & Literature', 'Music & Performing arts', 'Philosophy', 'Religion & Theology'],
    'b_f': ['Accounting & Finance', 'Business & Management', 'Economics', 'Hospitality & Tourism', 'Marketing & Public Relations']
}

# Store response IDs here (majors):
stem_r = []
h_m_r = []
ss_r = []
a_h_r = []
b_f_r = []
other_r = []

# Sore response IDs here (study groups): 
stem_sr = []
h_m_sr = []
ss_sr = []
a_h_sr = []
b_f_sr = []
other_sr = []

In [89]:
# Sort responses by pesonal major and study group major

for rID in df.index:
    response = df.loc[rID]
    major = response['Q2']
    
    if any(mg == major for mg in major_groups['stem']):
        stem_r.append(rID)
    if any(mg == major for mg in major_groups['h_m']):
        h_m_r.append(rID)
    if any(mg == major for mg in major_groups['ss']):
        ss_r.append(rID)
    if any(mg == major for mg in major_groups['a_h']):
        a_h_r.append(rID)
    if any(mg == major for mg in major_groups['b_f']):
        b_f_r.append(rID)
    else:
        other_r.append(rID)

    if any(mg == major for mg in major_groups['stem']):
        stem_sr.append(rID)
    if any(mg == major for mg in major_groups['h_m']):
        h_m_sr.append(rID)
    if any(mg == major for mg in major_groups['ss']):
        ss_sr.append(rID)
    if any(mg == major for mg in major_groups['a_h']):
        a_h_sr.append(rID)
    if any(mg == major for mg in major_groups['b_f']):
        b_f_sr.append(rID)
    else:
        other_sr.append(rID)


### Question: Major vs topic of study in study groups?

In [90]:
# number of same major vs topic of study (broad group)
same = 0
diff = 0
for rID in df.index:
    if rID in stem_r and rID in stem_sr:
        same += 1
    elif rID in h_m_r and rID in h_m_sr:
        same += 1
    elif rID in ss_r and rID in ss_sr:
        same += 1
    elif rID in a_h_r and rID in a_h_sr:
        same += 1
    elif rID in b_f_r and rID in b_f_sr:
        same += 1
    elif rID in other_r and rID in other_sr:
        same += 1
    else:
        diff += 1
print(same)
print(diff)

103
0


In [None]:
same = 0
diff = 0
for rID in df.index:
    response = df.loc[rID]
    if response['Q6'] == "Yes":
        major = response['Q2']

### Question: Is there a correlation between how notes are taken and how they are shared?