## Import modules and data

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import modelBuilder as mb
import main

# Load the raw submission log, parse its timestamps, and derive the working frame.
df = pd.read_csv('data.csv')
df['Time'] = pd.to_datetime(df['Time'])  # timestamps as datetime64 instead of strings
data = df.drop(columns=['Sourcehash'])  # the source hash is not needed for the analysis

## Cleaning Up Data

In [2]:
# Same two steps as the load cell above, repeated here; re-running them is harmless.
df['Time'] = pd.to_datetime(df['Time'])  # timestamps as datetime64
data = df.drop(columns=['Sourcehash'])  # working copy without the source hash

In [3]:
# Change incorrect, hint, and correct to binary features
data.loc[:,"incorrect"] = np.where(data['Score'] == 0, 1, 0)
data.loc[:,"hint"] = np.where((data['Score'] > 0) & (data['Score'] < 100), 1, 0)
data.loc[:,"correct"] = np.where((data['Score'] == 100), 1, 0)

In [4]:
# Find indices where score is less than 100
index = data.index
condition = data['Score'] < 100   # Boolean for when students score less than a 100
not100Indices = index[condition]   # Index of all submissions where students score less than a 100

In [5]:
# Find indices where score is 100, but drop the duplicates
data100s = data.loc[data['Score'] == 100]
sortedData = data100s.sort_values('Time', axis=0, ascending=True)
dataNoDups = pd.DataFrame(data100s,columns=['Stud_ID','Topic','Score']).drop_duplicates()
indices = dataNoDups.index
i = indices.append(not100Indices)
cleanData = data.iloc[i]
correctnessData = cleanData[["Stud_ID", "Topic", "incorrect","hint"]].groupby(["Stud_ID","Topic"]).sum().reset_index()

## Find Duration for Each Student on Each Problem

In [6]:
def timefunction(series, max_minutes=60):
    ''' Calculates the total time a student spent on a problem

        Parameters:
        series (pandas series): Series of datetime objects, in any order
        max_minutes (float): Upper bound applied to the result (default 60)

        Returns:
        dur (float): Minutes between the earliest and the latest timestamp,
                     capped at max_minutes; 0.0 when the series holds fewer
                     than two timestamps (including an empty series)
    '''
    # With zero or one submission there is no span to measure.
    if len(series) < 2:
        return 0.0
    ordered = series.sort_values()
    elapsed = ordered.iloc[-1] - ordered.iloc[0]
    # Cap long gaps (e.g. a student returning the next day) at max_minutes.
    return min(elapsed.total_seconds() / 60, max_minutes)

In [7]:
# Use the time function to find the time per problem for each student
subsetData = cleanData[['Stud_ID', 'Topic', 'Time']]
timeData = subsetData.groupby(['Stud_ID', 'Topic']).agg(func=timefunction).reset_index()

In [8]:
# Merge the correctness data with the time data to get the finished dataset
final_df = pd.merge(correctnessData, timeData,  how='left', left_on=['Stud_ID','Topic'], right_on = ['Stud_ID','Topic'])
final = final_df.rename(columns={'Stud_ID': "stud_id", "Topic": 'problem_id', "Time": "duration"})

In [9]:
# `mb` is already imported in the first cell; this repeat import is a no-op once that cell has run.
import modelBuilder as mb
# Per the output below, this yields one row per problem_id with lower/upper
# columns for incorrect, hint and duration.
mb.distributionBuilder(final)

Unnamed: 0_level_0,incorrectLower,hintLower,durationLower,incorrectUpper,hintUpper,durationUpper
problem_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CAESAR,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000
Caesar_gc,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000
Lsn5_skill1,2.000000,0.000000,1.366667,2.000000,0.000000,1.366667
Lsn7_lists,8.000000,2.000000,23.033333,8.000000,2.000000,23.033333
PEX1,0.795876,12.708597,37.655661,1.408248,22.582806,55.188679
...,...,...,...,...,...,...
pex2,1.706114,0.000000,13.559933,2.985732,0.440779,33.602074
pex2_gc,2.390562,0.000000,18.511964,4.523960,0.380021,39.120139
pex3,8.355244,0.000000,55.035533,13.726760,0.000000,63.257881
skywriter,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000


In [None]:
# Scratch cell (execution count is None, i.e. it has no recorded run).
# NOTE(review): this rebinds `data`, the name earlier cells use for the cleaned submission frame.
# NOTE(review): `random` is imported but not used in this cell.
# The commented-out lines below read like the body of an initializer function
# (they end in `return` and use `pairs`/`Student`, which are not defined here)
# — presumably copied from main.initialize; TODO confirm or delete.
import modelBuilder as mb
import random
data = mb.importData()  # Get the data and question answer pairs
#     distributions = mb.distributionBuilder(data)  # build the chi-squared upper and lower bound dist for each Q
#     questionDifficulty = mb.difficulty(data)  # build student perceived difficulty model
#     problemNum, problem, answer = mb.startingQ(data, pairs, questionDifficulty)  # get the middle difficulty question
#     student = Student(problemNum, problem, answer)  # Initialize student with the starting problem and answer
#     return student, data, pairs, questionDifficulty, distributions  # return all that we found

In [10]:
# Display the per-problem difficulty table (columns: problem_id, difficulty).
# NOTE(review): the only visible assignment of `questionDifficulty` is in a later
# cell (from main.initialize()), so this cell depends on out-of-order execution.
questionDifficulty

Unnamed: 0,problem_id,difficulty
0,lsn23_soundboard,0.000000
1,lsn22_bouncingball,0.000000
2,a2_4_ques,0.000000
3,a1_7_dog,0.000000
4,a1_7_pilot_quals,0.000000
...,...,...
395,lsn15_survivors_by_gender,0.723231
396,lsn24_nasa,0.738894
397,Lsn7_lists,0.887108
398,to_the_end,0.908658


In [12]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import modelBuilder as mb
import main

# Reload the raw submission log (same steps as the first cell).
df = pd.read_csv('data.csv')
df['Time'] = pd.to_datetime(df['Time'])  # timestamps as datetime64
data = df.drop(columns=['Sourcehash'])  # working copy without the source hash

# All lsn6_math submissions of one student, used as the example attempt history.
is_lsn6_math = df.Topic == "lsn6_math"
is_example_student = df.Stud_ID == "dee9256848fdca4942f5f3243ace75be0f7f64c9"
# Take an explicit copy so any column assignment inside the helper acts on an
# independent frame rather than on a slice of `df`.
# NOTE(review): the SettingWithCopyWarning printed below this cell presumably
# comes from convertSQLtoClean assigning onto the slice — confirm in modelBuilder.
dayda = df.loc[is_lsn6_math & is_example_student].copy()
qData = mb.convertSQLtoClean(dayda)
problem, questionDifficulty, distributions = main.initialize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [13]:
# Display the example student's lsn6_math submissions (six rows in the output below).
dayda

Unnamed: 0,Stud_ID,Topic,Time,Graded,Score,Sourcehash
328274,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:20:54,1,0.0,ad44202e36aaab0ba47c735a258f838bda96c3a9
328276,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:21:38,1,0.0,bc40bf92b3241bf18ff48c851bece5f535cf54da
328277,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:21:50,1,0.0,410012ce92cf1b0d574ea2703f80524626af4bdd
328278,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:24:25,1,0.0,62786f7d0627ffb83cf40ead635ca5a6c69361f4
328279,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:24:57,1,0.0,88fe441ac9ccfefcffdadd246639e20d2dccd449
328280,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:26:55,1,100.0,e9ab4039b2b26249f736dee75fc29694a172ebd7


In [14]:
# Pick the example student's first two lsn6_math submissions by exact timestamp.
example_rows = (df.Topic == "lsn6_math") & (df.Stud_ID == "dee9256848fdca4942f5f3243ace75be0f7f64c9")
firstAttempt = df.loc[example_rows & (df.Time == "2021-07-20 21:20:54")]
secondAttempt = df.loc[example_rows & (df.Time == "2021-07-20 21:21:38")]

In [23]:
# Stack the two single-row attempt frames into one history, keeping their
# original row labels. pd.concat replaces the DataFrame.append calls, which
# are deprecated and unavailable in pandas 2.x.
currentProb = pd.concat([firstAttempt, secondAttempt])
currentProb

Unnamed: 0,Stud_ID,Topic,Time,Graded,Score,Sourcehash
328274,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:20:54,1,0.0,ad44202e36aaab0ba47c735a258f838bda96c3a9
328276,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:21:38,1,0.0,bc40bf92b3241bf18ff48c851bece5f535cf54da


In [15]:
# Display the single row selected for the 21:20:54 submission.
firstAttempt

Unnamed: 0,Stud_ID,Topic,Time,Graded,Score,Sourcehash
328274,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:20:54,1,0.0,ad44202e36aaab0ba47c735a258f838bda96c3a9


In [16]:
# Display the single row selected for the 21:21:38 submission.
secondAttempt

Unnamed: 0,Stud_ID,Topic,Time,Graded,Score,Sourcehash
328276,dee9256848fdca4942f5f3243ace75be0f7f64c9,lsn6_math,2021-07-20 21:21:38,1,0.0,bc40bf92b3241bf18ff48c851bece5f535cf54da


In [4]:
# Ask the model for the next problem; in the recorded run it returned the
# problem id string 'lsn14_class2017'.
# NOTE(review): presumably `qData` is the student's cleaned attempt history — confirm in modelBuilder.
mb.nextQuestion(problem, questionDifficulty, distributions, qData)

'lsn14_class2017'

In [5]:
# Look up the difficulty row of the recommended problem.
is_recommended = questionDifficulty['problem_id'] == 'lsn14_class2017'
questionDifficulty.loc[is_recommended]

Unnamed: 0,problem_id,difficulty
266,lsn14_class2017,0.182921


In [25]:
# Empty frame with the raw submission columns, used by the row-append experiments that follow.
# NOTE(review): this rebinds `df`, replacing the frame loaded from data.csv.
df = pd.DataFrame(columns = ["Stud_ID","Topic","Time","Graded","Score","Sourcehash"])

In [29]:
# Experiment: appending a plain list does not add one row. Per the output below,
# each element becomes its own row under a new column named 0 and the schema
# columns are all empty. The result is not assigned back to `df`.
# NOTE(review): DataFrame.append is deprecated/removed in newer pandas — confirm the installed version.
df.append(list([1,2,3,4,5,6]))

Unnamed: 0,0,Graded,Score,Sourcehash,Stud_ID,Time,Topic
0,1.0,,,,,,
1,2.0,,,,,,
2,3.0,,,,,,
3,4.0,,,,,,
4,5.0,,,,,,
5,6.0,,,,,,


In [34]:
# Experiment: raises IndexError ("single positional indexer is out-of-bounds")
# in the recorded run — there is no row at position 0 to read.
df.iloc[0].append([1,2,3,4,5,6])

IndexError: single positional indexer is out-of-bounds

In [37]:
# Experiment: a flat list becomes a single column named 0 with one row per element (see output).
pd.DataFrame([1,1,1,1,1,1])

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1


In [41]:
# Experiment: raises IndexError ("iloc cannot enlarge its target object") in the
# recorded run — positional assignment cannot create a new row.
df.iloc[0] = [[1,1,1,1,1,1]]

IndexError: iloc cannot enlarge its target object

In [56]:
# Working approach: describe the new row as a Series labelled with the frame's
# columns, then concatenate it as a one-row frame. pd.concat replaces
# DataFrame.append, which is deprecated and unavailable in pandas 2.x.
attempt = pd.DataFrame(columns = ["Stud_ID","Topic","Time","Graded","Score","Sourcehash"])
series = pd.Series((2,1,3,4,5,6), index = attempt.columns)
# to_frame().T turns the Series into a single row whose columns match `attempt`;
# ignore_index renumbers the rows from 0.
attempt = pd.concat([attempt, series.to_frame().T], ignore_index=True)

In [57]:
# Display the frame after the append (one row in the recorded output).
attempt

Unnamed: 0,Stud_ID,Topic,Time,Graded,Score,Sourcehash
0,2,1,3,4,5,6


In [50]:
# Add `series` as one more row at the end of `attempt`, renumbering rows from 0.
# pd.concat replaces DataFrame.append, which is deprecated and unavailable in pandas 2.x.
attempt = pd.concat([attempt, series.to_frame().T], ignore_index=True)

In [59]:
# Start over with an empty frame carrying the raw submission columns, then display it.
attempt_columns = ["Stud_ID", "Topic", "Time", "Graded", "Score", "Sourcehash"]
attempt = pd.DataFrame(columns=attempt_columns)
attempt

Unnamed: 0,Stud_ID,Topic,Time,Graded,Score,Sourcehash


In [54]:
# NOTE(review): `a_series` is not assigned in any visible cell; this display
# relies on a value left in the kernel by a cell that is no longer present.
a_series

Stud_ID       2
Topic         3
Time          4
Graded        5
Score         6
Sourcehash    7
dtype: int64