In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Code01 Essentials

### Evaluation Function

In [2]:
# evaluation function (code from https://competitions.codalab.org/forums/22145/4564/)
def evaluate(truth, submission):
    # extract ranking
    left = list(truth.left)
    right = list(truth.right)
#     if len(left) != len(right):
#     message = 'left and right lengths are not the same'
#     sys.exit(message)

    submission_left = []
    submission_right = []
    submission_preference = []
    for idx in range(len(left)):
        submission_left.append(left[idx])
        submission_right.append(right[idx])
        ranking_left = submission[submission.QuestionId==left[idx]].ranking.values[0]
        ranking_right = submission[submission.QuestionId==right[idx]].ranking.values[0]
        preference = 1 if ranking_left < ranking_right else 2
        submission_preference.append(preference)
    # print(submission_preference)
    return submission_preference

### Get Datas

In [3]:
# dataframes
data = pd.read_csv('data/train_data/train_task_3_4.csv')
eval_vali = pd.read_csv('data/test_data/quality_response_remapped_public.csv')
eval_test = pd.read_csv('data/test_data/quality_response_remapped_private.csv')

# submission template
template = pd.read_csv('submission/template.csv')

# metadatas
student_metadata = pd.read_csv('data/metadata/student_metadata_task_3_4.csv')
subject_metadata = pd.read_csv('data/metadata/subject_metadata.csv')
answer_metadata = pd.read_csv('data/metadata/answer_metadata_task_3_4.csv')
question_metadata = pd.read_csv('data/metadata/question_metadata_task_3_4.csv')

### Data Preprocessing

In [4]:
# 1. Subject Grouping - SubId
QMD = question_metadata.copy()
QMD['SubId'] = QMD.SubjectId.apply(lambda x: x[1:-2].split(', ')[1])
QMD.drop(columns=['SubjectId'], inplace=True)

# 2. Position of Correct Answer - CorrectAnswer
DT = data.copy()
DT.drop(columns=['UserId', 'AnswerId','IsCorrect', 'AnswerValue'], inplace=True)
DT.drop_duplicates(['QuestionId'], inplace=True, ignore_index=True)

# 3. Ratio of Correct Resopnse - CRRatio
DT2 = data.copy()
DT2.drop(columns=['UserId', 'AnswerId', 'CorrectAnswer', 'AnswerValue'], inplace=True)
CRR = DT2.groupby('QuestionId').mean().rename(columns={'IsCorrect':'CRRatio'})

# merge to Submission Template 
mydata = pd.read_csv('submission/template.csv')
mydata = pd.merge(left = mydata, right = QMD, how = 'inner', on = 'QuestionId')
mydata = pd.merge(mydata, DT, 'inner', 'QuestionId')
mydata = pd.merge(mydata, CRR, 'inner', 'QuestionId')

display(mydata)

Unnamed: 0,QuestionId,ranking,SubId,CorrectAnswer,CRRatio
0,0,,32,1,0.443457
1,1,,32,3,0.571429
2,2,,32,2,0.385214
3,3,,32,2,0.808757
4,4,,71,3,0.401408
...,...,...,...,...,...
943,943,,32,4,0.566528
944,944,,32,4,0.142857
945,945,,32,1,0.422336
946,946,,32,2,0.459459


### Preperation for Fitting

In [5]:
# get every quesion ids used in validation set
qids_of_vali = eval_vali.left.tolist() + eval_vali.right.tolist()
qids_of_vali.sort()

# assign random ranking to every questions
np.random.seed(20182453)
mydata.ranking = np.random.rand(len(mydata))

# set the sole decision(Final) from five evaluator decisions
eVali = eval_vali.copy()
eTest = eval_test.copy()
eVali['Final'] = eVali.filter(regex='^T', axis = 1).mean(axis=1).apply(lambda x: 2 if x > 1.5 else 1)
eTest['Final'] = eTest.filter(regex='^T', axis = 1).mean(axis=1).apply(lambda x: 2 if x > 1.5 else 1)

# swap ranking values to based on Final decision
for row in eVali.itertuples():
    if row.Final == 1:
        if mydata[mydata.QuestionId == row.left].ranking.values[0] < mydata[mydata.QuestionId == row.right].ranking.values[0]:
            tmp = mydata[mydata.QuestionId == row.left].ranking.values[0]
            mydata[mydata.QuestionId == row.left].ranking.values[0]= mydata[mydata.QuestionId == row.right].ranking.values[0]
            mydata[mydata.QuestionId == row.right].ranking.values[0]= tmp
    elif row.Final == 2:
        if mydata[mydata.QuestionId == row.left].ranking.values[0]> mydata[mydata.QuestionId == row.right].ranking.values[0]:
            tmp = mydata[mydata.QuestionId == row.left].ranking.values[0]
            mydata[mydata.QuestionId == row.left].ranking.values[0]= mydata[mydata.QuestionId == row.right].ranking.values[0]
            mydata[mydata.QuestionId == row.right].ranking.values[0]= tmp

### Fitting

In [6]:
# seperate mydata(stanard - if question is in validation set)
mydata_in_vali = mydata[mydata.QuestionId.isin(qids_of_vali)].copy()
mydata_not_vali = mydata[~mydata.QuestionId.isin(qids_of_vali)].copy()

In [7]:
# fit linear regression model(in validation set group)
y = mydata_in_vali[['ranking']]
X = mydata_in_vali[['SubId', 'CorrectAnswer', 'CRRatio']]
lin_reg = LinearRegression()
lin_reg.fit(X, y)
lin_reg.intercept_, lin_reg.coef_

(array([0.70814095]), array([[-0.002403  ,  0.02228532, -0.43510357]]))

In [8]:
# predict ranking by fit model to non-validation data
X2 = mydata_not_vali[['SubId', 'CorrectAnswer', 'CRRatio']]
mydata_not_vali['ranking'] = lin_reg.predict(X2)

mydata.update(mydata_not_vali)

In [9]:
# export result
mydata[['QuestionId', 'ranking']].to_csv('submission/Result01.csv', index=False)

### Evaluation

In [10]:
estimateV = evaluate(eval_test, mydata)
trueV = eTest['Final'].tolist()

ev = np.array(estimateV)
tv = np.array(trueV)

samev = (ev == tv)
samev.sum() / samev.size

0.4