# Capstone 1 - Exploratory Analysis

In [61]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
from tools import *
from sklearn.model_selection import train_test_split
import scipy.stats as stats

# enable offline plotting in plotly
init_notebook_mode(connected=True)

In [24]:
users = pd.read_csv('./data/user_features.csv')
submissions = pd.read_csv('./data/train_submissions.csv')
problems = pd.read_csv('./data/problem_features.csv')

I'll start by merging the user and problem features into the submissions data set. This will give me one table with all of the necessary training data.

In [25]:
# join the user features into the submissions data set.
merge_users = submissions.set_index('user_id', drop=True).join(users.set_index('user_id', drop=True)).reset_index()

# now join the problem features into merge_users for the final training data set.
train = merge_users.set_index('problem_id', drop=True).join(problems.set_index('problem_id', drop=True)).reset_index()
train.set_index(['user_id', 'problem_id'], inplace=True, drop=True)

# for now, we will ignore rows with any missing values.
train = train[~train.isnull().any(axis=1)]
train.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,attempts_range,submission_count,problem_solved,contribution,country,follower_count,last_online_time_seconds,max_rating,rating,rank,...,string,strings,structures,suffix,ternary,the,theorem,theory,trees,two
user_id,problem_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
user_2517,prob_1,2,924,827,0,Russia,64,1505407698,581.995,581.995,advanced,...,0,0,0,0,0,0,0,0,0,0
user_2769,prob_1,1,110,98,-1,United States,0,1504710178,348.05,343.75,intermediate,...,0,0,0,0,0,0,0,0,0,0
user_22,prob_10,6,170,158,16,Vietnam,67,1505526912,489.392,474.197,advanced,...,0,0,0,0,0,0,0,0,0,0
user_1694,prob_100,1,340,333,0,,6,1504945053,508.888,508.888,advanced,...,0,0,0,0,0,0,0,0,0,0
user_1000,prob_1000,1,259,235,0,India,41,1505579889,371.273,336.583,intermediate,...,0,0,0,0,0,0,0,0,0,0
user_1017,prob_1000,1,373,357,-2,Kazakhstan,24,1498467892,311.067,252.867,beginner,...,0,0,0,0,0,0,0,0,0,0
user_1022,prob_1000,1,100,93,0,India,2,1505579737,368.406,367.259,intermediate,...,0,0,0,0,0,0,0,0,0,0
user_1050,prob_1000,1,232,208,0,,44,1498587392,315.08,217.603,beginner,...,0,0,0,0,0,0,0,0,0,0
user_1082,prob_1000,2,37,22,0,,4,1504530893,332.856,206.995,beginner,...,0,0,0,0,0,0,0,0,0,0
user_110,prob_1000,1,208,164,0,Bangladesh,17,1505504177,312.5,145.356,beginner,...,0,0,0,0,0,0,0,0,0,0


Now the first column is the value we want to predict, while all other columns are the predictor variables from both the user and problem meta data.

### Testing for User Feature Stratification

In [26]:
users[['country', 'rank']].groupby('rank').count()

Unnamed: 0_level_0,country
rank,Unnamed: 1_level_1
advanced,570
beginner,1509
expert,82
intermediate,1410


The table above shows the number of unique users in each rank. The vast majority of users are ranked beginner and intermediate.

### Does the distribution of the number of problem attempts differ between user ranks?

I want to know if there is a difference in how long it takes users in the different ranks to solve a problem. For example, on average, do experts solve problems in fewer attempts than beginners? To answer this question, I will look at the distribution of attempts_range for each rank category. Since there is a big difference in the number of users in each rank, I'll have to normalize the distributions by the number of users. Thus for each attempts_range, I will calculate the average number of problems solved by a user in that attempts_range, by rank.

In [45]:
# subset each rank into its own dataframe
adv = train[train['rank']=='advanced']
beg = train[train['rank']=='beginner']
exp = train[train['rank']=='expert']
inter = train[train['rank']=='intermediate']

In [59]:
# for each rank, count the number of problems that were
# solved in each attempts_range category. Then divide 
# this count by the number of users in that rank.
adv_counts = adv[['attempts_range', 'rank']].groupby('attempts_range').count()
adv_counts_norm = round(adv_counts/570, 1)

beg_counts = beg[['attempts_range', 'rank']].groupby('attempts_range').count()
beg_counts_norm = round(beg_counts/1509, 1)

exp_counts = exp[['attempts_range', 'rank']].groupby('attempts_range').count()
exp_counts_norm = round(exp_counts/82, 1)

inter_counts = inter[['attempts_range', 'rank']].groupby('attempts_range').count()
inter_counts_norm = round(inter_counts/1410, 1)

In [60]:
# plot the proportions for each rank and attempts_range category
trace1 = go.Bar(x=adv_counts_norm.index, y=adv_counts_norm['rank'], name='Advanced')
trace2 = go.Bar(x=beg_counts_norm.index, y=beg_counts_norm['rank'], name='Beginner')
trace3 = go.Bar(x=exp_counts_norm.index, y=exp_counts_norm['rank'], name='Expert')
trace4 = go.Bar(x=inter_counts_norm.index, y=inter_counts_norm['rank'], name='Intermediate')

layout = go.Layout(title='Average Number of Problems Solved Per User by Rank',
                  yaxis=dict(title='Average Number of Problems Solved Per User'),
                  xaxis=dict(title='Attempts Range'))

fig = go.Figure([trace1, trace2, trace3, trace4], layout=layout)

iplot(fig, filename='rank_distributions.html')

The bar plot above shows the average number of problems solved by a user in each attempts_range, by rank. If we focus on attempts_range=1, we see that the relative heights of the bars are what we would expect. The beginner (orange) rank has the lowest number of problems solved per user at ~18. The expert (green) rank on the otherhand has the largest number of problems solved per user, ~40. In other words, experts on average solve twice as many problems in a single attempt than beginners do. The intermediate This relationship holds for problems solved in an attempts_range of 2 (2-3 attempts). At an attempts_range of 3 or higher, there is less of a difference between groups.

To measure the significance of these differences, I will perform a chi-square test for the expert rank, comparing it to the average number of problems solved by a user for the other 3 ranks. I'll do this by averaging the number of problems solved by a user for the beginner, intermediate, and advanced ranks, and use this average distribution as my expected frequency distribution.

In [83]:
counts = pd.DataFrame({'beg': beg_counts_norm['rank'], 'inter': inter_counts_norm['rank'], 
                         'adv': adv_counts_norm['rank'], 'exp': exp_counts_norm['rank']})

exp_compare = pd.DataFrame({'others': counts[['beg', 'inter', 'adv']].mean(axis=1), 'exp': exp_counts_norm['rank']})
exp_compare

Unnamed: 0_level_0,exp,others
attempts_range,Unnamed: 1_level_1,Unnamed: 2_level_1
1,39.8,24.133333
2,17.2,13.7
3,3.6,4.033333
4,1.3,1.6
5,0.7,0.733333
6,1.1,0.933333


The table above shows the average number of problems solved by a user for the exper rank vs the 3 other ranks. The null hypothesis here is that there is no difference between the expert distribution and the distribution of the average of the other 3 ranks, "others".

H0: exp dist = others dist
HA; exp dist != others dist

I will use alpha=0.05 as the significance level and perform a chi-square test.

In [84]:
stats.chisquare(exp_compare.exp, exp_compare.others)

Power_divergenceResult(statistic=11.198594021966825, pvalue=0.04758156528080048)

The p-value comes out to 0.048 which is less than 0.05, thus the null hypothesis can be rejected in support of the alternate hypothesis. So there is a satistically significant difference in the average number of attempts experts require to solve a problem when compared to the other ranks. This result suggests that the user rank can be an important feature in determining how many attempts a user will require to solve a given problem.

We can also look at what numerical features correlate best to the attempts_range.

In [87]:
corr = train.corr().loc['submission_count':, 'attempts_range']
corr

submission_count            -0.005159
problem_solved              -0.004948
contribution                -0.023961
follower_count              -0.015042
last_online_time_seconds     0.010730
max_rating                  -0.042758
rating                      -0.039156
registration_time_seconds    0.042410
user_attempts_median         0.183970
user_attempts_min            0.026607
user_attempts_max            0.133945
user_attempts_count         -0.121968
user_attempts_iqr            0.023305
points                       0.128075
problem_attempts_median      0.419804
problem_attempts_min         0.160595
problem_attempts_max         0.194438
problem_attempts_count      -0.125028
problem_attempts_iqr         0.108099
algorithms                  -0.014884
and                          0.053102
binary                       0.059855
bitmasks                     0.010472
brute                        0.004302
chinese                      0.004650
combinatorics                0.019151
conquer     