# Meta Kaggle Dashboards
_To be used on a web page_

In [1]:
#import linear algebra and data manipulation libraries
import numpy as np
import pandas as pd

#import standard visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
from plotly import tools

# plotly and cufflinks in offline mode
import cufflinks as cf
# Put cufflinks and the plotly notebook renderer into offline mode.
# NOTE(review): connected=True presumably makes both load plotly.js from a CDN
# instead of embedding it, so rendering would need internet access - confirm
# against the plotly/cufflinks docs.
cf.go_offline(connected=True)
init_notebook_mode(connected=True)

In [10]:
import datetime as dt
from datetime import date, timedelta

## Import Datasets

Load the Meta Kaggle tables from their CSV files:

In [58]:
# Load each Meta Kaggle table into its own DataFrame.
# The paths are relative, so the 'meta-kaggle' folder must sit in the
# notebook's working directory.
kernels = pd.read_csv('meta-kaggle/Kernels.csv')
tags = pd.read_csv('meta-kaggle/Tags.csv')
competitions = pd.read_csv('meta-kaggle/Competitions.csv')
users = pd.read_csv('meta-kaggle/Users.csv')
teams = pd.read_csv('meta-kaggle/Teams.csv')
submissions = pd.read_csv('meta-kaggle/Submissions.csv')
datasets = pd.read_csv('meta-kaggle/Datasets.csv')

In [5]:
# Preview the first two competitions rows to see which columns were loaded.
competitions.head(2)

Unnamed: 0,Id,Slug,Title,Subtitle,HostSegmentTitle,ForumId,OrganizationId,CompetitionTypeId,HostName,EnabledDate,...,EnableSubmissionModelHashes,EnableSubmissionModelAttachments,RewardType,RewardQuantity,NumPrizes,UserRankMultiplier,CanQualifyTiers,TotalTeams,TotalCompetitors,TotalSubmissions
0,2408,Eurovision2010,Forecast Eurovision Voting,This competition requires contestants to forec...,Featured,2.0,,1,,4/7/2010 7:57:43 AM,...,False,False,USD,1000.0,1,1.0,False,22,25,22
1,8910,blg-454e-term-project-competition,BLG 454E Term Project Competition,"ITU Computer and Informatics Faculty, BLG 454E...",InClass,24780.0,,1,,3/17/2018 7:08:56 PM,...,False,False,,0.0,0,0.0,False,29,65,658


In [7]:
# Preview the first two submissions rows to see which columns were loaded.
submissions.head(2)

Unnamed: 0,Id,SubmittedUserId,TeamId,SourceKernelVersionId,SubmissionDate,ScoreDate,IsAfterDeadline,PublicScoreLeaderboardDisplay,PublicScoreFullPrecision,PrivateScoreLeaderboardDisplay,PrivateScoreFullPrecision
0,459722,67252.0,53319,,10/7/2013,,False,0.07762,0.0776275,0.0644,0.0644099
1,402723,67252.0,53319,,7/30/2013,,False,0.08501,0.0850198,0.07239,0.0723972


In [8]:
# Preview the first two teams rows to see which columns were loaded.
teams.head(2)

Unnamed: 0,Id,CompetitionId,TeamLeaderId,TeamName,ScoreFirstSubmittedDate,LastSubmissionDate,PublicLeaderboardSubmissionId,PrivateLeaderboardSubmissionId,IsBenchmark,Medal,MedalAwardDate,PublicLeaderboardRank,PrivateLeaderboardRank
0,951,2448,368.0,Naive Baseline,8/9/2010,8/9/2010,4980.0,4980.0,False,,9/6/2018,35.0,32.0
1,957,2448,2100.0,leecbaker,9/17/2010,9/17/2010,7096.0,7096.0,False,1.0,7/15/2016,1.0,1.0


In [9]:
# Preview the first two users rows to see which columns were loaded.
users.head(2)

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier
0,1,kaggleteam,Kaggle Team,3/24/2011,5
1,368,antgoldbloom,Anthony Goldbloom,1/20/2010,5


In [59]:
# Preview the first two datasets rows to see which columns were loaded.
datasets.head(2)

Unnamed: 0,Id,CreatorUserId,OwnerUserId,OwnerOrganizationId,CurrentDatasetVersionId,CurrentDatasourceVersionId,ForumId,Type,CreationDate,ReviewDate,FeatureDate,LastActivityDate,TotalViews,TotalDownloads,TotalVotes,TotalKernels
0,52146,1,,223.0,281965.0,294410.0,60824,2,9/12/2018 9:13:38 PM,,,9/12/2018,179,3,0,1
1,52222,1135944,1135944.0,,97664.0,100200.0,60904,2,9/13/2018 5:06:57 AM,,,9/13/2018,151,4,1,2


## Plot Dashboard for Kaggle Activities over time

Create a dashboard showing the cumulative number of competitions (both active and closed), datasets, kernels, and users over time.

The date-processing code is adapted from __[this Kaggle kernel](https://www.kaggle.com/gaborfodor/kaggle-trends)__.

In [53]:
#function returns first day of the week by date

def date_to_first_day_of_week(day: date) -> date:
    """Return the Monday of the week that contains ``day``.

    ``weekday()`` is 0 for Monday, so stepping back that many days lands on
    the start of the week; a Monday is returned unchanged.
    """
    days_since_monday = timedelta(days=day.weekday())
    return day - days_since_monday

In [63]:
# Cumulative number of competitions per week, where each competition is
# assigned to the Monday of the week in which it was enabled.

competitions['Count'] = 1
competitions['EnabledWeek'] = [date_to_first_day_of_week(pd.Timestamp(d).date()) for d in competitions['EnabledDate']]

# One row per week holding the number of competitions enabled in that week.
weekly_competitions = competitions.groupby('EnabledWeek')[['Count']].sum()
weekly_competitions.reset_index(level=0, inplace=True)

def enabled_week_to_cumulative_count(date):
    """Return how many competitions were enabled in or before week ``date``.

    Reads the module-level ``weekly_competitions`` frame at call time, so the
    result reflects whatever rows that frame holds when the function is
    called (after the 2015 filter below, earlier weeks are no longer counted).
    """
    return weekly_competitions[weekly_competitions['EnabledWeek'] <= date]['Count'].sum()

# Must call the helper defined just above; the previous name
# (week_to_cumulative_count) does not exist and raised NameError on a fresh kernel.
weekly_competitions['CumulativeCount'] = [enabled_week_to_cumulative_count(d) for d in weekly_competitions['EnabledWeek']]

weekly_competitions = weekly_competitions.sort_values(by='EnabledWeek', ascending=True)

# Totals are accumulated over the full history first; only then is the
# plotted window restricted to 2015 onwards.
weekly_competitions = weekly_competitions[weekly_competitions['EnabledWeek'] >= date(2015,1,1)]

In [64]:
# Weekly and cumulative dataset counts, bucketed by the Monday of the week
# in which each dataset was created.

datasets['Count'] = 1
datasets['CreationWeek'] = [
    date_to_first_day_of_week(pd.Timestamp(raw_value).date())
    for raw_value in datasets['CreationDate']
]

# Number of datasets created in each week, with the week as a regular column.
weekly_datasets = datasets.groupby('CreationWeek')[['Count']].sum()
weekly_datasets.reset_index(level=0, inplace=True)

def creation_week_to_cumulative_count(date):
    """Return how many datasets were created in or before week ``date``.

    Looks up the module-level ``weekly_datasets`` frame at call time.
    """
    up_to_week = weekly_datasets['CreationWeek'] <= date
    return weekly_datasets[up_to_week]['Count'].sum()

weekly_datasets['CumulativeCount'] = [
    creation_week_to_cumulative_count(week_start)
    for week_start in weekly_datasets['CreationWeek']
]

# Order chronologically, then keep only weeks from 2015 onwards
# (the cumulative totals above still include earlier weeks).
weekly_datasets = weekly_datasets.sort_values(by='CreationWeek', ascending=True)
from_2015 = weekly_datasets['CreationWeek'] >= date(2015,1,1)
weekly_datasets = weekly_datasets[from_2015]
del from_2015

In [72]:
# Cumulative number of kernels per week, where each kernel is assigned to the
# Monday of the week in which it was created.

# Rows without a CreationDate cannot be bucketed into a week, so drop them.
# The explicit .copy() makes kernels_nonan an independent frame, so the column
# assignments below neither raise SettingWithCopyWarning nor touch `kernels`.
kernels_nonan = kernels.dropna(subset=['CreationDate']).copy()

kernels_nonan['Count'] = 1
kernels_nonan['CreationWeek'] = [date_to_first_day_of_week(pd.Timestamp(d).date()) for d in kernels_nonan['CreationDate']]

# One row per week holding the number of kernels created in that week.
weekly_kernels = kernels_nonan.groupby('CreationWeek')[['Count']].sum()
weekly_kernels.reset_index(level=0, inplace=True)

def kernel_creation_week_to_cumulative_count(date):
    """Return how many kernels were created in or before week ``date``.

    Reads the module-level ``weekly_kernels`` frame at call time.
    """
    return weekly_kernels[weekly_kernels['CreationWeek'] <= date]['Count'].sum()

weekly_kernels['CumulativeCount'] = [kernel_creation_week_to_cumulative_count(d) for d in weekly_kernels['CreationWeek']]

weekly_kernels = weekly_kernels.sort_values(by='CreationWeek', ascending=True)

# Totals are accumulated over the full history first; only then is the
# plotted window restricted to 2015 onwards.
weekly_kernels = weekly_kernels[weekly_kernels['CreationWeek'] >= date(2015,1,1)]

In [76]:
# Weekly and cumulative user counts, bucketed by the Monday of the week in
# which each user registered.

users['Count'] = 1
users['RegisterWeek'] = [
    date_to_first_day_of_week(pd.Timestamp(registered).date())
    for registered in users['RegisterDate']
]

# Number of registrations in each week, with the week as a regular column.
weekly_users = users.groupby('RegisterWeek')[['Count']].sum()
weekly_users.reset_index(level=0, inplace=True)

def register_week_to_cumulative_count(date):
    """Return how many users registered in or before week ``date``.

    Looks up the module-level ``weekly_users`` frame at call time.
    """
    registered_by_then = weekly_users[weekly_users['RegisterWeek'] <= date]
    return registered_by_then['Count'].sum()

weekly_users['CumulativeCount'] = [
    register_week_to_cumulative_count(week_start)
    for week_start in weekly_users['RegisterWeek']
]

# Order chronologically, then keep only weeks from 2015 onwards
# (the cumulative totals above still include earlier weeks).
weekly_users = weekly_users.sort_values(by='RegisterWeek', ascending=True)
weekly_users = weekly_users[weekly_users['RegisterWeek'] >= date(2015,1,1)]

Plot dashboard:

In [85]:
# One line trace per activity type: x is the week start, y is the running total.
data = [
    # competitions
    go.Scatter(
        name='Competitions',
        mode='lines',
        x=weekly_competitions['EnabledWeek'].values,
        y=weekly_competitions['CumulativeCount'].values,
        line={'width': 4, 'color': '#68B6AF'},
    ),

    # datasets
    go.Scatter(
        name='Datasets',
        mode='lines',
        x=weekly_datasets['CreationWeek'].values,
        y=weekly_datasets['CumulativeCount'].values,
        line={'width': 4, 'color': '#82C5A0'},
    ),

    # kernels
    go.Scatter(
        name='Kernels',
        mode='lines',
        x=weekly_kernels['CreationWeek'].values,
        y=weekly_kernels['CumulativeCount'].values,
        line={'width': 4, 'color': '#EED2BB'},
    ),

    # users
    go.Scatter(
        name='Users',
        mode='lines',
        x=weekly_users['RegisterWeek'].values,
        y=weekly_users['CumulativeCount'].values,
        line={'width': 4, 'color': '#7FDBE2'},
    ),
]

layout = go.Layout(
    title='Overall number of Kaggle activities over time',
    showlegend=True,
    xaxis={'title': 'WeekStart', 'ticklen': 5, 'zeroline': False, 'gridwidth': 2},
    yaxis={'title': 'Number of activities', 'ticklen': 5, 'gridwidth': 2},
)

fig = go.Figure(data=data, layout=layout)
# `py` is plotly.plotly (not plotly.offline).
# NOTE(review): this presumably publishes the figure to the Plotly account
# under the given filename, which would need credentials - confirm.
py.iplot(fig, filename='kaggle_activity_dashboard')

In [86]:
# Same chart restricted to competitions and datasets only.
data = [
    # competitions
    go.Scatter(
        name='Competitions',
        mode='lines',
        x=weekly_competitions['EnabledWeek'].values,
        y=weekly_competitions['CumulativeCount'].values,
        line={'width': 4, 'color': '#68B6AF'},
    ),

    # datasets
    go.Scatter(
        name='Datasets',
        mode='lines',
        x=weekly_datasets['CreationWeek'].values,
        y=weekly_datasets['CumulativeCount'].values,
        line={'width': 4, 'color': '#82C5A0'},
    ),
]

layout = go.Layout(
    title='Overall number of Kaggle activities over time',
    showlegend=True,
    xaxis={'title': 'WeekStart', 'ticklen': 5, 'zeroline': False, 'gridwidth': 2},
    yaxis={'title': 'Number of activities', 'ticklen': 5, 'gridwidth': 2},
)

fig = go.Figure(data=data, layout=layout)
# NOTE(review): this filename is identical to the one used for the four-trace
# chart; if py.iplot stores figures by filename, this call may replace that
# chart - confirm this is intended.
py.iplot(fig, filename='kaggle_activity_dashboard')