In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.plotly as plotl
import plotly.graph_objs as go
import plotly.tools as tls
from wordcloud import WordCloud
import seaborn as sns
sns.set(color_codes=True)
plt.style.use('ggplot')  

In [None]:
def vocabSavedRange(dl):
    """Map a percentage of saved vocabulary to a display bucket label.

    Buckets are closed on their upper bound: a value equal to 10 falls in
    '2-10', a value just above 10 falls in '11-20', and so on. Anything
    above 100 is labelled '> 100'.

    Returns the string 'None' for values that are not >= 0 (negative
    numbers and NaN).
    """
    # NaN compares False against everything, so it is caught here together
    # with negative inputs.
    if not dl >= 0:
        return 'None'

    upper_bounds = (
        (1, '0-1'),
        (10, '2-10'),
        (20, '11-20'),
        (40, '21-40'),
        (60, '41-60'),
        (80, '61-80'),
        (100, '81-100'),
    )
    for upper, label in upper_bounds:
        if dl <= upper:
            return label
    return '> 100'

def listenScoreRange(dl):
    """Map a listening score to a display bucket label.

    Negative scores are labelled '< 0', scores above 100 are labelled
    '>100', and NaN yields the string 'None'. Every other bucket is closed
    on its upper bound (e.g. 49 -> '0-49', 49.5 -> '50-59').
    """
    if dl < 0:
        return '< 0'
    # Only NaN reaches this point without being >= 0.
    if not dl >= 0:
        return 'None'

    upper_bounds = (
        (49, '0-49'),
        (59, '50-59'),
        (62, '60-62'),
        (66, '63-66'),
        (69, '67-69'),
        (72, '70-72'),
        (76, '73-76'),
        (79, '77-79'),
        (84, '80-84'),
        (89, '85-89'),
        (100, '90-100'),
    )
    for upper, label in upper_bounds:
        if dl <= upper:
            return label
    return '>100'


In [None]:
def numOfUsersbyAvgDictionarySize(preppedDataWithoutSection):
    """Count users per bucket of average percentage of words saved.

    Reads the JSON given by ``preppedDataWithoutSection``, converts
    ``avg_words_saved`` to a percentage, averages every column per ``user``
    and labels each user with ``vocabSavedRange``.

    Returns a Series indexed by bucket label whose values are user counts.

    Author's notes on the result:
    More than half of students save an avg of 1-20% of words in their dictionaries
    An avg. video vocab count is 239, so between 10 to 25 words is saved from video on average by most users
    """
    records = pd.read_json(preppedDataWithoutSection)
    records['avg_words_saved_perc'] = records['avg_words_saved'] * 100

    # Mean of the per-video averages, one row per user.
    per_user = records.groupby(['user']).mean()
    per_user['user_group'] = per_user['avg_words_saved_perc'].map(vocabSavedRange)

    return per_user.groupby(['user_group']).size()

values = numOfUsersbyAvgDictionarySize('prepped-wo-section.json')

# Horizontal bar chart: one bar per words-saved bucket, bar length = user count.
final = go.Bar(x=values.values, y=values.index, orientation='h')
data = [final]
layout = go.Layout(
    title='Exploring students dictionary usage',
    xaxis={
        'title': '# of students',
        'titlefont': {
            'family': 'Courier New, monospace',
            'size': 18,
            'color': '#7f7f7f',
        },
    },
    yaxis={
        'title': 'Avg. % of words saved overall',
        'titlefont': {
            'family': 'Courier New, monospace',
            'size': 18,
            'color': '#7f7f7f',
        },
    },
)
fig = go.Figure(data=data, layout=layout)
#plotl.iplot(fig, filename='numOfUsersbyAvgDictionarySize')

In [None]:
def numOfUsersByScoreRange(preppedDataWithoutSection):
    """Count users per bucket of average listening score.

    Reads the JSON given by ``preppedDataWithoutSection``, drops rows whose
    ``postId`` is 3913, averages every column per ``user`` and labels each
    user's ``avg_score`` with ``listenScoreRange``.

    Returns a Series indexed by score-range label whose values are user counts.
    """
    records = pd.read_json(preppedDataWithoutSection)

    # NOTE(review): the reason post 3913 is excluded is not recorded here - confirm.
    without_excluded_post = records[records['postId'] != 3913]

    per_user = without_excluded_post.groupby(['user']).mean()
    per_user['avg_score_range'] = per_user['avg_score'].map(listenScoreRange)

    return per_user.groupby(['avg_score_range']).size()

values2 = numOfUsersByScoreRange('prepped-wo-section.json')

# Horizontal bar chart: one bar per score range, bar length = user count.
final = go.Bar(x=values2.values, y=values2.index, orientation='h')
data = [final]
layout = go.Layout(
    title='Exploring overall students performance',
    xaxis={
        'title': '# of students',
        'titlefont': {
            'family': 'Courier New, monospace',
            'size': 18,
            'color': '#7f7f7f',
        },
    },
    yaxis={
        'title': 'Avg. score',
        'titlefont': {
            'family': 'Courier New, monospace',
            'size': 18,
            'color': '#7f7f7f',
        },
    },
)
fig = go.Figure(data=data, layout=layout)
#plotl.iplot(fig, filename='numOfUsersByScoreRange')

In [None]:
def AvgScorebyAvgDictionarySize(preppedDataWithoutSection):
    """Mean listening score per bucket of average percentage of words saved.

    Reads the JSON given by ``preppedDataWithoutSection``, keeps rows with
    ``avg_score >= 0``, averages every column per ``user``, replaces
    ``avg_words_saved`` with its ``vocabSavedRange`` bucket label and
    averages ``avg_score`` within each bucket.

    Returns a Series indexed by bucket label whose values are mean scores.
    """
    records = pd.read_json(preppedDataWithoutSection)

    # NaN scores (no scores for that user) fail the comparison and drop out,
    # as do any negative scores.
    scored = records[records['avg_score'] >= 0]

    per_user = scored.groupby(['user']).mean()
    # Fraction -> percent, then percent -> bucket label.
    saved_percent = 100 * per_user['avg_words_saved']
    per_user['avg_words_saved'] = saved_percent.map(vocabSavedRange)

    return per_user.groupby(['avg_words_saved'])['avg_score'].mean()

values3 = AvgScorebyAvgDictionarySize('prepped-wo-section.json')

# Horizontal bar chart: one bar per words-saved bucket, bar length = mean score.
final = go.Bar(x=values3.values, y=values3.index, orientation='h')
data = [final]
layout = go.Layout(
    title='Explore relationship between scores and dictionary usage',
    xaxis={
        'title': 'Avg. listen score',
        'titlefont': {
            'family': 'Courier New, monospace',
            'size': 18,
            'color': '#7f7f7f',
        },
    },
    yaxis={
        'title': 'Avg. % of words saved overall',
        'titlefont': {
            'family': 'Courier New, monospace',
            'size': 18,
            'color': '#7f7f7f',
        },
    },
)
fig = go.Figure(data=data, layout=layout)
#plotl.iplot(fig, filename='AvgScorebyAvgDictionarySize')

In [None]:
#http://stackoverflow.com/questions/17812978/how-to-plot-two-columns-of-a-pandas-data-frame-using-points
#http://stackoverflow.com/questions/28576540/how-can-i-normalize-the-data-in-a-range-of-columns-in-my-pandas-dataframe

##############################################
#     REMOVE STOP WORDS WITH NLTK
##############################################

import nltk
import json
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Load the per-video word lists and the per-student saved-word dictionaries.
video = pd.read_json('videoDataInfoNew.json')
student_behav = pd.read_json('dictionary.json')

# `stop` stays a list so anything else that reads it sees the same object type.
stop = stopwords.words('english')
# Hashed copy used for the membership tests below: a list lookup scans the
# whole stop-word list for every single word, a frozenset lookup does not.
# assumes every wordList entry is a hashable token (e.g. str) - TODO confirm
stop_lookup = frozenset(stop)

# Add a column holding each video's word list with English stop words removed.
video['stop_word_filtered'] = video['wordList'].apply(lambda x: [item for item in x if item not in stop_lookup])
#video[['postId','wordList','stop_word_filtered']]
#video.to_json('video_data_stop_words_removed.json', orient='records')

# Same filtering for each student's saved-word list.
student_behav['stop_word_filtered'] = student_behav['wordList'].apply(lambda x: [item for item in x if item not in stop_lookup])
#student_behav[['memberId','wordList','stop_word_filtered']]
#student_behav.to_json('student_dictionary_stop_words_removed.json', orient='records')

# Module-level accumulator that allWordss() appends to.
allWords = []


def allWordss(wlist):
    """Append every word in ``wlist`` to the module-level ``allWords`` list.

    Returns None; intended for use with ``Series.apply`` purely for its
    side effect on ``allWords``.
    """
    allWords.extend(wlist)
#stop
#student_behav['wordList'].apply(allWordss)
#freq = Counter(allWords)
#freq.most_common(200)
#freq.to_json('freq_dict_words_among_users.json', orient='records')

#wordcloud = WordCloud().generate(' '.join(allWords))
#plt.imshow(wordcloud)
#plt.axis("off")
#plt.show()


In [None]:
##########################################################
# Correlation between words saved and score
# Reference: https://dev.socrata.com/blog/2016/02/02/plotly-pandas.html
##########################################################

# Per-user means, with words saved expressed as a percentage and bucketed.
df = pd.read_json('prepped-wo-section.json')
df['avg_words_saved_perc'] = df['avg_words_saved'] * 100
df = df.groupby(['user']).mean()  # overall avg of the individual per-video averages
df['user_group'] = df['avg_words_saved_perc'].map(vocabSavedRange)

data = pd.DataFrame({'score': df['avg_score'], 'words_saved': df['avg_words_saved_perc']})

color1 = '#9467bd'
color2 = '#F08B00'

# Scores go on the left axis; words saved go on a second, right-hand axis (y2).
trace1 = go.Scatter(
    x=data.index,
    y=data['score'],
    name='scores',
    line={'color': color1},
)
trace2 = go.Scatter(
    x=data.index,
    y=data['words_saved'],
    name='words_saved',
    yaxis='y2',
    mode='markers',
)
# `data` is rebound from the DataFrame to the list of traces plotly expects.
data = [trace1, trace2]
layout = go.Layout(
    title="Relationship between scores and words saved",
    yaxis={
        'title': 'score',
        'titlefont': {'color': color1},
        'tickfont': {'color': color1},
    },
    yaxis2={
        'title': 'words',
        'overlaying': 'y',
        'side': 'right',
        'titlefont': {'color': color2},
        'tickfont': {'color': color2},
    },
)
fig = go.Figure(data=data, layout=layout)
plotl.iplot(fig)

In [None]:
# Static scatter of per-user mean score (x) against mean % of words saved (y).
data = pd.DataFrame({
    'score': df['avg_score'],
    'words_saved': df['avg_words_saved_perc'],
})

x, y = data['score'], data['words_saved']
plt.scatter(x, y)
#fig = go.Figure(data=data, layout=layout)
#plotl.iplot(fig)

In [None]:
# Histogram (KDE disabled) with a rug plot of the per-user avg_incomplete values.
# Reference: http://seaborn.pydata.org/tutorial/distributions.html
data2 = pd.DataFrame({'avg_incomplete': df['avg_incomplete']})
sns.distplot(
    data2['avg_incomplete'],
    kde=False,
    rug=True,
)

In [7]:
# data = pd.DataFrame({'avg_incomplete':df['avg_incomplete'], 'video':df['postId'] })

# color1 = '#9467bd'
# color2 = '#F08B00'

# trace1 = go.Scatter(
#     x = data.index,
#     y = data['avg_incomplete'],
#     name='avg_incomplete',
#     line = dict(
#         color = color1
#     )
# )
# trace2 = go.Scatter(
#     x= data.index,
#     y =data['video'] ,
#     name='video',
#     yaxis='y2',
#     mode='markers'

# )
# data = [trace1, trace2]
# layout = go.Layout(
#     title= "User sequence of incomplete videos",
#     yaxis=dict(
#         title='avg_incomplete',
#         titlefont=dict(
#             color=color1
#         ),
#         tickfont=dict(
#             color=color1
#         )
#     ),
#     yaxis2=dict(
#         title='video',
#         overlaying='y',
#         side='right',
#         titlefont=dict(
#             color=color2
#         ),
#         tickfont=dict(
#             color=color2
#         )

#     )

# )
# fig = go.Figure(data=data, layout=layout)
# plotl.iplot(fig)

# Explore videos with avg_incomplete == 1.0, i.e. every section of the video was skipped.
# read_json already yields a DataFrame; missing values are replaced with 0.
data_from_json = pd.read_json('prepped-wo-section.json')
data_to_frame = data_from_json.fillna(0)
data_to_frame.head(50)

# First row that was fully skipped:
#print(data_to_frame[data_to_frame["avg_incomplete"] == 1].iloc[0])
# Fifth row among those with a non-zero incomplete ratio:
#print(data_to_frame[data_to_frame["avg_incomplete"] > 0].iloc[4])


Unnamed: 0,avg_incomplete,avg_score,avg_words_saved,num_words_saved,postId,user,video_vocab_count
0,0.0,92.0,0.104839,26,3913,50679,248
1,0.0,93.333333,0.098039,15,5186,50679,153
2,0.0,90.142857,0.165533,73,4974,50679,441
3,0.0,90.166667,0.121673,32,4802,50679,263
4,0.0,93.857143,0.104101,33,3711,50679,317
5,0.0,92.0,0.12782,17,5881,50679,133
6,0.333333,78.0,0.138686,19,7126,50679,137
7,1.0,0.0,0.142349,40,5797,50679,281
8,0.0,82.2,0.221774,55,3913,22808,248
9,0.0,76.857143,0.293375,93,3711,22808,317


In [None]:
from sklearn.cluster import KMeans
# Import the PCA model.
from sklearn.decomposition import PCA

# k-means with 4 clusters and a fixed random_state.
kmeans_model = KMeans(n_clusters=4, random_state=1)

# Cluster on the single avg_incomplete column only
# (alternative kept for reference: all numeric columns).
good_columns = data_to_frame[['avg_incomplete']] #data_to_frame._get_numeric_data()

# Fit on the selected column and keep the per-row cluster assignments.
labels = kmeans_model.fit(good_columns).labels_

# Optional 2-component PCA projection for a scatter plot shaded by cluster
# (disabled; it would need more than one input column).
#pca_2 = PCA(2)
#plot_columns = pca_2.fit_transform(good_columns)
#plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=labels)

# Attach the cluster label to a copy so data_to_frame itself is left untouched.
df_processed = data_to_frame.copy()
df_processed['Cluster_Class'] = pd.Series(labels, index=df_processed.index)
df_processed
#df_processed = df_processed[df_processed['Cluster_Class'] != 0]
#df_processed.groupby(['Cluster_Class', 'postId']).size()


In [None]:
# Distribution of the k-means cluster labels across all rows of df_processed.
sns.distplot(df_processed['Cluster_Class'])

In [33]:
# Raise pandas' display.max_rows option to 400000.
# NOTE(review): this is set in the last cell, so on a top-to-bottom run it
# does not affect the frames displayed above - confirm whether it belongs near the imports.
pd.set_option("display.max_rows", 400000)