In [1]:
import numpy as np
import pandas as pd
import unicodedata
import re
import nltk

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import prepare
# nltk.download('wordnet') - needed to run to download 'wordnet' resource to use lemmatize function

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_json('data.json')
df

Unnamed: 0,repo,language,readme_contents
0,gocodeup/codeup-setup-script,Shell,# Codeup Setup Script\n\nSetup script for Code...
1,gocodeup/movies-application,JavaScript,"# Movies Application\n\nFor this project, we w..."
2,torvalds/linux,C,Linux kernel\n============\n\nThere are severa...
3,beetbox/beets,Python,.. image:: https://img.shields.io/pypi/v/beets...
4,scottschiller/SoundManager2,JavaScript,# SoundManager 2: JavaScript Sound for the Web...
...,...,...,...
107,fastai/courses,Jupyter Notebook,# Practical Deep Learning for Coders (fast.ai ...
108,Yorko/mlcourse.ai,Python,"<div align=""center"">\n\n![ODS stickers](https:..."
109,jtoy/awesome-tensorflow,,# Awesome TensorFlow [![Awesome](https://cdn....
110,nlintz/TensorFlow-Tutorials,Jupyter Notebook,# TensorFlow-Tutorials\n[![Build Status](https...


In [3]:
df = prepare.prep_data(df, 'readme_contents')

In [4]:
df

Unnamed: 0,repo,language,readme_contents,stemmed,lemmatized,clean,stopwords_removed,doc_length,words
1,gocodeup/movies-application,JavaScript,"# Movies Application\n\nFor this project, we w...",movi applic for thi project we will be build a...,movie application for this project we will be ...,movie application project building single page...,361,423,"[movie, application, project, building, single..."
2,torvalds/linux,C,Linux kernel\n============\n\nThere are severa...,linux kernel there are sever guid for kernel d...,linux kernel there are several guide for kerne...,linux kernel several guide kernel developer us...,39,71,"[linux, kernel, several, guide, kernel, develo..."
3,beetbox/beets,Python,.. image:: https://img.shields.io/pypi/v/beets...,imag http img shield io pypi v beet svg target...,image http img shield io pypi v beet svg targe...,image http img shield io pypi v beet svg targe...,191,532,"[image, http, img, shield, io, pypibeet, svg, ..."
4,scottschiller/SoundManager2,JavaScript,# SoundManager 2: JavaScript Sound for the Web...,soundmanag javascript sound for the web by wra...,soundmanager javascript sound for the web by w...,soundmanager javascript sound web wrapping ext...,305,614,"[soundmanager, javascript, sound, web, wrappin..."
5,CreateJS/SoundJS,JavaScript,# SoundJS\r\n\r\nSoundJS is a library to make ...,soundj soundj is a librari to make work with a...,soundjs soundjs is a library to make working w...,soundjs soundjs library make working audio web...,155,355,"[soundjs, soundjs, library, make, working, aud..."
...,...,...,...,...,...,...,...,...,...
104,josephmisiti/awesome-machine-learning,Python,# Awesome Machine Learning [![Awesome](https:/...,awesom machin learn awesom http cdn rawgit com...,awesome machine learning awesome http cdn rawg...,awesome machine learning awesome http cdn rawg...,3809,18096,"[awesome, machine, learning, awesome, http, cd..."
107,fastai/courses,Jupyter Notebook,# Practical Deep Learning for Coders (fast.ai ...,practic deep learn for coder fast ai cours the...,practical deep learning for coder fast ai cour...,practical deep learning coder fast ai course l...,59,82,"[practical, deep, learning, coder, fast, ai, c..."
108,Yorko/mlcourse.ai,Python,"<div align=""center"">\n\n![ODS stickers](https:...",div align center od sticker http github com yo...,div align center od sticker http github com yo...,div align center od sticker http github com yo...,194,2061,"[div, align, center, od, sticker, http, github..."
110,nlintz/TensorFlow-Tutorials,Jupyter Notebook,# TensorFlow-Tutorials\n[![Build Status](https...,tensorflow tutori build statu http travi ci or...,tensorflow tutorial build status http travis c...,tensorflow tutorial build status http travis c...,10,142,"[tensorflow, tutorial, build, status, http, tr..."


In [5]:
df.language.value_counts()

JavaScript          28
Python              17
Ruby                12
HTML                10
C++                  8
Jupyter Notebook     6
CSS                  4
TypeScript           3
Java                 2
Scala                2
C#                   2
C                    2
Name: language, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 1 to 111
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   repo               96 non-null     object
 1   language           96 non-null     object
 2   readme_contents    96 non-null     object
 3   stemmed            96 non-null     object
 4   lemmatized         96 non-null     object
 5   clean              96 non-null     object
 6   stopwords_removed  96 non-null     int64 
 7   doc_length         96 non-null     int64 
 8   words              96 non-null     object
dtypes: int64(2), object(7)
memory usage: 7.5+ KB


In [7]:
from sklearn.model_selection import train_test_split

train, validate = train_test_split(df, stratify=df.language, test_size=.2, random_state=123)

In [8]:
train.shape

(76, 9)

In [9]:
train.language.value_counts()

JavaScript          22
Python              13
Ruby                 9
HTML                 8
C++                  6
Jupyter Notebook     5
CSS                  3
Java                 2
Scala                2
TypeScript           2
C                    2
C#                   2
Name: language, dtype: int64

In [10]:
validate.shape

(20, 9)

In [11]:
validate.language.value_counts()

JavaScript          6
Python              4
Ruby                3
HTML                2
C++                 2
Jupyter Notebook    1
TypeScript          1
CSS                 1
Name: language, dtype: int64

In [12]:
# create series objects for each top_code_clean that is a string of words joined on spaces to make it 1 continious string  
javascript_words = ' '.join(train[train.language=='JavaScript'].clean)
python_words = ' '.join(train[train.language=='Python'].clean)
ruby_words = ' '.join(train[train.language=='Ruby'].clean)
html_words = ' '.join(train[train.language=='HTML'].clean)
c_plus_plus_words = ' '.join(train[train.language=='C++'].clean)
all_words = ' '.join(train.clean)

# Exploration

Explore the data that you have scraped. Here are some ideas for exploration:

- What are the most common words in READMEs?
- What does the distribution of IDFs look like for the most common words?
- Does the length of the README vary by programming language?
- Do different programming languages use a different number of unique words?

### What are the most common words in READMEs?

In [13]:
all_words_df = pd.DataFrame(all_words.split())

In [14]:
all_words_df.value_counts().head(10)

http          1749
com           1049
github         580
org            487
00s            331
www            245
build          231
tensorflow     220
data           207
file           186
dtype: int64

### Does the length of the README vary by programming language?

In [15]:
train.groupby('language').doc_length.mean().sort_values(ascending=False)

language
Python              1100.384615
C++                  859.833333
Ruby                 698.777778
JavaScript           513.000000
HTML                 457.500000
C#                   439.000000
Scala                422.500000
TypeScript           310.000000
C                    252.500000
CSS                  207.000000
Jupyter Notebook     165.200000
Java                 149.500000
Name: doc_length, dtype: float64

In [16]:
from math import sqrt
from scipy import stats

In [17]:
# This cell block looks at every pair of languages and compares the document lengths using a t-test

# Start by developing two identical lists of all of the languages represented in the train dataset
# We will use the series above to form our list so our output is arranged in a similar order
train_language_list_1 = list(train.groupby('language').doc_length.mean().sort_values(ascending=False).index)
train_language_list_2 = list(train.groupby('language').doc_length.mean().sort_values(ascending=False).index)

# This empty list will hold information about which pairs have been tested. 
# If the python and javascript pair has already be tested, then we do not need to test the javascript and python pair
testing_pairs = []

for language_1 in train_language_list_1: # Iterates through list 1
    for language_2 in train_language_list_2: # Iterates through list 2 in entirety for each element in list 1
        
        if language_1 == language_2: # Cannot run a t-test against itself, so skip the test if the two list elements are identical
            continue
            
        else:
            alpha = 0.05 # Set alpha
            
            # Run the t-test and store the t-statistic and the p-value
            stat, p = stats.ttest_ind(train[train.language == language_1].doc_length, train[train.language == language_2].doc_length)
            
            # If the p-value is statistically significant we print the results, otherwise we do nothing
            if p/2 < alpha:
                
                # Creating strings to represent the pair that is being tested (eg. 'Python and JavaScript' & 'JavaScript and Python')
                testing_pair_1 = language_1 + " " + language_2
                testing_pair_2 = language_2 + " " + language_1
                
                # If this unique pair has not yet been tested:
                if (testing_pair_1 not in testing_pairs) and (testing_pair_2 not in testing_pairs):
                    
                    # Add this pair to the testing_pairs list so that we do not output duplicate t-test results
                    testing_pairs.append(testing_pair_1)
                    testing_pairs.append(testing_pair_2)
                    
                    # Print the results of the test
                    print("----------------")
                    print(f"Document Length T-Test: {language_1} & {language_2}")
                    print("----------------")
                    print("Hypotheses:")
                    print(f"H_0: There is no difference in the mean document lengths of {language_1} and {language_2}")
                    print(f"H_a: There is a difference in the mean document lengths of {language_1} and {language_2}")
                    print('\n')
                    print(f"p-value: {p/2}")
                    print(f"t-statistic: {stat}")
                    print(f"We reject the null hypothesis")
                    print("\n")
                    if stat < 0:
                        print(f"The mean readme document length for {language_1} is smaller than {language_2}")
                    elif stat > 0:
                        print(f"The mean readme document length for {language_1} is larger than {language_2}")
                    print('\n','\n')
                    
                    # If the pair had already been tested, do not print any results and continue through the loop
                else:
                    continue

----------------
Document Length T-Test: Python & JavaScript
----------------
Hypotheses:
H_0: There is no difference in the mean document lengths of Python and JavaScript
H_a: There is a difference in the mean document lengths of Python and JavaScript


p-value: 0.023292790833676885
t-statistic: 2.067665566926816
We reject the null hypothesis


The mean readme document length for Python is larger than JavaScript

 

----------------
Document Length T-Test: C++ & Jupyter Notebook
----------------
Hypotheses:
H_0: There is no difference in the mean document lengths of C++ and Jupyter Notebook
H_a: There is a difference in the mean document lengths of C++ and Jupyter Notebook


p-value: 0.017854421020433928
t-statistic: 2.467630092745799
We reject the null hypothesis


The mean readme document length for C++ is larger than Jupyter Notebook

 

----------------
Document Length T-Test: Ruby & TypeScript
----------------
Hypotheses:
H_0: There is no difference in the mean document lengths o

### Do different programming languages use a different number of unique words?

In [18]:
unique_word_count = {}

for language in train_language_list_1:
    unique_word_count[language] = len(set(' '.join(train[train.language==language].clean).split()))

In [19]:
unique_word_count = pd.DataFrame.from_dict(unique_word_count, orient='index').rename(columns={0: 'num_unique_words'})

In [20]:
unique_word_count.sort_values(by='num_unique_words', ascending=False)

Unnamed: 0,num_unique_words
Python,3401
JavaScript,2667
Ruby,1872
C++,1201
HTML,1152
Jupyter Notebook,376
C#,352
Scala,339
CSS,323
TypeScript,232


While this gives us a total count of the number of unique words each programming language has connected to it, this is likely to be correlated both to the total document length AND the number of observations that each language has. Languages with more observations likely have more opportunities to expand the unique word list. 

A better metric would be the average number of unique words that each language has. To do this we will need to add a column to our dataframe.

In [21]:
train['unique_word_count'] = train.words.apply(lambda x: len(set(x)))

In [22]:
train

Unnamed: 0,repo,language,readme_contents,stemmed,lemmatized,clean,stopwords_removed,doc_length,words,unique_word_count
53,Hextris/hextris,JavaScript,"Hextris\n==========\n\n<img src=""images/twitte...",hextri img src imag twitter opengraph png widt...,hextris img src image twitter opengraph png wi...,hextris img src image twitter opengraph png wi...,83,178,"[hextris, img, src, image, twitter, opengraph,...",107
78,ahmia/ahmia-site,Python,[![Build Status](https://travis-ci.org/ahmia/a...,build statu http travi ci org ahmia ahmia site...,build status http travis ci org ahmia ahmia si...,build status http travis ci org ahmia ahmia si...,195,701,"[build, status, http, travis, ci, org, ahmia, ...",268
60,OptiKey/OptiKey,C#,# OptiKey\n\nOptiKey is an on-screen keyboard ...,optikey optikey is an on screen keyboard that ...,optikey optikey is an on screen keyboard that ...,optikey optikey screen keyboard designed help ...,111,241,"[optikey, optikey, screen, keyboard, designed,...",138
30,nprapps/app-template,JavaScript,nprviz's Project Template\n===================...,nprviz' project templat about thi templat abou...,nprviz's project template about this template ...,nprviz project template template template assu...,196,330,"[nprviz, project, template, template, template...",175
24,atlemo/SubtlePatterns,HTML,Subtle Patterns\n===============\n\nView all t...,subtl pattern view all the pattern from subtl ...,subtle pattern view all the pattern from subtl...,subtle pattern view pattern subtle pattern htt...,33,73,"[subtle, pattern, view, pattern, subtle, patte...",44
...,...,...,...,...,...,...,...,...,...,...
70,EFForg/action-center-platform,Ruby,[![Build Status](https://travis-ci.org/EFForg/...,build statu http travi ci org efforg action ce...,build status http travis ci org efforg action ...,build status http travis ci org efforg action ...,426,1009,"[build, status, http, travis, ci, org, efforg,...",409
47,MonoGame/MonoGame,C#,﻿# MonoGame\n\nOne framework for creating powe...,monogam one framework for creat power cross pl...,monogame one framework for creating powerful c...,monogame one framework creating powerful cross...,201,637,"[monogame, one, framework, creating, powerful,...",252
91,umutisik/Eigentechno,Jupyter Notebook,## Eigentechno\n\nCode for applying Principal ...,eigentechno code for appli princip compon anal...,eigentechno code for applying principal compon...,eigentechno code applying principal component ...,3,22,"[eigentechno, code, applying, principal, compo...",20
3,beetbox/beets,Python,.. image:: https://img.shields.io/pypi/v/beets...,imag http img shield io pypi v beet svg target...,image http img shield io pypi v beet svg targe...,image http img shield io pypi v beet svg targe...,191,532,"[image, http, img, shield, io, pypibeet, svg, ...",231


Now we can run a similar set of t-tests to compare the unique word count for each programming language

In [23]:
train.groupby('language').unique_word_count.mean().sort_values(ascending=False)

language
Python              388.846154
Ruby                288.222222
C++                 271.666667
JavaScript          227.681818
C#                  195.000000
Scala               187.500000
HTML                169.875000
TypeScript          126.500000
C                   125.500000
CSS                 116.333333
Jupyter Notebook     90.800000
Java                 83.500000
Name: unique_word_count, dtype: float64

In [24]:
# This cell block looks at every pair of languages and compares the document lengths using a t-test

# Start by developing two identical lists of all of the languages represented in the train dataset
# We will use the languages represented in the array above so that the output below is sorted in a similar manner
train_language_list_1 = list(train.groupby('language').unique_word_count.mean().sort_values(ascending=False).index)
train_language_list_2 = list(train.groupby('language').unique_word_count.mean().sort_values(ascending=False).index)

# This empty list will hold information about which pairs have been tested. 
# If the python and javascript pair has already be tested, then we do not need to test the javascript and python pair
testing_pairs = []

for language_1 in train_language_list_1: # Iterates through list 1
    for language_2 in train_language_list_2: # Iterates through list 2 in entirety for each element in list 1
        
        if language_1 == language_2: # Cannot run a t-test against itself, so skip the test if the two list elements are identical
            continue
            
        else:
            alpha = 0.05 # Set alpha
            
            # Run the t-test and store the t-statistic and the p-value
            stat, p = stats.ttest_ind(train[train.language == language_1].unique_word_count, train[train.language == language_2].unique_word_count)
            
            # If the p-value is statistically significant we print the results, otherwise we do nothing
            if p/2 < alpha:
                
                # Creating strings to represent the pair that is being tested (eg. 'Python and JavaScript' & 'JavaScript and Python')
                testing_pair_1 = language_1 + " " + language_2
                testing_pair_2 = language_2 + " " + language_1
                
                # If this unique pair has not yet been tested:
                if (testing_pair_1 not in testing_pairs) and (testing_pair_2 not in testing_pairs):
                    
                    # Add this pair to the testing_pairs list so that we do not output duplicate t-test results
                    testing_pairs.append(testing_pair_1)
                    testing_pairs.append(testing_pair_2)
                    
                    # Print the results of the test
                    print("----------------")
                    print(f"Unique Word Count T-Test: {language_1} & {language_2}")
                    print("----------------")
                    print("Hypotheses:")
                    print(f"H_0: There is no difference in the mean number of unique words for {language_1} and {language_2}")
                    print(f"H_a: There is a difference in the mean number of unique words for {language_1} and {language_2}")
                    print('\n')
                    print(f"p-value: {p/2}")
                    print(f"t-statistic: {stat}")
                    print(f"We reject the null hypothesis")
                    print("\n")
                    if stat < 0:
                        print(f"The mean number of unique words for {language_1} is smaller than {language_2}")
                    elif stat > 0:
                        print(f"The mean number of unique words for {language_1} is larger than {language_2}")
                    print('\n','\n')
                    
                    # If the pair had already been tested, do not print any results and continue through the loop
                else:
                    continue

----------------
Unique Word Count T-Test: Python & JavaScript
----------------
Hypotheses:
H_0: There is no difference in the mean number of unique words for Python and JavaScript
H_a: There is a difference in the mean number of unique words for Python and JavaScript


p-value: 0.032259111637321584
t-statistic: 1.9125475316269795
We reject the null hypothesis


The mean number of unique words for Python is larger than JavaScript

 

----------------
Unique Word Count T-Test: Python & Jupyter Notebook
----------------
Hypotheses:
H_0: There is no difference in the mean number of unique words for Python and Jupyter Notebook
H_a: There is a difference in the mean number of unique words for Python and Jupyter Notebook


p-value: 0.03847766820262695
t-statistic: 1.8904156781881192
We reject the null hypothesis


The mean number of unique words for Python is larger than Jupyter Notebook

 

----------------
Unique Word Count T-Test: Ruby & TypeScript
----------------
Hypotheses:
H_0: There 