In [1]:
import numpy as np
import pandas as pd
import unicodedata
import re
import nltk

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import prepare
# nltk.download('wordnet') - needed to run to download 'wordnet' resource to use lemmatize function

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_json('data.json')
df

Unnamed: 0,repo,language,readme_contents
0,gocodeup/codeup-setup-script,Shell,# Codeup Setup Script\n\nSetup script for Code...
1,gocodeup/movies-application,JavaScript,"# Movies Application\n\nFor this project, we w..."
2,torvalds/linux,C,Linux kernel\n============\n\nThere are severa...
3,beetbox/beets,Python,.. image:: https://img.shields.io/pypi/v/beets...
4,scottschiller/SoundManager2,JavaScript,# SoundManager 2: JavaScript Sound for the Web...
...,...,...,...
107,fastai/courses,Jupyter Notebook,# Practical Deep Learning for Coders (fast.ai ...
108,Yorko/mlcourse.ai,Python,"<div align=""center"">\n\n![ODS stickers](https:..."
109,jtoy/awesome-tensorflow,,# Awesome TensorFlow [![Awesome](https://cdn....
110,nlintz/TensorFlow-Tutorials,Jupyter Notebook,# TensorFlow-Tutorials\n[![Build Status](https...


In [3]:
df = prepare.prep_data(df, 'readme_contents')

In [4]:
df

Unnamed: 0,repo,language,readme_contents,stemmed,lemmatized,clean,stopwords_removed,doc_length,words
0,gocodeup/movies-application,JavaScript,"# Movies Application\n\nFor this project, we w...",movi applic for thi project we will be build a...,movie application for this project we will be ...,movie application building single page movie a...,367,417,"[movie, application, building, single, page, m..."
1,torvalds/linux,C,Linux kernel\n============\n\nThere are severa...,linux kernel there are sever guid for kernel d...,linux kernel there are several guide for kerne...,linux kernel several guide kernel developer us...,39,71,"[linux, kernel, several, guide, kernel, develo..."
2,beetbox/beets,Python,.. image:: https://img.shields.io/pypi/v/beets...,imag http img shield io pypi v beet svg target...,image http img shield io pypi v beet svg targe...,image http img shield io pypi v beet svg targe...,205,518,"[image, http, img, shield, io, pypibeet, svg, ..."
3,scottschiller/SoundManager2,JavaScript,# SoundManager 2: JavaScript Sound for the Web...,soundmanag javascript sound for the web by wra...,soundmanager javascript sound for the web by w...,soundmanager javascript sound web wrapping ext...,316,603,"[soundmanager, javascript, sound, web, wrappin..."
4,CreateJS/SoundJS,JavaScript,# SoundJS\r\n\r\nSoundJS is a library to make ...,soundj soundj is a librari to make work with a...,soundjs soundjs is a library to make working w...,soundjs soundjs make working audio web easier ...,160,350,"[soundjs, soundjs, make, working, audio, web, ..."
...,...,...,...,...,...,...,...,...,...
91,josephmisiti/awesome-machine-learning,Python,# Awesome Machine Learning [![Awesome](https:/...,awesom machin learn awesom http cdn rawgit com...,awesome machine learning awesome http cdn rawg...,awesome machine learning awesome http cdn rawg...,5027,16878,"[awesome, machine, learning, awesome, http, cd..."
92,fastai/courses,Jupyter Notebook,# Practical Deep Learning for Coders (fast.ai ...,practic deep learn for coder fast ai cours the...,practical deep learning for coder fast ai cour...,practical deep learning coder fast ai course l...,60,81,"[practical, deep, learning, coder, fast, ai, c..."
93,Yorko/mlcourse.ai,Python,"<div align=""center"">\n\n![ODS stickers](https:...",div align center od sticker http github com yo...,div align center od sticker http github com yo...,div align center od sticker http com yorko mlc...,218,2037,"[div, align, center, od, sticker, http, com, y..."
94,nlintz/TensorFlow-Tutorials,Jupyter Notebook,# TensorFlow-Tutorials\n[![Build Status](https...,tensorflow tutori build statu http travi ci or...,tensorflow tutorial build status http travis c...,tensorflow tutorial build status http travis c...,12,140,"[tensorflow, tutorial, build, status, http, tr..."


In [5]:
df.language.value_counts()

JavaScript          28
Python              17
Ruby                12
HTML                10
C++                  8
Jupyter Notebook     6
CSS                  4
TypeScript           3
Java                 2
C#                   2
Scala                2
C                    2
Name: language, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   repo               96 non-null     object
 1   language           96 non-null     object
 2   readme_contents    96 non-null     object
 3   stemmed            96 non-null     object
 4   lemmatized         96 non-null     object
 5   clean              96 non-null     object
 6   stopwords_removed  96 non-null     int64 
 7   doc_length         96 non-null     int64 
 8   words              96 non-null     object
dtypes: int64(2), object(7)
memory usage: 6.9+ KB


In [7]:
from sklearn.model_selection import train_test_split

train, validate = train_test_split(df, stratify=df.language, test_size=.2, random_state=123)

In [8]:
train.shape

(76, 9)

In [9]:
train.language.value_counts()

JavaScript          22
Python              13
Ruby                 9
HTML                 8
C++                  6
Jupyter Notebook     5
CSS                  3
Java                 2
C#                   2
C                    2
Scala                2
TypeScript           2
Name: language, dtype: int64

In [10]:
validate.shape

(20, 9)

In [11]:
validate.language.value_counts()

JavaScript          6
Python              4
Ruby                3
HTML                2
C++                 2
Jupyter Notebook    1
CSS                 1
TypeScript          1
Name: language, dtype: int64

In [12]:
# create series objects for each top_code_clean that is a string of words joined on spaces to make it 1 continious string  
javascript_words = ' '.join(train[train.language=='JavaScript'].clean)
python_words = ' '.join(train[train.language=='Python'].clean)
ruby_words = ' '.join(train[train.language=='Ruby'].clean)
html_words = ' '.join(train[train.language=='HTML'].clean)
c_plus_plus_words = ' '.join(train[train.language=='C++'].clean)
all_words = ' '.join(train.clean)

# Exploration

Explore the data that you have scraped. Here are some ideas for exploration:

- What are the most common words in READMEs?
- What does the distribution of IDFs look like for the most common words?
- Does the length of the README vary by programming language?
- Do different programming languages use a different number of unique words?

### What are the most common words in READMEs?

In [13]:
all_words_df = pd.DataFrame(all_words.split())

In [14]:
all_words_df.value_counts().head(10)

http          1749
com           1049
org            487
00s            331
www            245
build          231
tensorflow     220
data           207
file           186
license        178
dtype: int64

### What does the distribution of IDFs look like for the most common words?

In [15]:
# List of the top ten most common words across the combined readmes of the train dataset
most_common_words = [word[0] for word in list(all_words_df.value_counts().head(10).index)]
most_common_words

['http',
 'com',
 'org',
 '00s',
 'www',
 'build',
 'tensorflow',
 'data',
 'file',
 'license']

In [16]:
def idf(most_common_words):
    # This dictionary will store how many documents each word appears in 
    appearances_dict = dict.fromkeys(most_common_words)
    
    # The total number of documents is based on the number of rows in the train dataframe
    number_of_documents = train.shape[0]
    
    # This is essentially a list containing the contents of the words column in the dataframe. It is a list of lists.
    list_of_wordlists = list(train.words.values)
    
    # Start iterating through the list of common words. We want to collect information for each one.
    for word in most_common_words:
        
        # Set the initial number of documents that the word appears in to zero
        number_of_appearances = 0
        
        # Start iterating through the list made from the words column in the dataframe 
        for words in list_of_wordlists:
            
            # If the current word is in a document, add 1 to the number of appearances and then move to the next document
            if word in words:
                number_of_appearances += 1
        
        # Once all the documents have been iterated through, add the sum total of all appearances to our appearances dictionary
        appearances_dict[word] = number_of_appearances
        
    # Create a new dictionary that will contain the IDF values for each word
    idf_dict = dict.fromkeys(most_common_words)
    
    # Start iterating through the list of common words again, using the number of appearances and the total number of documents to calculate the IDF and update the relevant key:value in the dictionary
    for word in most_common_words:
        idf_dict[word] = np.log(number_of_documents / appearances_dict[word])
        
    # Return the dictionary showing the actual number of appearances and the dictionary showing the calculated IDFs
    return appearances_dict, idf_dict

In [17]:
idf(most_common_words)

({'http': 75,
  'com': 69,
  'org': 61,
  '00s': 1,
  'www': 53,
  'build': 40,
  'tensorflow': 5,
  'data': 26,
  'file': 42,
  'license': 45},
 {'http': 0.013245226750020723,
  'com': 0.0966268356890717,
  'org': 0.21985947611301987,
  '00s': 4.330733340286331,
  'www': 0.3604414267342092,
  'build': 0.6418538861723947,
  'tensorflow': 2.7212954278522306,
  'data': 1.072636802264849,
  'file': 0.5930637220029628,
  'license': 0.5240708505160113})

### Does the length of the README vary by programming language?

In [18]:
train.groupby('language').doc_length.mean().sort_values(ascending=False)

language
Python              1086.615385
C++                  845.333333
Ruby                 688.666667
JavaScript           499.363636
HTML                 450.375000
C#                   430.500000
Scala                418.000000
TypeScript           300.500000
C                    240.000000
CSS                  196.333333
Jupyter Notebook     162.800000
Java                 146.000000
Name: doc_length, dtype: float64

In [19]:
from math import sqrt
from scipy import stats

In [20]:
# This cell block looks at every pair of languages and compares the document lengths using a t-test

# Start by developing two identical lists of all of the languages represented in the train dataset
# We will use the series above to form our list so our output is arranged in a similar order
# When this is converted to a function, we can generate the list in a more generalized manner
train_language_list_1 = list(train.groupby('language').doc_length.mean().sort_values(ascending=False).index)
train_language_list_2 = list(train.groupby('language').doc_length.mean().sort_values(ascending=False).index)

# This empty list will hold information about which pairs have been tested. 
# If the python and javascript pair has already be tested, then we do not need to test the javascript and python pair
testing_pairs = []

for language_1 in train_language_list_1: # Iterates through list 1
    for language_2 in train_language_list_2: # Iterates through list 2 in entirety for each element in list 1
        
        if language_1 == language_2: # Cannot run a t-test against itself, so skip the test if the two list elements are identical
            continue
            
        else:
            alpha = 0.05 # Set alpha
            
            # Run the t-test and store the t-statistic and the p-value
            stat, p = stats.ttest_ind(train[train.language == language_1].doc_length, train[train.language == language_2].doc_length)
            
            # If the p-value is statistically significant we print the results, otherwise we do nothing
            if p/2 < alpha:
                
                # Creating strings to represent the pair that is being tested (eg. 'Python and JavaScript' & 'JavaScript and Python')
                testing_pair_1 = language_1 + " " + language_2
                testing_pair_2 = language_2 + " " + language_1
                
                # If this unique pair has not yet been tested:
                if (testing_pair_1 not in testing_pairs) and (testing_pair_2 not in testing_pairs):
                    
                    # Add this pair to the testing_pairs list so that we do not output duplicate t-test results
                    testing_pairs.append(testing_pair_1)
                    testing_pairs.append(testing_pair_2)
                    
                    # Print the results of the test
                    print("----------------")
                    print(f"Document Length T-Test: {language_1} & {language_2}")
                    print("----------------")
                    print("Hypotheses:")
                    print(f"H_0: There is no difference in the mean document lengths of {language_1} and {language_2}")
                    print(f"H_a: There is a difference in the mean document lengths of {language_1} and {language_2}")
                    print('\n')
                    print(f"p-value: {p/2}")
                    print(f"t-statistic: {stat}")
                    print(f"We reject the null hypothesis")
                    print("\n")
                    if stat < 0:
                        print(f"The mean readme document length for {language_1} is smaller than {language_2}")
                    elif stat > 0:
                        print(f"The mean readme document length for {language_1} is larger than {language_2}")
                    print('\n','\n')
                    
                    # If the pair had already been tested, do not print any results and continue through the loop
                else:
                    continue

----------------
Document Length T-Test: Python & JavaScript
----------------
Hypotheses:
H_0: There is no difference in the mean document lengths of Python and JavaScript
H_a: There is a difference in the mean document lengths of Python and JavaScript


p-value: 0.02265755105674392
t-statistic: 2.0805481106603447
We reject the null hypothesis


The mean readme document length for Python is larger than JavaScript

 

----------------
Document Length T-Test: C++ & Jupyter Notebook
----------------
Hypotheses:
H_0: There is no difference in the mean document lengths of C++ and Jupyter Notebook
H_a: There is a difference in the mean document lengths of C++ and Jupyter Notebook


p-value: 0.01831934302482318
t-statistic: 2.4519606495569213
We reject the null hypothesis


The mean readme document length for C++ is larger than Jupyter Notebook

 

----------------
Document Length T-Test: Ruby & TypeScript
----------------
Hypotheses:
H_0: There is no difference in the mean document lengths o

### Do different programming languages use a different number of unique words?

In [21]:
unique_word_count = {}

for language in train_language_list_1:
    unique_word_count[language] = len(set(' '.join(train[train.language==language].clean).split()))

In [22]:
unique_word_count = pd.DataFrame.from_dict(unique_word_count, orient='index').rename(columns={0: 'num_unique_words'})

In [23]:
unique_word_count.sort_values(by='num_unique_words', ascending=False)

Unnamed: 0,num_unique_words
Python,3400
JavaScript,2666
Ruby,1870
C++,1199
HTML,1150
Jupyter Notebook,374
C#,349
Scala,336
CSS,321
TypeScript,231


While this gives us a total count of the number of unique words each programming language has connected to it, this is likely to be correlated both to the total document length AND the number of observations that each language has. Languages with more observations likely have more opportunities to expand the unique word list. 

A better metric would be the average number of unique words that each language has. To do this we will need to add a column to our dataframe.

In [24]:
train['unique_word_count'] = train.words.apply(lambda x: len(set(x)))

In [25]:
train

Unnamed: 0,repo,language,readme_contents,stemmed,lemmatized,clean,stopwords_removed,doc_length,words,unique_word_count
48,Hextris/hextris,JavaScript,"Hextris\n==========\n\n<img src=""images/twitte...",hextri img src imag twitter opengraph png widt...,hextris img src image twitter opengraph png wi...,hextris img src image twitter opengraph png wi...,91,170,"[hextris, img, src, image, twitter, opengraph,...",105
71,ahmia/ahmia-site,Python,[![Build Status](https://travis-ci.org/ahmia/a...,build statu http travi ci org ahmia ahmia site...,build status http travis ci org ahmia ahmia si...,build status http travis ci org ahmia ahmia si...,206,690,"[build, status, http, travis, ci, org, ahmia, ...",267
54,OptiKey/OptiKey,C#,# OptiKey\n\nOptiKey is an on-screen keyboard ...,optikey optikey is an on screen keyboard that ...,optikey optikey is an on screen keyboard that ...,optikey optikey screen keyboard designed help ...,117,235,"[optikey, optikey, screen, keyboard, designed,...",136
27,nprapps/app-template,JavaScript,nprviz's Project Template\n===================...,nprviz' project templat about thi templat abou...,nprviz's project template about this template ...,nprviz template template template assumption a...,222,304,"[nprviz, template, template, template, assumpt...",173
21,atlemo/SubtlePatterns,HTML,Subtle Patterns\n===============\n\nView all t...,subtl pattern view all the pattern from subtl ...,subtle pattern view all the pattern from subtl...,subtle pattern view pattern subtle pattern htt...,33,73,"[subtle, pattern, view, pattern, subtle, patte...",44
...,...,...,...,...,...,...,...,...,...,...
63,EFForg/action-center-platform,Ruby,[![Build Status](https://travis-ci.org/EFForg/...,build statu http travi ci org efforg action ce...,build status http travis ci org efforg action ...,build status http travis ci org efforg action ...,449,986,"[build, status, http, travis, ci, org, efforg,...",407
42,MonoGame/MonoGame,C#,﻿# MonoGame\n\nOne framework for creating powe...,monogam one framework for creat power cross pl...,monogame one framework for creating powerful c...,monogame one framework creating powerful cross...,212,626,"[monogame, one, framework, creating, powerful,...",249
79,umutisik/Eigentechno,Jupyter Notebook,## Eigentechno\n\nCode for applying Principal ...,eigentechno code for appli princip compon anal...,eigentechno code for applying principal compon...,eigentechno code applying principal component ...,3,22,"[eigentechno, code, applying, principal, compo...",20
2,beetbox/beets,Python,.. image:: https://img.shields.io/pypi/v/beets...,imag http img shield io pypi v beet svg target...,image http img shield io pypi v beet svg targe...,image http img shield io pypi v beet svg targe...,205,518,"[image, http, img, shield, io, pypibeet, svg, ...",228


Now we can run a similar set of t-tests to compare the unique word count for each programming language

In [26]:
train.groupby('language').unique_word_count.mean().sort_values(ascending=False)

language
Python              387.461538
Ruby                286.444444
C++                 269.833333
JavaScript          225.863636
C#                  192.500000
Scala               185.500000
HTML                169.125000
TypeScript          125.500000
C                   125.000000
CSS                 115.000000
Jupyter Notebook     89.600000
Java                 83.000000
Name: unique_word_count, dtype: float64

In [27]:
# This cell block looks at every pair of languages and compares the document lengths using a t-test

# Start by developing two identical lists of all of the languages represented in the train dataset
# We will use the languages represented in the array above so that the output below is sorted in a similar manner
# When this is converted to a function, we can generate the list in a more generalized manner
train_language_list_1 = list(train.groupby('language').unique_word_count.mean().sort_values(ascending=False).index)
train_language_list_2 = list(train.groupby('language').unique_word_count.mean().sort_values(ascending=False).index)

# This empty list will hold information about which pairs have been tested. 
# If the python and javascript pair has already be tested, then we do not need to test the javascript and python pair
testing_pairs = []

for language_1 in train_language_list_1: # Iterates through list 1
    for language_2 in train_language_list_2: # Iterates through list 2 in entirety for each element in list 1
        
        if language_1 == language_2: # Cannot run a t-test against itself, so skip the test if the two list elements are identical
            continue
            
        else:
            alpha = 0.05 # Set alpha
            
            # Run the t-test and store the t-statistic and the p-value
            stat, p = stats.ttest_ind(train[train.language == language_1].unique_word_count, train[train.language == language_2].unique_word_count)
            
            # If the p-value is statistically significant we print the results, otherwise we do nothing
            if p/2 < alpha:
                
                # Creating strings to represent the pair that is being tested (eg. 'Python and JavaScript' & 'JavaScript and Python')
                testing_pair_1 = language_1 + " " + language_2
                testing_pair_2 = language_2 + " " + language_1
                
                # If this unique pair has not yet been tested:
                if (testing_pair_1 not in testing_pairs) and (testing_pair_2 not in testing_pairs):
                    
                    # Add this pair to the testing_pairs list so that we do not output duplicate t-test results
                    testing_pairs.append(testing_pair_1)
                    testing_pairs.append(testing_pair_2)
                    
                    # Print the results of the test
                    print("----------------")
                    print(f"Unique Word Count T-Test: {language_1} & {language_2}")
                    print("----------------")
                    print("Hypotheses:")
                    print(f"H_0: There is no difference in the mean number of unique words for {language_1} and {language_2}")
                    print(f"H_a: There is a difference in the mean number of unique words for {language_1} and {language_2}")
                    print('\n')
                    print(f"p-value: {p/2}")
                    print(f"t-statistic: {stat}")
                    print(f"We reject the null hypothesis")
                    print("\n")
                    if stat < 0:
                        print(f"The mean number of unique words for {language_1} is smaller than {language_2}")
                    elif stat > 0:
                        print(f"The mean number of unique words for {language_1} is larger than {language_2}")
                    print('\n','\n')
                    
                    # If the pair had already been tested, do not print any results and continue through the loop
                else:
                    continue

----------------
Unique Word Count T-Test: Python & JavaScript
----------------
Hypotheses:
H_0: There is no difference in the mean number of unique words for Python and JavaScript
H_a: There is a difference in the mean number of unique words for Python and JavaScript


p-value: 0.031652954785133074
t-statistic: 1.9217650547719476
We reject the null hypothesis


The mean number of unique words for Python is larger than JavaScript

 

----------------
Unique Word Count T-Test: Python & Jupyter Notebook
----------------
Hypotheses:
H_0: There is no difference in the mean number of unique words for Python and Jupyter Notebook
H_a: There is a difference in the mean number of unique words for Python and Jupyter Notebook


p-value: 0.038384786732243714
t-statistic: 1.89172927906776
We reject the null hypothesis


The mean number of unique words for Python is larger than Jupyter Notebook

 

----------------
Unique Word Count T-Test: Ruby & TypeScript
----------------
Hypotheses:
H_0: There i