In [22]:
import pandas as pd
import numpy as np 

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the scraped repository records from the local JSON file and display them.
df = pd.read_json('data.json')
df

Unnamed: 0,repo,language,readme_contents
0,minvws/nl-covid19-notification-app-website,HTML,Content & Translations\n---------------------\...
1,heremaps/here-covid-19-tracker,JavaScript,"# HERE-COVID-2019\n\n*Update: March 17, 2020 -..."
2,trekhleb/covid-19,JavaScript,# Coronavirus (COVID-19) Dashboard\n\n📈 [**Cor...
3,PubInv/covid19-vent-list,,# COVID-19 Ventilator Projects and Resources w...
4,JieYingWu/COVID-19_US_County-level_Summaries,HTML,# County-level Socioeconomic Data for Predicti...
...,...,...,...
198,simonblowsnow/COVID-19-Visualization,Python,\n# 2019新型冠状病毒疫情数据可视化、疫情历史数据分析、数据更新、数据清洗行政区域代码...
199,thepanacealab/covid19_twitter,Jupyter Notebook,## Latest Updates:\n\n2/12/23 Version 153 of t...
200,Coders-Of-XDA-OT/covid19-status-android,Java,# COVID-19 Status Android app\r\n\r\nAn androi...
201,lispc/covid19-citymap-china,JavaScript,# 中国新型冠状病毒肺炎疫情地级市图\n\n在网页上通过新浪新闻的 API 获得 __现存_...


In [3]:
# Show the rows whose README text is an empty string (nothing to analyse there).
df[df['readme_contents'] == '']

Unnamed: 0,repo,language,readme_contents
5,elcronos/COVID-19,JavaScript,
7,WorldHealthOrganization/app,Dart,
60,joanby/covid19,HTML,
76,k-sys/covid-19,Jupyter Notebook,


In [4]:
# Keep only repositories that actually have README text.
# .copy() makes the filtered frame independent of the original, so columns can
# be added to it later without pandas treating it as a slice of another frame.
df = df[df['readme_contents'] != ''].copy()

In [5]:
def basic_clean(string):
    '''
    Normalize raw text for downstream NLP steps.

    Steps, in order:
      1. lowercase the text,
      2. NFKD-normalize it and drop every character that cannot be
         represented in ASCII,
      3. remove everything that is not a lowercase letter, a digit,
         an apostrophe or whitespace.

    Returns the cleaned string.
    '''
    lowered = string.lower()

    # Decompose accented characters so the base letter survives the ASCII round-trip.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Strip remaining special characters (replaced with nothing, not a space).
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)

In [6]:
def tokenize(string):
    '''
    Run the text through NLTK's ToktokTokenizer.

    The tokens are returned joined back into a single string
    (return_str=True), not as a list.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str = True)

In [7]:
def stem(string):
    '''
    Apply the Porter stemmer to every whitespace-separated word of the text.

    Returns the stemmed words joined back together with single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in string.split())

In [8]:
def lemmatize(string):
    '''
    Apply the WordNet lemmatizer to every whitespace-separated word of the text.

    Returns the lemmatized words joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [9]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    Remove stopwords from a text.

    string:        text whose whitespace-separated words are filtered
    extra_words:   additional words to treat as stopwords
    exclude_words: words to take out of the standard English stopword list
                   (i.e. words that should be kept in the text)

    Returns the remaining words joined into a single space-separated string.
    '''
    # English stopwords, minus the exclusions, plus the caller's extras.
    blocked = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

    kept = [token for token in string.split() if token not in blocked]

    return ' '.join(kept)

In [10]:
def prep_data(df, column, extra_words=[], exclude_words=[]):
    '''
    Build the cleaned text columns used for modelling.

    df:            dataframe containing 'repo', 'language' and the text column
    column:        name of the column holding the raw text
    extra_words:   additional words to remove as stopwords
    exclude_words: words to keep even though they are standard stopwords

    Returns a new dataframe with the columns
    'repo', 'language', 'clean', 'stemmed' and 'lemmatized', where
      - 'clean' is the text after basic_clean, tokenize and remove_stopwords,
      - 'stemmed' is the 'clean' text run through stem,
      - 'lemmatized' is the 'clean' text run through lemmatize.

    The dataframe passed in is not modified.
    '''
    # Work on a copy: the caller's frame is often a filtered slice, and adding
    # columns to it in place would both mutate the caller's data and trigger
    # pandas' chained-assignment warning.
    prepped = df.copy()

    # clean -> tokenize -> remove stopwords
    prepped['clean'] = (
        prepped[column]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords,
               extra_words=extra_words,
               exclude_words=exclude_words)
    )

    # stem the already cleaned text
    prepped['stemmed'] = prepped['clean'].apply(stem)

    # lemmatize the already cleaned text
    prepped['lemmatized'] = prepped['clean'].apply(lemmatize)

    return prepped[['repo', 'language', 'clean', 'stemmed', 'lemmatized']]

In [11]:
# Replace the raw frame with the prepared one: repo, language and the
# clean / stemmed / lemmatized versions of the README text.
df = prep_data(df, 'readme_contents')
df

Unnamed: 0,repo,language,clean,stemmed,lemmatized
0,minvws/nl-covid19-notification-app-website,HTML,content translations add question add feature ...,content translat add question add featur creat...,content translation add question add feature c...
1,heremaps/here-covid-19-tracker,JavaScript,herecovid2019 update march 17 2020 production ...,herecovid2019 updat march 17 2020 product vers...,herecovid2019 update march 17 2020 production ...
2,trekhleb/covid-19,JavaScript,coronavirus covid19 dashboard coronavirus covi...,coronaviru covid19 dashboard coronaviru covid1...,coronavirus covid19 dashboard coronavirus covi...
3,PubInv/covid19-vent-list,,covid19 ventilator projects resources faqs lon...,covid19 ventil project resourc faq longer acti...,covid19 ventilator project resource faq longer...
4,JieYingWu/COVID-19_US_County-level_Summaries,HTML,countylevel socioeconomic data predictive mode...,countylevel socioeconom data predict model epi...,countylevel socioeconomic data predictive mode...
...,...,...,...,...,...
198,simonblowsnow/COVID-19-Visualization,Python,2019 covid192019ncov wuhancoronavirus visualiz...,2019 covid192019ncov wuhancoronaviru visual an...,2019 covid192019ncov wuhancoronavirus visualiz...
199,thepanacealab/covid19_twitter,Jupyter Notebook,latest updates 21223 version 153 dataset daily...,latest updat 21223 version 153 dataset daili d...,latest update 21223 version 153 dataset daily ...
200,Coders-Of-XDA-OT/covid19-status-android,Java,covid19 status android app android app track c...,covid19 statu android app android app track co...,covid19 status android app android app track c...
201,lispc/covid19-citymap-china,JavaScript,api github pages httpslispcgithubiocovid19city...,api github page httpslispcgithubiocovid19citym...,api github page httpslispcgithubiocovid19citym...


In [12]:
# Count how many repositories carry each language label.
df['language'].value_counts()

Python              38
JavaScript          36
Jupyter Notebook    29
HTML                19
R                    9
TypeScript           7
Kotlin               7
Vue                  5
Ruby                 3
TeX                  2
Dart                 2
PHP                  2
Go                   2
Objective-C          1
PowerShell           1
TSQL                 1
SCSS                 1
C#                   1
CSS                  1
Svelte               1
Stan                 1
Java                 1
Name: language, dtype: int64

In [33]:
def seperate_language(df, desired_languages=('Python', 'JavaScript', 'Jupyter Notebook', 'HTML', 'R')):
    '''
    Collapse every language outside the desired set into a single 'others' label.

    df:                dataframe with a 'language' column
    desired_languages: labels to keep as-is; defaults to the five languages
                       kept as separate classes in this notebook

    The 'language' column of df is overwritten in place and the same dataframe
    is returned. Values not in desired_languages (missing values included,
    since isin() is False for them) become 'others'.
    '''
    mask = df['language'].isin(list(desired_languages))

    # Use numpy directly: the pd.np alias was deprecated in pandas 1.0 and
    # no longer exists in pandas 2.x.
    df['language'] = np.where(mask, df['language'], 'others')

    return df

In [13]:
desired_languages = ['Python', 'JavaScript', 'Jupyter Notebook', 'HTML', 'R']

# True for rows whose language is one of the labels kept as its own class.
mask = df['language'].isin(desired_languages)

# Everything else (missing languages included) is grouped under 'others'.
# numpy is called directly because the pd.np alias was deprecated in
# pandas 1.0 and no longer exists in pandas 2.x.
df['language'] = np.where(mask, df['language'], 'others')

df

Unnamed: 0,repo,language,clean,stemmed,lemmatized
0,minvws/nl-covid19-notification-app-website,HTML,content translations add question add feature ...,content translat add question add featur creat...,content translation add question add feature c...
1,heremaps/here-covid-19-tracker,JavaScript,herecovid2019 update march 17 2020 production ...,herecovid2019 updat march 17 2020 product vers...,herecovid2019 update march 17 2020 production ...
2,trekhleb/covid-19,JavaScript,coronavirus covid19 dashboard coronavirus covi...,coronaviru covid19 dashboard coronaviru covid1...,coronavirus covid19 dashboard coronavirus covi...
3,PubInv/covid19-vent-list,others,covid19 ventilator projects resources faqs lon...,covid19 ventil project resourc faq longer acti...,covid19 ventilator project resource faq longer...
4,JieYingWu/COVID-19_US_County-level_Summaries,HTML,countylevel socioeconomic data predictive mode...,countylevel socioeconom data predict model epi...,countylevel socioeconomic data predictive mode...
...,...,...,...,...,...
198,simonblowsnow/COVID-19-Visualization,Python,2019 covid192019ncov wuhancoronavirus visualiz...,2019 covid192019ncov wuhancoronaviru visual an...,2019 covid192019ncov wuhancoronavirus visualiz...
199,thepanacealab/covid19_twitter,Jupyter Notebook,latest updates 21223 version 153 dataset daily...,latest updat 21223 version 153 dataset daili d...,latest update 21223 version 153 dataset daily ...
200,Coders-Of-XDA-OT/covid19-status-android,others,covid19 status android app android app track c...,covid19 statu android app android app track co...,covid19 status android app android app track c...
201,lispc/covid19-citymap-china,JavaScript,api github pages httpslispcgithubiocovid19city...,api github page httpslispcgithubiocovid19citym...,api github page httpslispcgithubiocovid19citym...


In [18]:
# Class distribution of the language column at this point in the notebook.
df['language'].value_counts()

others              68
Python              38
JavaScript          36
Jupyter Notebook    29
HTML                19
R                    9
Name: language, dtype: int64

In [19]:
# Total number of rows in the language column.
len(df['language'])

199

In [29]:
def train_val_test(df, col, seed=42):
    '''
    Split a dataframe into train, validate and test subsets.

    df:   dataframe to split
    col:  name of the target column; both splits are stratified on it
    seed: random_state passed to both splits so the result is reproducible

    70% of the rows go to train; the remaining 30% is split in half into
    validate and test.

    Returns the tuple (train, validate, test).
    '''
    # First cut: 70% train, 30% held out.
    train, holdout = train_test_split(df, train_size = 0.7, random_state = seed, stratify = df[col])

    # Second cut: split the held-out rows evenly into validate and test.
    validate, test = train_test_split(holdout, test_size = 0.5, random_state = seed, stratify = holdout[col])

    return train, validate, test

In [30]:
# Split the prepared data, stratifying on the language label.
train, validate, test = train_val_test(df, 'language')

# Check the size of each subset.
train.shape, validate.shape, test.shape

((139, 5), (30, 5), (30, 5))

In [32]:
# Display the training subset.
train

Unnamed: 0,repo,language,clean,stemmed,lemmatized
163,dssg-pt/covid19pt-data,Jupyter Notebook,dados relativos pandemia covid19 em portugal u...,dado relativo pandemia covid19 em portug ultim...,dado relativos pandemia covid19 em portugal ul...
192,kaz-ogiwara/covid19,HTML,coronavirus disease covid19 situation report j...,coronaviru diseas covid19 situat report japan ...,coronavirus disease covid19 situation report j...
63,disease-sh/API,JavaScript,logopublicassetsimgiconlongpng brbr dockerhub ...,logopublicassetsimgiconlongpng brbr dockerhub ...,logopublicassetsimgiconlongpng brbr dockerhub ...
17,pennsignals/chime,Python,chime looking chime repo moved please update f...,chime look chime repo move pleas updat fork ht...,chime looking chime repo moved please update f...
45,arpanmangal/CovidAID,Python,covidaid detection covid19 xray images present...,covidaid detect covid19 xray imag present covi...,covidaid detection covid19 xray image present ...
...,...,...,...,...,...
88,PatilShreyas/Covid19-Notifier-IN,others,p aligncenter img srcappsrcmainiclauncherplays...,p aligncent img srcappsrcmainiclauncherplaysto...,p aligncenter img srcappsrcmainiclauncherplays...
140,minvws/nl-covid19-notification-app-design,others,covid19 notification app design het ontwikkele...,covid19 notif app design het ontwikkelen en on...,covid19 notification app design het ontwikkele...
27,MinCiencia/Datos-COVID19,Jupyter Notebook,datoscovid19 el objetivo de la mesa de datos c...,datoscovid19 el objetivo de la mesa de dato co...,datoscovid19 el objetivo de la mesa de datos c...
95,sfu-db/covid19-datasets,HTML,covid19 datasets apis list projects vision dev...,covid19 dataset api list project vision develo...,covid19 datasets apis list project vision deve...
