In [15]:
import re
import unicodedata
import pandas as pd
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the README dataset from CSV; the file's first column is used as the index.
df = pd.read_csv('inflation_readme.csv', index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,repo,language,readme_contents
0,cashapp/InflationInject,Kotlin,# Inflation Injection\n\nConstructor-inject vi...
1,InflationX/ViewPump,Kotlin,ViewPump\n========\n\nView inflation you can i...
2,rdeits/iris-distro,Matlab,Introduction\n============\n\nThis package con...
3,uhussain/WebCrawlerForOnlineInflation,Python,# Table of Contents \n1. [Introduction](README...
4,sandes/zipfly,Python,[![Build Status](https://img.shields.io/circle...


In [3]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 848 entries, 0 to 847
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             848 non-null    object
 1   language         727 non-null    object
 2   readme_contents  611 non-null    object
dtypes: object(3)
memory usage: 26.5+ KB


In [4]:
# Count missing values per column.
df.isnull().sum()

repo                 0
language           121
readme_contents    237
dtype: int64

In [6]:
# Inspect the rows whose README text is missing.
df[df.readme_contents.isnull()]

Unnamed: 0,repo,language,readme_contents
17,future-tense/pool.futuretense.io,HTML,
19,gwern/gwern.net,Haskell,
55,kranthikittu/Android_inflation,Kotlin,
60,itamarcaspi/core-inflation,HTML,
67,motherjones/inflation-calculator,,
...,...,...,...
842,LUNDR/inflation_forecasts_with_tf,Jupyter Notebook,
843,millerngit/RateOfInflation,C++,
844,iamdingkai/what-s-driving-inflation,Jupyter Notebook,
846,fscheler/Inflation_Tracker_World,HTML,


In [None]:
# Drop every row that contains a null value in any column

In [7]:
# dropna() with its default arguments removes every row that has a null in any column.
df = df.dropna()
# (rows, columns) remaining after the drop.
df.shape

(523, 3)

### Prepare

In [9]:
def basic_clean(text):
    """Lowercase ``text``, reduce it to ASCII and strip punctuation.

    Steps, in order:
      1. lowercase the input;
      2. NFKD-normalize it and round-trip through ASCII, discarding any
         character that has no ASCII representation;
      3. delete every character that is not a-z, 0-9, an apostrophe or
         whitespace.

    Removed characters are deleted, not replaced by a space, so tokens joined
    only by punctuation (e.g. ``a-b``) are fused into one (``ab``).

    Returns the cleaned string.
    """
    lowered = text.lower()
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)

In [10]:
def tokenize(basic_clean_text):
    """Tokenize already-cleaned text with NLTK's ToktokTokenizer.

    The tokenizer is called with ``return_str=True``, so the result is a
    single string rather than a list of tokens.
    """
    return nltk.tokenize.ToktokTokenizer().tokenize(basic_clean_text, return_str=True)

In [11]:
def stem(tokenized_text):
    """Porter-stem each whitespace-separated token of ``tokenized_text``.

    Returns the stems joined back together with single spaces.
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, tokenized_text.split()))

In [12]:
def lemmatize(tokenized_text):
    """Lemmatize each whitespace-separated token with WordNetLemmatizer.

    Returns the lemmas joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token in tokenized_text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [13]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """Remove English stopwords from a whitespace-delimited string.

    Parameters
    ----------
    string : str
        Text whose tokens are separated by whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords; they are removed from the
        text along with the standard NLTK English stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the stopword list, i.e. words that must be kept
        in the text. Words that are not stopwords are silently ignored.
        Applied after ``extra_words``, so a word given in both is kept.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership tests for every token checked below.
    stopword_set = set(stopwords.words('english'))

    if extra_words:
        stopword_set.update(extra_words)

    if exclude_words:
        # difference_update ignores absent words, whereas list.remove raised
        # ValueError for any word that was not already a stopword.
        stopword_set.difference_update(exclude_words)

    return ' '.join(word for word in string.split() if word not in stopword_set)

# no_stop = remove_stopwords(lemmatized)

In [None]:
# extra_words = ['the','no']
# no_stop1 = remove_stopwords(lemmatized, extra_words=extra_words)
# no_stop1

In [16]:
# Clean, tokenize and strip stopwords from each README in a single pass.
df['clean'] = df['readme_contents'].apply(
    lambda readme: remove_stopwords(tokenize(basic_clean(readme)))
)
# Derive the stemmed and lemmatized variants from the cleaned text.
df['stemmed'] = df['clean'].apply(stem)
df['lemmatized'] = df['clean'].apply(lemmatize)
df.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,cashapp/InflationInject,Kotlin,# Inflation Injection\n\nConstructor-inject vi...,inflation injection constructorinject views xm...,inflat inject constructorinject view xml layou...,inflation injection constructorinject view xml...
1,InflationX/ViewPump,Kotlin,ViewPump\n========\n\nView inflation you can i...,viewpump view inflation intercept viewpump ins...,viewpump view inflat intercept viewpump instal...,viewpump view inflation intercept viewpump ins...
2,rdeits/iris-distro,Matlab,Introduction\n============\n\nThis package con...,introduction package contains iris algorithm i...,introduct packag contain iri algorithm iter co...,introduction package contains iris algorithm i...
3,uhussain/WebCrawlerForOnlineInflation,Python,# Table of Contents \n1. [Introduction](README...,table contents 1 introductionreadmemdintroduct...,tabl content 1 introductionreadmemdintroduct 2...,table content 1 introductionreadmemdintroducti...
4,sandes/zipfly,Python,[![Build Status](https://img.shields.io/circle...,build statushttpsimgshieldsiocirclecibuildgith...,build statushttpsimgshieldsiocirclecibuildgith...,build statushttpsimgshieldsiocirclecibuildgith...


In [18]:
# Ten most frequent repository languages.
df.language.value_counts().head(10)

Python              102
Jupyter Notebook     96
JavaScript           61
R                    55
HTML                 46
Java                 24
C++                  13
TypeScript           13
CSS                  10
TeX                   9
Name: language, dtype: int64

In [23]:
# Cast every column to pandas' 'string' dtype.
df =  df.astype('string')

In [29]:
# Rows whose language is 'Jupyter Notebook'.
df[df.language == 'Jupyter Notebook']

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
9,palewire/cpi,Jupyter Notebook,### Links * Docs: [palewi.re/docs/cpi/](https...,links docs palewiredocscpihttpspalewiredocscpi...,link doc palewiredocscpihttpspalewiredocscpi c...,link doc palewiredocscpihttpspalewiredocscpi c...
22,fipelle/replication-hasenzagl-et-al-2020,Jupyter Notebook,## A Model of the Fed's View on Inflation Thi...,model fed ' view inflation repository contains...,model fed ' view inflat repositori contain sou...,model fed ' view inflation repository contains...
26,rsvp/fecon235,Jupyter Notebook,## fecon235 :: Notebooks for financial economi...,fecon235 notebooks financial economics join ch...,fecon235 notebook financi econom join chat htt...,fecon235 notebook financial economics join cha...
59,koksoya/inflation-forecast,Jupyter Notebook,# inflation-forecast ## Objectives of the Pro...,inflationforecast objectives project define in...,inflationforecast object project defin inflat ...,inflationforecast objective project define inf...
88,fqroldan/Conquest,Jupyter Notebook,# Conquest Jupyter Notebooks replication of Th...,conquest jupyter notebooks replication conques...,conquest jupyt notebook replic conquest americ...,conquest jupyter notebook replication conquest...
...,...,...,...,...,...,...
826,Manu-Gr/Project---Forecasting-Inflation,Jupyter Notebook,# Project---Forecasting-Inflation * Analysing...,projectforecastinginflation analysing inflatio...,projectforecastinginfl analys inflat trend his...,projectforecastinginflation analysing inflatio...
827,longjivwamhi/Fandango-Ratings-Inflation-Analysis,Jupyter Notebook,# Fandango-Ratings-Inflation-Analysis,fandangoratingsinflationanalysis,fandangoratingsinflationanalysi,fandangoratingsinflationanalysis
829,kassiili/two-period-inflation,Jupyter Notebook,# two-period-inflation This code was used to ...,twoperiodinflation code used analyse nparticle...,twoperiodinfl code use analys nparticl simul t...,twoperiodinflation code used analyse nparticle...
838,lowsaelee/us-inflation-data,Jupyter Notebook,# us-inflation-data Data source: https://www....,usinflationdata data source httpswwwkagglecomv...,usinflationdata data sourc httpswwwkagglecomva...,usinflationdata data source httpswwwkagglecomv...


### Use only the top 5 languages for prediction

In [26]:
# Extra words to filter out in clean(), on top of NLTK's English stopwords.
ADDITIONAL_STOPWORDS = []

def clean(text):
    """Normalize ``text`` and return its lemmatized, stopword-free words.

    The text is NFKD-normalized, reduced to ASCII (characters without an
    ASCII form are dropped), lowercased, and stripped of every character that
    is neither a word character nor whitespace. The result is split on
    whitespace; words found in NLTK's English stopword list or in
    ``ADDITIONAL_STOPWORDS`` are discarded and the rest are lemmatized with
    WordNetLemmatizer.

    Returns
    -------
    list of str
        Lemmatized words in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # Use a set for constant-time lookups; the name avoids shadowing the
    # module-level ``stopwords`` import.
    stopword_set = set(nltk.corpus.stopwords.words('english')).union(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    # Stopwords are filtered before lemmatization, so the check is made
    # against the surface form of each word.
    return [wnl.lemmatize(word) for word in words if word not in stopword_set]

In [27]:
def words_for_language(language):
    """Return clean()'s word list for all READMEs whose language equals ``language``.

    The matching README texts are joined with single spaces and passed to
    clean() as one document.
    """
    return clean(' '.join(df[df.language == language].readme_contents))

# Word lists for the five most frequent languages, plus the whole corpus.
python_words = words_for_language('Python')
jupyter_notebook_words = words_for_language('Jupyter Notebook')
javascript_words = words_for_language('JavaScript')
r_words = words_for_language('R')
html_words = words_for_language('HTML')
all_words = clean(' '.join(df.readme_contents))

In [28]:
# Display the word list built from the Python READMEs.
python_words

['table',
 'content',
 '1',
 'introductionreadmemdintroduction',
 '2',
 'pipelinereadmemdpipeline',
 '3',
 'requirementsreadmemdrequirements',
 '4',
 'environment',
 'set',
 'upreadmemdenvironment20setup',
 '5',
 'repository',
 'structure',
 'run',
 'instructionsreadmemdrepository20structure20and20run20instructions',
 'introduction',
 'price',
 'crawler',
 'tracking',
 'price',
 'inflation',
 'project',
 'completed',
 '3',
 'week',
 'insight',
 'data',
 'engineering',
 'program',
 'new',
 'york',
 '20b',
 'session',
 'goal',
 'project',
 'calculate',
 'inflation',
 'rate',
 'first',
 'principle',
 'mean',
 'calculating',
 'inflation',
 'rate',
 'using',
 'price',
 'good',
 'service',
 'sold',
 'online',
 'project',
 'built',
 'pipleine',
 'use',
 'petabyte',
 'web',
 'page',
 'data',
 'contained',
 'common',
 'crawlhttpscommoncrawlorg',
 'archive',
 'web',
 'page',
 'content',
 'calculate',
 'inflation',
 'rate',
 'result',
 'used',
 'enhance',
 'investment',
 'strategy',
 'business',
