In [1]:
# le basics
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from time import strftime

# le mon creation
import env
import acquire as a
import prepare as p
import explore as e
from pprint import pprint

# for text digestion
import unicodedata
import re
import json
from bs4 import BeautifulSoup

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer


# what's cookin', good lookin'?
import warnings
warnings.filterwarnings('ignore')

# Acquire

In [2]:
df_acquire = a.acquire_readmes()

In [3]:
df = df_acquire.copy()

In [4]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,microsoft/terminal,C++,![terminal-logos](https://user-images.githubus...
1,microsoft/PowerToys,C#,# Microsoft PowerToys\n\n![Hero image for Micr...
2,huggingface/transformers,Python,<!---\nCopyright 2020 The HuggingFace Team. Al...
3,rust-lang/rust,Rust,# The Rust Programming Language\n\nThis is the...
4,mtdvio/every-programmer-should-know,,> *[Join our community](https://metadevelopmen...


In [5]:
df.shape

(683, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 682
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             683 non-null    object
 1   language         619 non-null    object
 2   readme_contents  681 non-null    object
dtypes: object(3)
memory usage: 21.3+ KB


# Prepare

In [7]:
df_prepare = p.prep_df_for_nlp(df, 'readme_contents', extra_words = p.EXTRA_WORDS)

In [8]:
df = df_prepare.copy()

In [9]:
df.shape

(683, 8)

In [10]:
df.head()

Unnamed: 0,repo,language,readme_contents,clean,stem,lemmatized,username,lemmatized_len
0,terminal,Other,![terminal-logos](https://user-images.githubus...,terminallogoshttpsuserimagesgithubusercontentc...,terminallogoshttpsuserimagesgithubusercontentc...,terminallogoshttpsuserimagesgithubusercontentc...,microsoft,11128
1,PowerToys,Other,# Microsoft PowerToys\n\n![Hero image for Micr...,microsoft powertoys hero microsoft powertoysdo...,microsoft powertoy hero microsoft powertoysdoc...,microsoft powertoys hero microsoft powertoysdo...,microsoft,9707
2,transformers,Python,<!---\nCopyright 2020 The HuggingFace Team. Al...,copyright 2020 huggingface team rights reserve...,copyright 2020 huggingfac team right reserv li...,copyright 2020 huggingface team right reserved...,huggingface,58996
3,rust,Other,# The Rust Programming Language\n\nThis is the...,rust programming language main source code rep...,rust program languag main sourc code repositor...,rust programming language main source code rep...,rust-lang,6474
4,every-programmer-should-know,Not Listed,> *[Join our community](https://metadevelopmen...,join communityhttpsmetadevelopmentio professio...,join communityhttpsmetadevelopmentio professio...,join communityhttpsmetadevelopmentio professio...,mtdvio,13700


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 682
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   repo             683 non-null    object  
 1   language         683 non-null    category
 2   readme_contents  683 non-null    object  
 3   clean            683 non-null    object  
 4   stem             683 non-null    object  
 5   lemmatized       683 non-null    object  
 6   username         683 non-null    object  
 7   lemmatized_len   683 non-null    int64   
dtypes: category(1), int64(1), object(6)
memory usage: 43.7+ KB


In [12]:
train, validate, test = p.split_data(df, 'language')

In [13]:
df = train.copy()

In [14]:
df.index = range(len(df.index))

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443 entries, 0 to 442
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   repo             443 non-null    object  
 1   language         443 non-null    category
 2   readme_contents  443 non-null    object  
 3   clean            443 non-null    object  
 4   stem             443 non-null    object  
 5   lemmatized       443 non-null    object  
 6   username         443 non-null    object  
 7   lemmatized_len   443 non-null    int64   
dtypes: category(1), int64(1), object(6)
memory usage: 25.1+ KB


# Explore based on Ryan's notebooks

# Top five languages

### make the following cell Code to generate language series

JavaScript_words_series = (' '.join(df[df.language == 'JavaScript']['readme_contents']))
Python_words_series = (' '.join(df[df.language == 'Python']['readme_contents']))
TypeScript_words_series = (' '.join(df[df.language == 'TypeScript']['readme_contents']))
Go_words_words_series = (' '.join(df[df.language == 'Go']['readme_contents']))
Java_words_series = (' '.join(df[df.language == 'Java']['readme_contents']))

all_words_series = (' '.join(df['readme_contents']))


# Thursday: Freqency of a word within a single readme

In [16]:
# plan have loop run through the contenct of every read me and count the words that are in that readme. 
# ham_words = clean(' '.join(df[df.label == 'ham']['text']))
# readme_words = join ['lemmatized'] but df.label = row
df.head()
## let's get some sights on word frequency by taking our words back apart
# we will split each set of words by the spaces,
# turn that into a list, cast that list as a Series,
# and then take the value counts of that Series
# We will do this for each type of word present
#ham_freq = pd.Series(ham_words).value_counts()

Unnamed: 0,repo,language,readme_contents,clean,stem,lemmatized,username,lemmatized_len
0,awesome-actions,Not Listed,"<p align=""center"">\n <br>\n <img src=""awes...",p aligncenter br srcawesomeactionspng width150...,p aligncent br srcawesomeactionspng width150 b...,p aligncenter br srcawesomeactionspng width150...,sdras,34287
1,InstaPy,Python,"<p align=""center"">\n <img src=""https://i.imgu...",p aligncenter srchttpsiimgurcomsjzfzsljpg widt...,p aligncent srchttpsiimgurcomsjzfzsljpg width1...,p aligncenter srchttpsiimgurcomsjzfzsljpg widt...,InstaPy,2643
2,osquery,Other,"# osquery\n\n<p align=""center"">\n<img alt=""osq...",osquery p aligncenter altosquery logo width200...,osqueri p aligncent altosqueri logo width200 s...,osquery p aligncenter altosquery logo width200...,osquery,4604
3,Best-App,Not Listed,\nBest App\n----\n\n*经常会有朋友想知道有哪些 Apps 或 服务 是值...,best app apps bestapp ios app httpsappsappleco...,best app app bestapp io app httpsappsapplecomc...,best app apps bestapp io app httpsappsapplecom...,hzlzh,17437
4,actix-web,Other,actix-web/README.md,actixwebreadmemd,actixwebreadmemd,actixwebreadmemd,actix,16


In [17]:
df.lemmatized.info()

<class 'pandas.core.series.Series'>
RangeIndex: 443 entries, 0 to 442
Series name: lemmatized
Non-Null Count  Dtype 
--------------  ----- 
443 non-null    object
dtypes: object(1)
memory usage: 3.6+ KB


In [18]:
wnl = nltk.stem.WordNetLemmatizer()

In [19]:
lemmas = [wnl.lemmatize(word) for word in df.lemmatized[1]]

In [20]:
df1 = df.lemmatized.str.split().apply(lambda x: Counter(x))

In [21]:
df1 

0      {'p': 2, 'aligncenter': 1, 'br': 2, 'srcawesom...
1      {'p': 7, 'aligncenter': 3, 'srchttpsiimgurcoms...
2      {'osquery': 12, 'p': 4, 'aligncenter': 2, 'alt...
3      {'best': 1, 'app': 12, 'apps': 9, 'bestapp': 1...
4                                {'actixwebreadmemd': 1}
                             ...                        
438    {'slyhttpdarsainsly': 1, 'javascript': 1, 'lib...
439    {'20230112': 1, '0000': 1, 'android': 1, 'wind...
440    {'english': 2, 'repo': 1, 'gitbook': 1, 'branc...
441                                      {'readmemd': 1}
442    {'github': 9, 'licensehttpsimgshieldsiogithubl...
Name: lemmatized, Length: 443, dtype: object

In [22]:
def lemm_counter(df):
    df.lemmatized.str.split().apply(lambda x: Counter(x))
    return df

In [23]:
ones = e.get_ngram_frequency(df.lemmatized)

In [24]:
freq_words = list(ones[ones >10].index)

In [25]:
# BEWARE THIS BOX IS REALLY LONG!!!!!

#for word in freq_words:
    #print(f'{word}\n----------------\n{df[df.lemmatized.str.contains(word)].index.value_counts()}\n')


In [26]:
print(type(df.lemmatized))

<class 'pandas.core.series.Series'>


#### beginning of block i want in loop

In [27]:
df.head()

Unnamed: 0,repo,language,readme_contents,clean,stem,lemmatized,username,lemmatized_len
0,awesome-actions,Not Listed,"<p align=""center"">\n <br>\n <img src=""awes...",p aligncenter br srcawesomeactionspng width150...,p aligncent br srcawesomeactionspng width150 b...,p aligncenter br srcawesomeactionspng width150...,sdras,34287
1,InstaPy,Python,"<p align=""center"">\n <img src=""https://i.imgu...",p aligncenter srchttpsiimgurcomsjzfzsljpg widt...,p aligncent srchttpsiimgurcomsjzfzsljpg width1...,p aligncenter srchttpsiimgurcomsjzfzsljpg widt...,InstaPy,2643
2,osquery,Other,"# osquery\n\n<p align=""center"">\n<img alt=""osq...",osquery p aligncenter altosquery logo width200...,osqueri p aligncent altosqueri logo width200 s...,osquery p aligncenter altosquery logo width200...,osquery,4604
3,Best-App,Not Listed,\nBest App\n----\n\n*经常会有朋友想知道有哪些 Apps 或 服务 是值...,best app apps bestapp ios app httpsappsappleco...,best app app bestapp io app httpsappsapplecomc...,best app apps bestapp io app httpsappsapplecom...,hzlzh,17437
4,actix-web,Other,actix-web/README.md,actixwebreadmemd,actixwebreadmemd,actixwebreadmemd,actix,16


In [28]:
row1 = df.lemmatized[1]

In [29]:
type(row1)

str

In [30]:
row1_series = pd.Series(row1)

#### end of block i want in loop

In [31]:
e.top_five_words(row1_series)

Unnamed: 0,word_1,count_1,word_2,count_2,word_3,count_3,word_4,count_4,word_5,count_5
0,p,7,talk,3,aligncenter,3,project,3,community,3


In [32]:
# now to make a loop that runs through the rows of and converts the str of the lemmatized column into series


In [33]:
e.top_five_words(df.lemmatized)

Unnamed: 0,word_1,count_1,word_2,count_2,word_3,count_3,word_4,count_4,word_5,count_5
0,github,120,action,106,run,45,deploy,29,pull,27
1,p,7,talk,3,aligncenter,3,project,3,community,3
2,release,13,osquery,12,sql,11,select,7,minor,5
3,pro,20,app,12,999,11,499,11,apps,9
4,actixwebreadmemd,1,,0,,0,,0,,0
...,...,...,...,...,...,...,...,...,...,...
438,1,6,var,3,jquery,3,navigation,3,instance,3
439,v2ray,2,20230112,1,vmesseyjhzgqioiaimtuumja0ljmxljyziiwgimfpzci6i...,1,vmesseyj2ijogijiilcaichmioiaiz2l0ahvilmnvbs9mc...,1,vmesseyj2ijogijiilcaichmioiaiz2l0ahvilmnvbs9mc...,1
440,p,6,aligncenter,4,hrefhttpsgithubcomhaizlinfeinterviewstargazersimg,4,altstatra,4,hrefhttpswwwgeekxhcomimg,3
441,readmemd,1,,0,,0,,0,,0


# looking into significance

## Moving into testing frequency per readme with TF-IDF

In [34]:
document = df.lemmatized[1]

In [35]:


# clean up the text
document = document.lower().replace(',', '').replace('.', '')
# transform into a series
words = pd.Series(document.split())

# From the Series we can extract the value_counts, which is our raw count
# for term frequency. Once we have the raw counts, we can calculate the
# other measures.
(pd.DataFrame({'raw_count': words.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum()))


Unnamed: 0,raw_count,frequency
p,7,0.044304
talk,3,0.018987
aligncenter,3,0.018987
project,3,0.018987
community,3,0.018987
...,...,...
work,1,0.006329
channelhttpsdiscordggfdetsht,1,0.006329
discord,1,0.006329
instapyhttpstwittercominstapy,1,0.006329


In [36]:
row1 = df.lemmatized[1]
row1

'p aligncenter srchttpsiimgurcomsjzfzsljpg width154 h1 aligncenterinstapyh1 p aligncentertooling bautomatesb social medium interaction farm like comment follower instagram implemented python using selenium modulep p aligncenter hrefhttpsgithubcomtimgrossmanninstapyblobmasterlicense srchttpsimgshieldsiobadgelicensegplv3bluesvg hrefhttpsgithubcomseleniumhqselenium srchttpsimgshieldsiobadgebuilt20withseleniumyellowsvg hrefhttpswwwpythonorg srchttpsimgshieldsiobadgebuilt20withpython3redsvg hrefhttpswwwgithubcomtimgrossmanninstapybacker srchttpsopencollectivecominstapybackersbadgesvg hrefhttpswwwgithubcomtimgrossmanninstapysponsors srchttpsopencollectivecominstapysponsorsbadgesvg hrefhttpsdiscordggfdetsht srchttpsimgshieldsiodiscord510385886869979136svg p p twitter instapyhttpstwittercominstapy discord channelhttpsdiscordggfdetsht work freecodingcamphttpswwwfreecodecamporgnewsmyopensourceinstagrambotgotme2500realfollowersfor5inservercostse40491358340 talk automating instagramhttpsyoutube4tm

In [37]:
type(row1)

str

In [38]:
words = pd.Series(row1.split())
words

0                                p
1                      aligncenter
2      srchttpsiimgurcomsjzfzsljpg
3                         width154
4                               h1
                  ...             
153                            get
154                         banned
155                            due
156                      extensive
157                            use
Length: 158, dtype: object

In [39]:
words_df=(pd.DataFrame({'raw_count': words.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max())

SyntaxError: unexpected EOF while parsing (1586270342.py, line 3)

In [None]:
words_df

In [None]:
words_df_augfreq = words_df[words_df.augmented_frequency>0.3]

In [None]:
words_df_augfreq.set_index()

In [None]:
tfs = []

# We'll caclulate the tf-idf value for every word across every document

# Start by iterating over all the documents
for doc, text in documents.items():
    # We'll make a data frame that contains the tf for every word in every document
    df = (pd.Series(text.split())
          .value_counts()
          .reset_index()
          .set_axis(['word', 'raw_count'], axis=1, inplace=False)
          .assign(tf=lambda df: df.raw_count / df.shape[0])
          .drop(columns='raw_count')
          .assign(doc=doc))
    # Then add that data frame to our list
    tfs.append(df)

# We'll then concatenate all the tf values together.
(pd.concat(tfs)
 # calculate the idf value for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # then use the if and idf values to calculate tf-idf 
 .assign(tf_idf=lambda df: df.idf * df.tf)
 .drop(columns=['tf', 'idf'])
 .sort_values(by='tf_idf', ascending=False))

# working on repo titles

In [None]:
# is there significant indication of language in the title alone?
# titles will initally be lemmatized

In [40]:
df.head()

Unnamed: 0,repo,language,readme_contents,clean,stem,lemmatized,username,lemmatized_len
0,awesome-actions,Not Listed,"<p align=""center"">\n <br>\n <img src=""awes...",p aligncenter br srcawesomeactionspng width150...,p aligncent br srcawesomeactionspng width150 b...,p aligncenter br srcawesomeactionspng width150...,sdras,34287
1,InstaPy,Python,"<p align=""center"">\n <img src=""https://i.imgu...",p aligncenter srchttpsiimgurcomsjzfzsljpg widt...,p aligncent srchttpsiimgurcomsjzfzsljpg width1...,p aligncenter srchttpsiimgurcomsjzfzsljpg widt...,InstaPy,2643
2,osquery,Other,"# osquery\n\n<p align=""center"">\n<img alt=""osq...",osquery p aligncenter altosquery logo width200...,osqueri p aligncent altosqueri logo width200 s...,osquery p aligncenter altosquery logo width200...,osquery,4604
3,Best-App,Not Listed,\nBest App\n----\n\n*经常会有朋友想知道有哪些 Apps 或 服务 是值...,best app apps bestapp ios app httpsappsappleco...,best app app bestapp io app httpsappsapplecomc...,best app apps bestapp io app httpsappsapplecom...,hzlzh,17437
4,actix-web,Other,actix-web/README.md,actixwebreadmemd,actixwebreadmemd,actixwebreadmemd,actix,16


In [41]:
tok_name = p.tokenize('repo')

In [42]:
df[‘clean_title’] = df[‘repo’].apply(p.squeaky_clean)

'repo'

## column for lemmatized repo names

In [63]:
df.head(1)

Unnamed: 0,repo,language,readme_contents,clean,stem,lemmatized,username,lemmatized_len,clean_title,dirty_title
0,awesome-actions,Not Listed,"<p align=""center"">\n <br>\n <img src=""awes...",p aligncenter br srcawesomeactionspng width150...,p aligncent br srcawesomeactionspng width150 b...,p aligncenter br srcawesomeactionspng width150...,sdras,34287,awesomeactions,awesome-actions


In [43]:
df['clean_title'] = df['repo'].apply(p.squeaky_clean)

In [53]:
df['dirty_title'] = df['repo']

In [69]:
df_clean_title_count = df.groupby(['language'])['clean_title'].count()
df_clean_title_count

language
Go             38
Java           37
JavaScript     94
Not Listed     41
Other         147
Python         46
TypeScript     40
Name: clean_title, dtype: int64

In [82]:
df_clean_title_count = df.groupby(['language', 'clean_title'])['clean_title'].count()
df_clean_title_count

language    clean_title          
Go          12306                    0
            30daysofjavascript       0
            30secondsofcss           0
            50projects50days         0
            955wlb                   0
                                    ..
TypeScript  youmightnotneedjquery    0
            zheng                    0
            zipline                  0
            zshsyntaxhighlighting    0
            zxing                    0
Name: clean_title, Length: 3087, dtype: int64

In [83]:
((df.language == 'Go') == True).value_counts()

False    405
True      38
Name: language, dtype: int64

In [80]:
type(poop)

pandas.core.frame.DataFrame

# conclusion: titles are insignificant
### out of 443 rows: 
- when titles are cleand, 441 have no repeating words
- when titles are left as their original, 443 have no repeating words