In [1]:
# notebook dependencies 
import os # used in caching
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# regular expression import
import re

# JSON import
import json

# importing BeautifulSoup for parsing HTML/XML
from bs4 import BeautifulSoup as BSoup

# request module for connecting to APIs
from requests import get

# text prepare modules
import acquire
import prepare

# Unicode library
import unicodedata

# natural language toolkit library/modules
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [2]:
# print NLTK's built-in list of English stop words
# (assumes the NLTK 'stopwords' corpus is already available locally -- TODO confirm)

print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
# load the acquired repo data (repo, language, readme_contents) from the local CSV into a df

df = pd.read_csv("metaverse.csv")

In [4]:
# inspect the raw df: column names, non-null counts, and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             998 non-null    object
 1   language         682 non-null    object
 2   readme_contents  741 non-null    object
dtypes: object(3)
memory usage: 23.5+ KB


In [5]:
# keep the three text columns and cast them to str
# missing values are filled with "" first: astype(str) on its own would turn NaN into
# the literal string "nan", which would later be cleaned and counted as a real word

df = df[["repo", "language", "readme_contents"]].fillna("").astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             998 non-null    object
 1   language         998 non-null    object
 2   readme_contents  998 non-null    object
dtypes: object(3)
memory usage: 23.5+ KB


In [6]:
# creating a function titled, 'basic_clean'
# lowercase everything
# normalize unicode characters
# replace non-alphanumeric characters with whitespace

def basic_clean(string):
    '''
    Lowercase, ASCII-normalize, and strip punctuation from a text string.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The lowercased text containing only a-z, 0-9, whitespace, and single
        quotes; every other character is replaced by a single space so that
        words joined by punctuation (e.g. "awesome-metaverse") stay separate.
    '''

    # lowercase the text
    string = string.lower()

    # decompose accented characters (NFKD), then drop anything that cannot be encoded as ASCII
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # keep only alphanumerics, whitespace, and single quotes: everything else becomes whitespace
    # (raw string so that "\s" is a regex class rather than an invalid string escape)
    string = re.sub(r"[^a-z0-9\s']", ' ', string)

    # return the string text
    return string

In [7]:
# viewing the clean/tokenize/stem/lemmatization process:
# each README goes through the prepare module's basic_clean -> tokenize -> remove_stopwords,
# and the stemmed / lemmatized columns are both derived from that cleaned text

df["clean"] = (
    df["readme_contents"]
    .apply(prepare.basic_clean)
    .apply(prepare.tokenize)
    .apply(
        prepare.remove_stopwords,
        include_words = ["metaverse", "Metaverse", "meta-verse", "Meta-verse", "meta verse", "Meta Verse", "Meta verse"],
    )
)
df["stemmed"] = df["clean"].apply(prepare.porter_stem)
df["lemmatized"] = df["clean"].apply(prepare.lemmatize)

df.head(20)

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,M3-org/awesome-metaverse,,# Awesome Metaverse [![Awesome](https://awesom...,awesome awesome https awesome badge svg https ...,awesom awesom http awesom badg svg http awesom...,awesome awesome http awesome badge svg http aw...
1,mvs-org/metaverse,C++,Metaverse Core Integration/staging Tree\n=====...,core integration staging tree build status htt...,core integr stage tree build statu http travi ...,core integration staging tree build status htt...
2,webaverse/app,JavaScript,"<img src=""docs/banner.jpeg"" width=100% />\n\n<...",img src docs banner jpeg width 100 align cente...,img src doc banner jpeg width 100 align center...,img src doc banner jpeg width 100 align center...
3,shadowcz007/awesome-metaverse,,"# awesome-metaverse\n<a href=""https://awesome....",awesome href https awesome target blank img al...,awesom href http awesom target blank img alt a...,awesome href http awesome target blank img alt...
4,vircadia/vircadia,C++,"<p align=""center""><a href=""https://vircadia.co...",align center href https vircadia img src inter...,align center href http vircadia img src interf...,align center href http vircadia img src interf...
5,AI4Finance-Foundation/FinRL-Meta,Jupyter Notebook,# FinRL-Meta: A Universe of Market Environment...,finrl meta universe market environments benchm...,finrl meta univers market environ benchmark da...,finrl meta universe market environment benchma...
6,bit-country/Metaverse-Network,Rust,"<p align=""center"">\n <img src=""https://github...",align center img src https bit country network...,align center img src http bit countri network ...,align center img src http bit country network ...
7,omigroup/omigroup,,# Open Metaverse Interoperability Group (OMI)\...,open interoperability group omi club https img...,open interoper group omi club http img shield ...,open interoperability group omi club http img ...
8,joaneeet7/Metaverse,JavaScript,,,,
9,houbb/awesome-metaverse-zh,Batchfile,# Awesome Metaverse [![Awesome](https://awesom...,awesome awesome https awesome badge svg https ...,awesom awesom http awesom badg svg http awesom...,awesome awesome http awesome badge svg http aw...


In [8]:
# saving the df that shows the clean/tokenize/stem/lemmatization process to a CSV file
# (index = False leaves the row index out of the file)

df.to_csv("cleaned_comparison_file.csv", index = False)

In [9]:
def mass_text_clean(text, include_words=None, exclude_words=None):
    '''
    Function to mass data-clean the original README repo files.

    Runs the text through prepare.basic_clean, then prepare.lemmatize, then
    prepare.remove_stopwords (forwarding include_words / exclude_words
    unchanged), and returns the result split on single spaces as a list.
    '''

    # clean first, then lemmatize the cleaned text
    lemmatized = prepare.lemmatize(prepare.basic_clean(text))

    # stop-word filtering, with the caller's word lists passed straight through
    filtered = prepare.remove_stopwords(
        lemmatized,
        include_words = include_words,
        exclude_words = exclude_words,
    )

    tokens = filtered.split(' ')
    return list(tokens)

In [10]:
def clean_data_objects(df):
    '''
    Function to clean the original data objects/df.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the "repo", "language", and
        "readme_contents" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only those three columns, cast to str, with
        missing values represented as empty strings. The shape of the
        result is printed as a side effect.
    '''

    text_columns = ["repo", "language", "readme_contents"]

    # fill missing values before casting: astype(str) alone would turn NaN into the
    # literal string "nan", which downstream text cleaning would treat as a real word
    df = df[text_columns].fillna("").astype(str)

    print(f'df shape: {df.shape}')

    return df

In [11]:
# testing function: re-load the raw CSV and run it through the prepare module's clean_data_objects
df = pd.read_csv("metaverse.csv")

df = prepare.clean_data_objects(df)
df.head()

df shape: (998, 3)


Unnamed: 0,repo,language,readme_contents
0,M3-org/awesome-metaverse,,# Awesome Metaverse [![Awesome](https://awesom...
1,mvs-org/metaverse,C++,Metaverse Core Integration/staging Tree\n=====...
2,webaverse/app,JavaScript,"<img src=""docs/banner.jpeg"" width=100% />\n\n<..."
3,shadowcz007/awesome-metaverse,,"# awesome-metaverse\n<a href=""https://awesome...."
4,vircadia/vircadia,C++,"<p align=""center""><a href=""https://vircadia.co..."


In [12]:
# using the prepare module's mass_text_clean function on every README
# NOTE: this overwrites readme_contents, so the raw text is only recoverable by re-loading the CSV

df["readme_contents"] = df["readme_contents"].apply(prepare.mass_text_clean)
df.head() # preview the cleaned README text

Unnamed: 0,repo,language,readme_contents
0,M3-org/awesome-metaverse,,awesome awesome awesome badge svg awesome list...
1,mvs-org/metaverse,C++,core integration staging tree build status tra...
2,webaverse/app,JavaScript,img src doc banner jpeg width 100 align center...
3,shadowcz007/awesome-metaverse,,awesome href awesome target blank img alt awes...
4,vircadia/vircadia,C++,align center href vircadia img src interface r...
