In [6]:
# notebook dependencies 
import os # used in caching
import pandas as pd
import numpy as np

# visualization imports
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# regular expression import
import re

# JSON import
import json

# importing BeautifulSoup for parsing HTML/XML
from bs4 import BeautifulSoup as BSoup

# request module for connecting to APIs
from requests import get

# text prepare modules
import acquire
import prepare

# Unicode library
import unicodedata

# natural language toolkit library/modules
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

#sklearn
from sklearn.model_selection import train_test_split

# Wrangling

### Data Acquisition

In [2]:
# Reading in data: load the local 'metaverse.csv' file into a DataFrame
df = pd.read_csv('metaverse.csv')

In [3]:
# Show the freshly loaded DataFrame as this cell's output
df

Unnamed: 0,repo,language,readme_contents
0,M3-org/awesome-metaverse,,# Awesome Metaverse [![Awesome](https://awesom...
1,mvs-org/metaverse,C++,Metaverse Core Integration/staging Tree\n=====...
2,webaverse/app,JavaScript,"<img src=""docs/banner.jpeg"" width=100% />\n\n<..."
3,shadowcz007/awesome-metaverse,,"# awesome-metaverse\n<a href=""https://awesome...."
4,vircadia/vircadia,C++,"<p align=""center""><a href=""https://vircadia.co..."
...,...,...,...
993,LinasKo/MetaVerse,C#,# MetaVerse\nVirtual Conference project for Me...
994,saiva11/Metaverse,Vue,
995,dimplehh/metaverse,,# 맥스트 VPS SDK 사용\n\n## VPS 사용\n\nAssets\MaxstA...
996,niksanvijan/MetaVerse,,


In [4]:
# Looking at the messy, not-yet-cleaned readme_contents column
df.readme_contents

0      # Awesome Metaverse [![Awesome](https://awesom...
1      Metaverse Core Integration/staging Tree\n=====...
2      <img src="docs/banner.jpeg" width=100% />\n\n<...
3      # awesome-metaverse\n<a href="https://awesome....
4      <p align="center"><a href="https://vircadia.co...
                             ...                        
993    # MetaVerse\nVirtual Conference project for Me...
994                                                  NaN
995    # 맥스트 VPS SDK 사용\n\n## VPS 사용\n\nAssets\MaxstA...
996                                                  NaN
997                                                  NaN
Name: readme_contents, Length: 998, dtype: object

In [5]:
# Count the missing values in each column
df.isna().sum()

repo                 0
language           316
readme_contents    257
dtype: int64

### Data Preparation - Cleaning

In [7]:
# Calling prepare function to clean dataframe.
# clean_data_objects comes from the imported `prepare` module; its body is not
# visible in this notebook.
# NOTE(review): `df` is rebound to the cleaned result, so re-running this cell on
# its own feeds already-cleaned data back into the helper — re-run from the
# read_csv cell to start from the raw data.
df = prepare.clean_data_objects(df)
df.head()

df shape: (998, 3)


Unnamed: 0,repo,language,readme_contents
0,M3-org/awesome-metaverse,,# Awesome Metaverse [![Awesome](https://awesom...
1,mvs-org/metaverse,C++,Metaverse Core Integration/staging Tree\n=====...
2,webaverse/app,JavaScript,"<img src=""docs/banner.jpeg"" width=100% />\n\n<..."
3,shadowcz007/awesome-metaverse,,"# awesome-metaverse\n<a href=""https://awesome...."
4,vircadia/vircadia,C++,"<p align=""center""><a href=""https://vircadia.co..."


In [8]:
# Calling mass_text_clean function to clean readme content:
# it is applied to every readme_contents value and the results are written back
# into the same column (the raw text is overwritten).
# NOTE(review): 257 readme_contents values were null in the raw data (see the
# isnull() counts earlier in the notebook) — confirm how the prepare helpers
# treat those rows.
df["readme_contents"] = df["readme_contents"].apply(prepare.mass_text_clean)
df.head()

Unnamed: 0,repo,language,readme_contents
0,M3-org/awesome-metaverse,,awesome awesome awesome badge svg awesome list...
1,mvs-org/metaverse,C++,core integration staging tree build status tra...
2,webaverse/app,JavaScript,img src doc banner jpeg width 100 align center...
3,shadowcz007/awesome-metaverse,,awesome href awesome target blank img alt awes...
4,vircadia/vircadia,C++,align center href vircadia img src interface r...


In [9]:
# Count rows per distinct value of the language column.
# NOTE(review): value_counts() drops real NaN by default, yet the recorded output
# lists a 'nan' entry with 316 rows (the same number of languages that were null
# in the raw data). Presumably the missing languages became the literal string
# 'nan' during cleaning — confirm in prepare.clean_data_objects.
df.language.value_counts()

nan                 316
JavaScript          266
C#                   78
HTML                 72
TypeScript           70
Solidity             27
CSS                  23
Python               22
Rust                 16
Go                   15
C++                  14
Jupyter Notebook     11
Java                  7
ShaderLab             7
SCSS                  6
Vue                   5
Shell                 5
PHP                   5
SuperCollider         3
Clojure               3
Batchfile             2
Roff                  2
Swift                 2
C                     2
GLSL                  2
GDScript              2
Objective-C           2
Kotlin                2
HCL                   1
EJS                   1
PowerShell            1
Svelte                1
TeX                   1
Objective-C++         1
Cadence               1
Metal                 1
R                     1
LSL                   1
ASP.NET               1
Name: language, dtype: int64