# **Prepare**

In [1]:
# work used for cleansing the acquire text data and preping it for exploration.

In [2]:
# imports
import json
import pandas as pd
import prepare as p

In [3]:
# Load the REPOS list from the JSON file
file_name = 'repo_processed.json'

with open(file_name, 'r') as file:
    repos = json.load(file)

In [4]:
# Here we're using pandas to produce datframe
repos_df = pd.DataFrame(repos)
repos_df

Unnamed: 0,repo,language,readme_contents
0,AtsushiSakai/PythonRobotics,Python,"<img src=""https://github.com/AtsushiSakai/Pyth..."
1,kiloreux/awesome-robotics,,Awesome Robotics\n================\n\n[![Aweso...
2,NxRLab/ModernRobotics,Python,"# Modern Robotics: Mechanics, Planning, and C..."
3,mithi/robotics-coursework,,# [🐳](https://mithi.github.io/deep-blueberry) ...
4,onlytailei/CppRobotics,C++,# CppRobotics\n\nThis is the cpp implementatio...
...,...,...,...
672,CatchZeng/dingtalk,Go,# dingtalk\n\n![Go](https://github.com/CatchZe...
673,1c7/Translate-Subtitle-File,,## 字幕组机翻小助手 - [视频演示](https://www.bilibili.com/...
674,landy22granatt/Kumpulan-Script-Termux,Python,pkg update && pkg upgrade\npkg install bash\np...
675,cirosantilli/china-dictatorship,HTML,"<style>\npre{ white-space:pre }\n#header,#cont..."


  # Investigation

In [5]:
language_df = p.prep_text_data(repos_df,
                               'language', 
                               extra_words = [
                                    'C++',
                                    'Python',
                                    'JavaScript',
                                    'C',
                                    'Java',
                                    'TypeScript',
                                    'C#',
                                    'Rust',
                                    'Ruby',
                                    'Kotlin',
                                    'Julia',
                                    'PHP',
                                    'Swift',
                                    'Common Lisp',
                                    'Scheme',
                                    'Dart',
                                    'Scala',
                                    'Objective-C',
                                    'Lua',
                                    'Prolog'
                               ], 
                               exclude_words = [
                                    "Jupyter Notebook",
                                    "HTML",
                                    "Go",
                                    "Shell",
                                    "TeX",
                                    "CMake",
                                    "Dockerfile",
                                    "OpenSCAD",
                                    "MQL5",
                                    "Vue",
                                    "Smali",
                                    "Matlab",
                                    "RobotFramework",
                                    "Cuda",
                                    "SCSS"
                                ])
                                                                                                 
language_df.head()

Unnamed: 0,repo,language,clean_readme,stemmed,lemmatized
0,AtsushiSakai/PythonRobotics,Python,ython,ython,ython
1,kiloreux/awesome-robotics,,,,
2,NxRLab/ModernRobotics,Python,ython,ython,ython
3,mithi/robotics-coursework,,,,
4,onlytailei/CppRobotics,C++,,,


In [6]:
language_df.language.value_counts()

language
C++                 208
Python              201
JavaScript           35
C                    19
Jupyter Notebook     19
Java                 14
TypeScript           14
C#                   13
HTML                 12
Go                   12
Rust                 10
MATLAB                9
Shell                 6
TeX                   4
Ruby                  4
CMake                 4
Kotlin                4
Julia                 3
PHP                   3
Dockerfile            3
Swift                 2
OpenSCAD              2
MQL5                  1
Common Lisp           1
Scheme                1
Vue                   1
Smali                 1
Dart                  1
Matlab                1
Scala                 1
RobotFramework        1
Objective-C           1
Lua                   1
Prolog                1
Cuda                  1
SCSS                  1
Name: count, dtype: int64

In [7]:
# lem and unlem check for ou and html tags

In [8]:
# Load the REPOS list from the JSON file
with open(file_name, 'r') as file:
    repo_loaded = json.load(file)
    
repos_df = pd.DataFrame(repo_loaded)

repos_df

Unnamed: 0,repo,language,readme_contents
0,AtsushiSakai/PythonRobotics,Python,"<img src=""https://github.com/AtsushiSakai/Pyth..."
1,kiloreux/awesome-robotics,,Awesome Robotics\n================\n\n[![Aweso...
2,NxRLab/ModernRobotics,Python,"# Modern Robotics: Mechanics, Planning, and C..."
3,mithi/robotics-coursework,,# [🐳](https://mithi.github.io/deep-blueberry) ...
4,onlytailei/CppRobotics,C++,# CppRobotics\n\nThis is the cpp implementatio...
...,...,...,...
672,CatchZeng/dingtalk,Go,# dingtalk\n\n![Go](https://github.com/CatchZe...
673,1c7/Translate-Subtitle-File,,## 字幕组机翻小助手 - [视频演示](https://www.bilibili.com/...
674,landy22granatt/Kumpulan-Script-Termux,Python,pkg update && pkg upgrade\npkg install bash\np...
675,cirosantilli/china-dictatorship,HTML,"<style>\npre{ white-space:pre }\n#header,#cont..."


# Johns Process Dataframe function (edited) check for odd words in result

In [9]:
import prepare as p
# Function to apply cleaning and processing functions from prepare.py
def process_dataframe(df, extra_words= [], exclude_words= []):
    # Create a new column 'original' and assign the values from 'content'
    df['original'] = df['readme_contents']
    
    # Apply the basic_clean function to 'original', then tokenize the result, and remove stopwords
    df['clean'] = df['original'].apply(p.basic_clean).apply(p.tokenize)
    
    df['remove_stopwords'] = df['clean'].apply(lambda x: p.remove_stopwords(x, extra_words, exclude_words))
    
    # Apply the stem function to 'clean' column
    df['stemmed'] = df['remove_stopwords'].apply(p.stem)
    
    # Apply the lemmatize function to 'clean' column
    df['lemmatized'] = df['remove_stopwords'].apply(p.lemmatize)
    
    # Drop the 'content' column from the dataframe
    df = df.drop(columns='readme_contents', axis=1)
    
    # Return the modified dataframe
    return df

In [10]:
# Run the cleaning script
extra_words = []
exclude_words = []
repos_df = process_dataframe(repos_df, extra_words, exclude_words)
# rename to un_lemmatized
repos_df['un_lemmatized'] = repos_df['remove_stopwords']
# get rid of the unnecessary processed columns 
repos_df = repos_df.drop(columns=['original', 'clean', 'remove_stopwords', 'stemmed'])#, 'lemmatized'])
# Remove repos without a readme
repos_df = repos_df[repos_df['un_lemmatized'] != 'failtoloadreadme']
# Split lemmatized text and stack it into a DataFrame
all_words_lemmatized = repos_df.lemmatized.str.split(expand=True).stack()
all_words_un_lemmatized = repos_df.un_lemmatized.str.split(expand=True).stack()
# combine the two pd series we created in the step before into a single df
all_words_df = pd.DataFrame({
    'lemmatized': all_words_lemmatized,
    'un_lemmatized': all_words_un_lemmatized
})

# filter the df by lemmatized
filtered_rows = all_words_df[all_words_df.lemmatized == 'ou']
filtered_rows

Unnamed: 0,Unnamed: 1,lemmatized,un_lemmatized
0,238,ou,ou
0,1040,ou,ou
0,1101,ou,ou
8,459,ou,ou
9,244,ou,ou
...,...,...,...
663,972,ou,ou
669,193,ou,ou
669,244,ou,ou
670,307,ou,ou


In [11]:
len(filtered_rows)

865

  # Cleaning

In [12]:
file_name = 'repo_processed.json'

with open(file_name, 'r') as file:
    repos = json.load(file)

In [13]:
import prepare as p

repos_df = pd.DataFrame(repos)

# Apply the clean function element-wise to the 'readme_contents' column
repos_df['clean_readme'] = repos_df['readme_contents'].apply(lambda x: p.clean(x, extra_stopwords=[]))
# Filter rows with 'readme_contents' not equal to 'FailToLoadReadME'
repos_df = repos_df[repos_df['readme_contents'] != 'FailToLoadReadME']

# Drop the 'readme_contents' column
repos_df.drop(columns='readme_contents', inplace=True)

# Filter rows where 'language' is not NaN and reset the index
repos_df = repos_df[repos_df.language.notna()].reset_index(drop=True)

In [14]:
repos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   repo          611 non-null    object
 1   language      611 non-null    object
 2   clean_readme  611 non-null    object
dtypes: object(3)
memory usage: 14.4+ KB


In [15]:
repos_df.head()

Unnamed: 0,repo,language,clean_readme
0,AtsushiSakai/PythonRobotics,Python,img srchttpsgithubcomtsushiakaiythonoboticsraw...
1,NxRLab/ModernRobotics,Python,odern obotics echanics lanning ontrol ode ibra...
2,onlytailei/CppRobotics,C++,ppobotics cpp implementation ythonoboticshttps...
3,JdeRobot/RoboticsAcademy,JavaScript,hrefhttpsjderobotgithubioimg srcimglogogif wid...
4,pptacher/probabilistic_robotics,C++,probabilisticrobotics working detailed solutio...


In [16]:
# repos_dfiloc[10.clean_readme].

In [17]:
from collections import Counter

# Analyze the 'clean_readme' column to identify common words
common_words = ' '.join(repos_df['clean_readme']).split()
word_counts = Counter(common_words)

In [18]:
word_counts

Counter({'use': 1668,
         'using': 1595,
         'robot': 1487,
         'img': 1467,
         'install': 1453,
         'run': 1136,
         'f': 1126,
         'data': 1108,
         'see': 1105,
         'n': 1034,
         'e': 953,
         'code': 951,
         'build': 928,
         'p': 855,
         'bash': 839,
         'ou': 831,
         'used': 822,
         'earning': 809,
         'also': 793,
         'file': 779,
         'uild': 766,
         'ariv': 756,
         'python': 752,
         'following': 752,
         'cd': 734,
         'obotics': 714,
         'robots': 711,
         'project': 709,
         'obot': 687,
         'source': 633,
         'environment': 632,
         'set': 600,
         'model': 599,
         'software': 596,
         'example': 589,
         'ython': 588,
         'make': 581,
         'library': 578,
         'version': 574,
         'new': 574,
         'package': 566,
         'aligncenter': 561,
         'pen': 550,
         

In [19]:
common_words

['img',
 'srchttpsgithubcomtsushiakaiythonoboticsrawmastericonpngrawtrue',
 'alignright',
 'width300',
 'altheader',
 'pic',
 'ythonobotics',
 'itubctioninuxhttpsgithubcomtsushiakaiythonoboticsworkflowsinuxbadgesvg',
 'itubctionachttpsgithubcomtsushiakaiythonoboticsworkflowsacbadgesvg',
 'itubctionindowshttpsgithubcomtsushiakaiythonoboticsworkflowsindowsbadgesvg',
 'uild',
 'statushttpsciappveyorcomapiprojectsstatussb279kxuv1be391gsvgtruehttpsciappveyorcomprojecttsushiakaipythonrobotics',
 'codecovhttpscodecovioghtsushiakaiythonoboticsbranchmastergraphbadgesvghttpscodecovioghtsushiakaiythonobotics',
 'ython',
 'codes',
 'robotics',
 'algorithm',
 'able',
 'ontents',
 'hat',
 'thiswhatisthis',
 'equirementsrequirements',
 'ocumentationdocumentation',
 'ow',
 'usehowtouse',
 'ocalizationlocalization',
 'xtended',
 'alman',
 'ilter',
 'localizationextendedkalmanfilterlocalization',
 'article',
 'filter',
 'localizationparticlefilterlocalization',
 'istogram',
 'filter',
 'localizationhist

In [20]:

# Filter for words with lengths between 4 and 10 characters
common_words_3_to_10 = [word for word, count in word_counts.items() if 3 <= len(word) <= 10]

# Create a list of the most common words with lengths between 4 and 10 characters
most_common_words_3_to_10 = [(word, count) for word, count in word_counts.items() if 3 <= len(word) <= 10]

# Sort the list by count in descending order
most_common_words_3_to_10.sort(key=lambda x: x[1], reverse=True)

# Return the most common words with lengths between 4 and 10 characters
most_common_words_3_to_10


[('use', 1668),
 ('using', 1595),
 ('robot', 1487),
 ('img', 1467),
 ('install', 1453),
 ('run', 1136),
 ('data', 1108),
 ('see', 1105),
 ('code', 951),
 ('build', 928),
 ('bash', 839),
 ('used', 822),
 ('earning', 809),
 ('also', 793),
 ('file', 779),
 ('uild', 766),
 ('ariv', 756),
 ('python', 752),
 ('following', 752),
 ('obotics', 714),
 ('robots', 711),
 ('project', 709),
 ('obot', 687),
 ('source', 633),
 ('set', 600),
 ('model', 599),
 ('software', 596),
 ('example', 589),
 ('ython', 588),
 ('make', 581),
 ('library', 578),
 ('version', 574),
 ('new', 574),
 ('package', 566),
 ('pen', 550),
 ('control', 540),
 ('need', 535),
 ('system', 528),
 ('work', 524),
 ('based', 521),
 ('image', 519),
 ('please', 506),
 ('repository', 500),
 ('via', 493),
 ('one', 489),
 ('sudo', 487),
 ('like', 472),
 ('support', 472),
 ('files', 471),
 ('oint', 469),
 ('eural', 451),
 ('default', 448),
 ('time', 440),
 ('git', 435),
 ('available', 429),
 ('test', 419),
 ('command', 413),
 ('point', 411)

## Find a way to create a  datframe with this text data nd seperate to explore

In [21]:
# Load the REPOS list from the JSON file
with open(file_name, 'r') as file:
    repo_loaded = json.load(file)
    
repos_df = pd.DataFrame(repo_loaded)

# Run the cleaning script
extra_words = []
exclude_words = []
repos_df = process_dataframe(repos_df, extra_words, exclude_words)
# rename to un_lemmatized
repos_df['un_lemmatized'] = repos_df['remove_stopwords']
# get rid of the unnecessary processed columns 
repos_df = repos_df.drop(columns=['original', 'clean', 'remove_stopwords', 'stemmed'])#, 'lemmatized'])
# Remove repos without a readme
repos_df = repos_df[repos_df['un_lemmatized'] != 'failtoloadreadme']
# Split lemmatized text and stack it into a DataFrame
all_words_lemmatized = repos_df.lemmatized.str.split(expand=True).stack()
all_words_un_lemmatized = repos_df.un_lemmatized.str.split(expand=True).stack()
# combine the two pd series we created in the step before into a single df
all_words_df = pd.DataFrame({
    'lemmatized': all_words_lemmatized,
    'un_lemmatized': all_words_un_lemmatized
})

# filter the df by lemmatized
filtered_rows = all_words_df[all_words_df.lemmatized == 'ou']
filtered_rows

Unnamed: 0,Unnamed: 1,lemmatized,un_lemmatized
0,238,ou,ou
0,1040,ou,ou
0,1101,ou,ou
8,459,ou,ou
9,244,ou,ou
...,...,...,...
663,972,ou,ou
669,193,ou,ou
669,244,ou,ou
670,307,ou,ou


In [22]:
len(filtered_rows)

865

In [25]:
## then categorize

In [None]:
ham_words = clean(' '.join(df[df.label == 'ham'].text))
spam_words = clean(' '.join(df[df.label == 'spam'].text))
all_words = clean(' '.join(df.text)) 