# Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder



from sklearn.metrics import classification_report as class_rep
from prepare import clean, lemmatize

In [2]:
# reading in file to generate dataframe

In [3]:
# Load the repository table from the CSV in the working directory.
repos_df = pd.read_csv('github_repos.csv')

# Preview the frame without the `repo` column. The result is only displayed,
# never assigned, so `repos_df` itself still carries `repo`.
repos_df.drop(columns='repo')

Unnamed: 0,language,text
0,Python,"<img src=""https://github.com/AtsushiSakai/Pyth..."
1,Python,"# Modern Robotics: Mechanics, Planning, and C..."
2,C++,# CppRobotics\n\nThis is the cpp implementatio...
3,Other,"<a href=""https://jderobot.github.io/""><img src..."
4,C++,# probabilistic_robotics\nI am working on deta...
...,...,...
536,Other,"<div align=center>\n\t<img src=""readme-img/cov..."
537,Other,"<h1 align=""center"" style=""font-family:Papyrus;..."
538,Python,"<div align=""center"">\n\n<img src=""figs/logo.pn..."
539,Other,# Compose Actors :dancer:\n\n## Roadmap v0.3.0...


In [4]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
repos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   repo      541 non-null    object
 1   language  541 non-null    object
 2   text      541 non-null    object
dtypes: object(3)
memory usage: 12.8+ KB


In [5]:
# Run the project's `clean` helper (imported from prepare) over every README
# and store the result next to the raw text.
repos_df['clean_text'] = repos_df['text'].apply(clean)

# Spot-check the first rows of the new column.
repos_df.head()

Unnamed: 0,repo,language,text,clean_text
0,AtsushiSakai/PythonRobotics,Python,"<img src=""https://github.com/AtsushiSakai/Pyth...","[img, srchttpsgithubcomatsushisakaipythonrobot..."
1,NxRLab/ModernRobotics,Python,"# Modern Robotics: Mechanics, Planning, and C...","[modern, robotics, mechanic, planning, control..."
2,onlytailei/CppRobotics,C++,# CppRobotics\n\nThis is the cpp implementatio...,"[cpprobotics, cpp, implementation, pythonrobot..."
3,JdeRobot/RoboticsAcademy,Other,"<a href=""https://jderobot.github.io/""><img src...","[hrefhttpsjderobotgithubioimg, srcimglogogif, ..."
4,pptacher/probabilistic_robotics,C++,# probabilistic_robotics\nI am working on deta...,"[probabilistic_robotics, working, detailed, so..."


In [6]:
# Run the project's `lemmatize` helper (imported from prepare) over the raw
# README text; this column is what the models below are trained on.
# NOTE(review): the input is the raw `text` column, not `clean_text`, and the
# preview still shows HTML/markdown in `lemmas` -- confirm this is intended.
repos_df['lemmas'] = repos_df['text'].apply(lemmatize)

# Spot-check the first rows of the new column.
repos_df.head()

Unnamed: 0,repo,language,text,clean_text,lemmas
0,AtsushiSakai/PythonRobotics,Python,"<img src=""https://github.com/AtsushiSakai/Pyth...","[img, srchttpsgithubcomatsushisakaipythonrobot...","<img src=""https://github.com/AtsushiSakai/Pyth..."
1,NxRLab/ModernRobotics,Python,"# Modern Robotics: Mechanics, Planning, and C...","[modern, robotics, mechanic, planning, control...","# Modern Robotics: Mechanics, Planning, and Co..."
2,onlytailei/CppRobotics,C++,# CppRobotics\n\nThis is the cpp implementatio...,"[cpprobotics, cpp, implementation, pythonrobot...",# CppRobotics This is the cpp implementation o...
3,JdeRobot/RoboticsAcademy,Other,"<a href=""https://jderobot.github.io/""><img src...","[hrefhttpsjderobotgithubioimg, srcimglogogif, ...","<a href=""https://jderobot.github.io/""><img src..."
4,pptacher/probabilistic_robotics,C++,# probabilistic_robotics\nI am working on deta...,"[probabilistic_robotics, working, detailed, so...",# probabilistic_robotics I am working on detai...


In [7]:
# Disabled row filter, kept for reference only; nothing in this cell executes.
# As written it would keep rows whose `lemmas` value has a length between 2 and
# 12 and is not equal to 'ro'.
# NOTE(review): `lemmas` holds whole-document strings in this notebook, so
# `.str.len()` would be a character count -- confirm the intent before
# re-enabling.
# repos_df = repos_df[
#     (repos_df['lemmas'].str.len() >= 2) &
#     (repos_df['lemmas'].str.len() <= 12) &
#     (repos_df['lemmas'] != 'ro')
# ]

In [8]:
# Display the frame to check the newly added `clean_text` and `lemmas` columns.
repos_df

Unnamed: 0,repo,language,text,clean_text,lemmas
0,AtsushiSakai/PythonRobotics,Python,"<img src=""https://github.com/AtsushiSakai/Pyth...","[img, srchttpsgithubcomatsushisakaipythonrobot...","<img src=""https://github.com/AtsushiSakai/Pyth..."
1,NxRLab/ModernRobotics,Python,"# Modern Robotics: Mechanics, Planning, and C...","[modern, robotics, mechanic, planning, control...","# Modern Robotics: Mechanics, Planning, and Co..."
2,onlytailei/CppRobotics,C++,# CppRobotics\n\nThis is the cpp implementatio...,"[cpprobotics, cpp, implementation, pythonrobot...",# CppRobotics This is the cpp implementation o...
3,JdeRobot/RoboticsAcademy,Other,"<a href=""https://jderobot.github.io/""><img src...","[hrefhttpsjderobotgithubioimg, srcimglogogif, ...","<a href=""https://jderobot.github.io/""><img src..."
4,pptacher/probabilistic_robotics,C++,# probabilistic_robotics\nI am working on deta...,"[probabilistic_robotics, working, detailed, so...",# probabilistic_robotics I am working on detai...
...,...,...,...,...,...
536,Skythinker616/foc-wheel-legged-robot,Other,"<div align=center>\n\t<img src=""readme-img/cov...","[div, aligncenter, img, srcreadmeimgcoverjpg, ...","<div align=center> <img src=""readme-img/cover...."
537,Sollimann/bonsai,Other,"<h1 align=""center"" style=""font-family:Papyrus;...","[h1, aligncenter, stylefontfamilypapyrus, font...","<h1 align=""center"" style=""font-family:Papyrus;..."
538,zjunlp/EasyInstruct,Python,"<div align=""center"">\n\n<img src=""figs/logo.pn...","[div, aligncenter, img, srcfigslogopng, width3...","<div align=""center""> <img src=""figs/logo.png"" ..."
539,RajashekarRaju/compose-actors,Other,# Compose Actors :dancer:\n\n## Roadmap v0.3.0...,"[compose, actor, dancer, roadmap, v030, x, let...",# Compose Actors :dancer: ## Roadmap v0.3.0 - ...


In [9]:
# Features are the `lemmas` strings; the target is the language label.
X = repos_df['lemmas']
y = repos_df['language']

# 70/30 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [10]:
# Show the feature Series for all rows (i.e. before the train/test split).
X

0      <img src="https://github.com/AtsushiSakai/Pyth...
1      # Modern Robotics: Mechanics, Planning, and Co...
2      # CppRobotics This is the cpp implementation o...
3      <a href="https://jderobot.github.io/"><img src...
4      # probabilistic_robotics I am working on detai...
                             ...                        
536    <div align=center> <img src="readme-img/cover....
537    <h1 align="center" style="font-family:Papyrus;...
538    <div align="center"> <img src="figs/logo.png" ...
539    # Compose Actors :dancer: ## Roadmap v0.3.0 - ...
540    pkg update && pkg upgrade pkg install bash pkg...
Name: lemmas, Length: 541, dtype: object

In [11]:
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf = TfidfVectorizer()

# Fit the vectorizer on the training documents only, then reuse that fitted
# vectorizer to transform the test documents.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Peek at the first ten rows of the sparse training matrix.
X_train_tfidf[:10]

<10x42504 sparse matrix of type '<class 'numpy.float64'>'
	with 10809 stored elements in Compressed Sparse Row format>

In [12]:
# Baseline: accuracy (in percent) of always predicting the most frequent
# training label.
majority_count = y_train.value_counts().max()
baseline_acc = majority_count / len(y_train) * 100

print(f'baseline: {round(baseline_acc, 2)}.')

baseline: 37.3.


In [13]:
# Dense view of the training TF-IDF matrix, one column per vocabulary term.
# `.toarray()` yields a plain ndarray instead of the discouraged `np.matrix`
# that `.todense()` returns; the resulting DataFrame holds the same values.
pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,00,000,0000,00000,000000,00000006,0000000rgba,0000000rgbb,000000s,0000048rgb,...,ｓｏｆｔｗａｒｅ,ｔｈｅ,ｔｈｉｓ,ｔｏ,ｕｎｗｉｓｅ,ｕｓｅ,ｗｅｂｓｉｔｅ,ｽﾀｯｸﾁｬﾝ,𝑅𝑖𝑚,𝑅𝑟𝑒
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005612,0.005612
374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


# Logistic Regression Model

In [14]:
# Logistic regression with scikit-learn's default settings.
lm = LogisticRegression()

# Train on the TF-IDF features of the training split.
lm.fit(X_train_tfidf, y_train)

In [15]:
# Pair each training label with the logistic regression's prediction for it.
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=lm.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
428,Python,Python
370,C++,C++
57,Python,Python
332,C++,C++
24,C++,C++


In [16]:
# Per-class precision/recall/F1 on the training split, using whatever
# `y_train_res` currently holds (the logistic regression frame built above).
print(class_rep(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

         C++       0.94      0.99      0.97       141
       Other       1.00      0.93      0.96        97
      Python       0.98      0.97      0.97       140

    accuracy                           0.97       378
   macro avg       0.97      0.96      0.97       378
weighted avg       0.97      0.97      0.97       378



# Logistic Regression Test

In [17]:
# Pair each held-out label with the logistic regression's prediction for it.
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=lm.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
229,Other,Other
73,C++,C++
352,Python,Python
86,C++,C++
470,Python,C++


In [18]:
# Per-class precision/recall/F1 on the test split, using whatever
# `y_test_res` currently holds (the logistic regression frame built above).
print(class_rep(y_test_res.actual, y_test_res.preds))

              precision    recall  f1-score   support

         C++       0.69      0.79      0.74        67
       Other       0.76      0.44      0.56        36
      Python       0.71      0.77      0.74        60

    accuracy                           0.71       163
   macro avg       0.72      0.67      0.68       163
weighted avg       0.71      0.71      0.70       163



# KNN Classifier Model

In [19]:
# K-nearest-neighbours classifier with scikit-learn's default settings.
knn = KNeighborsClassifier()

# Train on the TF-IDF features of the training split.
knn.fit(X_train_tfidf, y_train)

In [20]:
# Pair each training label with the KNN prediction for it.
# This rebinds `y_train_res`, replacing the previous model's frame.
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=knn.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
428,Python,C++
370,C++,C++
57,Python,Python
332,C++,C++
24,C++,C++


In [21]:
# Per-class precision/recall/F1 on the training split, using whatever
# `y_train_res` currently holds (the KNN frame built above).
print(class_rep(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

         C++       0.62      0.88      0.73       141
       Other       0.78      0.41      0.54        97
      Python       0.82      0.74      0.78       140

    accuracy                           0.71       378
   macro avg       0.74      0.68      0.68       378
weighted avg       0.74      0.71      0.70       378



# KNN test

In [22]:
# Pair each held-out label with the KNN prediction for it.
# This rebinds `y_test_res`, replacing the previous model's frame.
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=knn.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
229,Other,Other
73,C++,C++
352,Python,Python
86,C++,C++
470,Python,C++


In [23]:
# Per-class precision/recall/F1 on the test split, using whatever
# `y_test_res` currently holds (the KNN frame built above).
print(class_rep(y_test_res.actual, y_test_res.preds))

              precision    recall  f1-score   support

         C++       0.63      0.76      0.69        67
       Other       0.56      0.28      0.37        36
      Python       0.61      0.65      0.63        60

    accuracy                           0.61       163
   macro avg       0.60      0.56      0.56       163
weighted avg       0.61      0.61      0.60       163



# Random Forest Classifier Model

In [24]:
# RandomForestClassifier is already imported in the notebook's first cell, so
# the import is not repeated here.
# Seed the forest (same seed as the train/test split) so its randomised tree
# construction, and therefore the scores reported below, are repeatable
# across kernel restarts.
rf = RandomForestClassifier(random_state=42)

# Train on the TF-IDF features of the training split.
rf.fit(X_train_tfidf, y_train)

In [25]:
# Pair each training label with the random forest's prediction for it.
# This rebinds `y_train_res`, replacing the previous model's frame.
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=rf.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
428,Python,Python
370,C++,C++
57,Python,Python
332,C++,C++
24,C++,C++


In [26]:
# Per-class precision/recall/F1 on the training split, using whatever
# `y_train_res` currently holds (the random forest frame built above).
print(class_rep(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

         C++       1.00      1.00      1.00       141
       Other       1.00      1.00      1.00        97
      Python       1.00      1.00      1.00       140

    accuracy                           1.00       378
   macro avg       1.00      1.00      1.00       378
weighted avg       1.00      1.00      1.00       378



In [27]:
# Pair each held-out label with the random forest's prediction for it.
# This rebinds `y_test_res`, replacing the previous model's frame.
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=rf.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
229,Other,Other
73,C++,C++
352,Python,Python
86,C++,C++
470,Python,C++


In [28]:
# Per-class precision/recall/F1 on the test split, using whatever
# `y_test_res` currently holds (the random forest frame built above).
print(class_rep(y_test_res.actual, y_test_res.preds))

              precision    recall  f1-score   support

         C++       0.72      0.87      0.78        67
       Other       0.83      0.53      0.64        36
      Python       0.75      0.73      0.74        60

    accuracy                           0.74       163
   macro avg       0.76      0.71      0.72       163
weighted avg       0.75      0.74      0.74       163



# XGBClassifier Model

In [37]:
# LabelEncoder is already imported in the notebook's first cell, so the import
# is not repeated here.
# The language names are mapped to integer codes 0..n_classes-1 before being
# handed to XGBClassifier.
label_encoder = LabelEncoder()

X = repos_df.lemmas
y = repos_df.language

# Fit the encoder on the class labels and transform them to integer codes.
y_encoded = label_encoder.fit_transform(y)

# Same train_size and seed as the earlier split, but with encoded labels.
# NOTE: this rebinds X_train / X_test / y_train / y_test. From here on y_train
# and y_test hold integer codes, so the evaluation cells of the earlier models
# must not be re-run after this cell without re-running the original split.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, train_size=0.7, random_state=42)

# Re-fit the vectorizer on the training documents only; this also rebinds
# tfidf / X_train_tfidf / X_test_tfidf.
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Deliberately tiny booster: two trees of depth two with learning rate 1,
# producing per-class probabilities.
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='multi:softprob', num_class=len(label_encoder.classes_))

# Fit the XGBoost model on the training data.
bst.fit(X_train_tfidf, y_train)

# Predict integer class codes on the test data.
preds = bst.predict(X_test_tfidf)

# Decode the predicted codes back to the original language names.
preds_decoded = label_encoder.inverse_transform(preds)


In [38]:
# Pair each training label with the XGBoost prediction for it.
# Both columns hold the label encoder's integer class codes.
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=bst.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
0,2,2
1,0,1
2,2,2
3,0,0
4,0,0


In [39]:
# Report with the original language names instead of the encoder's integer
# codes, so the table reads like the other models' reports. `labels` pins the
# row order to the encoder's class order, which `target_names` must match.
print(class_rep(y_train_res.actual, y_train_res.preds,
                labels=np.arange(len(label_encoder.classes_)),
                target_names=label_encoder.classes_))

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       141
           1       0.70      0.70      0.70        97
           2       0.86      0.80      0.83       140

    accuracy                           0.78       378
   macro avg       0.77      0.77      0.77       378
weighted avg       0.78      0.78      0.78       378



In [40]:
# Pair each held-out label with the XGBoost prediction for it.
# Both columns hold the label encoder's integer class codes.
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=bst.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
0,1,1
1,0,0
2,2,2
3,0,1
4,2,0


In [41]:
# Report with the original language names instead of the encoder's integer
# codes, so the table reads like the other models' reports. `labels` pins the
# row order to the encoder's class order, which `target_names` must match.
print(class_rep(y_test_res.actual, y_test_res.preds,
                labels=np.arange(len(label_encoder.classes_)),
                target_names=label_encoder.classes_))

              precision    recall  f1-score   support

           0       0.77      0.73      0.75        67
           1       0.68      0.75      0.71        36
           2       0.80      0.78      0.79        60

    accuracy                           0.75       163
   macro avg       0.75      0.75      0.75       163
weighted avg       0.76      0.75      0.76       163

