# Modeling

In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder



from sklearn.metrics import classification_report as class_rep
from prepare import clean, lemmatize

In [2]:
# reading in file to generate dataframe

In [3]:
# Load the preprocessed repositories, using the first CSV column as the index.
repos_df = pd.read_csv('processed_repos.csv', index_col=0)

# Preview only: the result of drop() is displayed, not assigned, so the
# 'repo' column is still present in repos_df afterwards.
repos_df.drop(columns='repo')

Unnamed: 0,language,text
0,Python,pythonrobotics githubactionlinuxci githubactio...
1,Other,awesome robotics awesome list various book cou...
2,Python,modern robotics mechanic planning control code...
3,Other,want use arduino raspberry pi make robot short...
4,C++,cpprobotics cpp implementation pythonrobotics ...
...,...,...
670,Other,compose actor dancer roadmap v030 x let user s...
671,Other,failtoloadreadme
673,Other,srt vtt api key api 100 api 510 2023130 420 20...
674,Python,pkg update pkg upgrade pkg install bash pkg in...


In [4]:
# read_csv already returns a DataFrame, so it is not re-wrapped here;
# just inspect column dtypes and non-null counts.
repos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 609 entries, 0 to 676
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   repo      609 non-null    object
 1   language  609 non-null    object
 2   text      609 non-null    object
dtypes: object(3)
memory usage: 19.0+ KB


In [5]:
# Apply the project's `clean` helper to every README text and store the
# result in a new column (the preview shows one token list per row).
repos_df['clean_text'] = repos_df['text'].apply(clean)
repos_df.head()

Unnamed: 0,repo,language,text,clean_text
0,AtsushiSakai/PythonRobotics,Python,pythonrobotics githubactionlinuxci githubactio...,"[pythonrobotics, githubactionlinuxci, githubac..."
1,kiloreux/awesome-robotics,Other,awesome robotics awesome list various book cou...,"[awesome, robotics, awesome, list, various, bo..."
2,NxRLab/ModernRobotics,Python,modern robotics mechanic planning control code...,"[modern, robotics, mechanic, planning, control..."
3,mithi/robotics-coursework,Other,want use arduino raspberry pi make robot short...,"[want, use, arduino, raspberry, pi, make, robo..."
4,onlytailei/CppRobotics,C++,cpprobotics cpp implementation pythonrobotics ...,"[cpprobotics, cpp, implementation, pythonrobot..."


In [6]:
# Lemmatize the `text` column (not `clean_text`) with the project's
# `lemmatize` helper and keep the result alongside the other columns.
repos_df['lemmas'] = repos_df['text'].apply(lemmatize)
repos_df.head()

Unnamed: 0,repo,language,text,clean_text,lemmas
0,AtsushiSakai/PythonRobotics,Python,pythonrobotics githubactionlinuxci githubactio...,"[pythonrobotics, githubactionlinuxci, githubac...",pythonrobotics githubactionlinuxci githubactio...
1,kiloreux/awesome-robotics,Other,awesome robotics awesome list various book cou...,"[awesome, robotics, awesome, list, various, bo...",awesome robotics awesome list various book cou...
2,NxRLab/ModernRobotics,Python,modern robotics mechanic planning control code...,"[modern, robotics, mechanic, planning, control...",modern robotics mechanic planning control code...
3,mithi/robotics-coursework,Other,want use arduino raspberry pi make robot short...,"[want, use, arduino, raspberry, pi, make, robo...",want use arduino raspberry pi make robot short...
4,onlytailei/CppRobotics,C++,cpprobotics cpp implementation pythonrobotics ...,"[cpprobotics, cpp, implementation, pythonrobot...",cpprobotics cpp implementation pythonrobotics ...


In [7]:
# NOTE(review): disabled row filter, kept for reference only. As written,
# .str.len() would measure the length of each whole `lemmas` value, so the
# 2..12 bounds look intended for single words rather than full READMEs —
# confirm the intent before re-enabling.
# repos_df = repos_df[
#     (repos_df['lemmas'].str.len() >= 2) &
#     (repos_df['lemmas'].str.len() <= 12) &
#     (repos_df['lemmas'] != 'ro')
# ]

In [8]:
# Display the frame (pandas elides the middle rows) to check the added columns.
repos_df

Unnamed: 0,repo,language,text,clean_text,lemmas
0,AtsushiSakai/PythonRobotics,Python,pythonrobotics githubactionlinuxci githubactio...,"[pythonrobotics, githubactionlinuxci, githubac...",pythonrobotics githubactionlinuxci githubactio...
1,kiloreux/awesome-robotics,Other,awesome robotics awesome list various book cou...,"[awesome, robotics, awesome, list, various, bo...",awesome robotics awesome list various book cou...
2,NxRLab/ModernRobotics,Python,modern robotics mechanic planning control code...,"[modern, robotics, mechanic, planning, control...",modern robotics mechanic planning control code...
3,mithi/robotics-coursework,Other,want use arduino raspberry pi make robot short...,"[want, use, arduino, raspberry, pi, make, robo...",want use arduino raspberry pi make robot short...
4,onlytailei/CppRobotics,C++,cpprobotics cpp implementation pythonrobotics ...,"[cpprobotics, cpp, implementation, pythonrobot...",cpprobotics cpp implementation pythonrobotics ...
...,...,...,...,...,...
670,RajashekarRaju/compose-actors,Other,compose actor dancer roadmap v030 x let user s...,"[compose, actor, dancer, roadmap, v030, x, let...",compose actor dancer roadmap v030 x let user s...
671,MMehrez/MPC-and-MHE-implementation-in-MATLAB-u...,Other,failtoloadreadme,[failtoloadreadme],failtoloadreadme
673,1c7/Translate-Subtitle-File,Other,srt vtt api key api 100 api 510 2023130 420 20...,"[srt, vtt, api, key, api, 100, api, 510, 20231...",srt vtt api key api 100 api 510 2023130 420 20...
674,landy22granatt/Kumpulan-Script-Termux,Python,pkg update pkg upgrade pkg install bash pkg in...,"[pkg, update, pkg, upgrade, pkg, install, bash...",pkg update pkg upgrade pkg install bash pkg in...


In [9]:
# Features are the `text` column; the target is each repo's language label.
X, y = repos_df['text'], repos_df['language']

# 70/30 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [10]:
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf = TfidfVectorizer()

# Fit on the training texts only, then reuse the fitted vectorizer to
# transform the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Peek at the first ten rows; the sparse repr reports shape and stored elements.
X_train_tfidf[:10]

<10x36021 sparse matrix of type '<class 'numpy.float64'>'
	with 2285 stored elements in Compressed Sparse Row format>

In [11]:
# Baseline: percentage of training rows that belong to the most frequent
# class, i.e. the accuracy of always predicting that class.
# Note this is on a 0-100 scale.
baseline_acc = y_train.value_counts().max() / len(y_train) * 100

print(f'baseline: {round(baseline_acc, 2)}.')

baseline: 35.21.


In [12]:
# Dense, labelled view of the training TF-IDF matrix, for inspection only.
# toarray() returns a plain ndarray, whereas todense() returns the legacy
# np.matrix type; the resulting frame is the same.
pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,00,000,0000,00000,000000,0000000,000000006,0000000s,00001,00006,...,ztractor,zuhlke,zulip,zup,zurich,zuxin,zv,zxy,zynq,zyx
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
422,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
423,0.014681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
424,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# logistic regression model

In [13]:
# Logistic regression with scikit-learn's default hyperparameters.
lm = LogisticRegression()

# Train on the TF-IDF features of the training split.
lm.fit(X_train_tfidf, y_train)

In [14]:
# True labels next to the logistic-regression predictions on the training split.
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=lm.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
102,Other,Other
450,Python,Python
120,Python,Python
306,Other,Other
676,Other,Other


In [15]:
# Per-class precision/recall/F1 for logistic regression on the training split.
print(class_rep(y_train_res['actual'], y_train_res['preds']))

              precision    recall  f1-score   support

         C++       0.99      0.99      0.99       143
       Other       0.99      0.99      0.99       133
      Python       0.99      0.98      0.98       150

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426



# Logistic Regression Test

In [16]:
# True labels next to the logistic-regression predictions on the held-out split.
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=lm.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
323,Other,Other
404,Python,Python
518,Other,Other
629,C++,Python
132,Python,Python


In [17]:
# Per-class precision/recall/F1 for logistic regression on the held-out split.
print(class_rep(y_test_res['actual'], y_test_res['preds']))

              precision    recall  f1-score   support

         C++       0.70      0.69      0.69        64
       Other       0.69      0.70      0.70        67
      Python       0.71      0.71      0.71        52

    accuracy                           0.70       183
   macro avg       0.70      0.70      0.70       183
weighted avg       0.70      0.70      0.70       183



# KNN Classifier Model

In [18]:
# k-nearest-neighbours classifier with scikit-learn's default hyperparameters.
knn = KNeighborsClassifier()

# Fit on the TF-IDF features of the training split.
knn.fit(X_train_tfidf, y_train)

In [19]:
# True labels next to the KNN predictions on the training split.
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=knn.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
102,Other,Other
450,Python,Python
120,Python,Python
306,Other,Other
676,Other,Other


In [20]:
# Per-class precision/recall/F1 for KNN on the training split.
print(class_rep(y_train_res['actual'], y_train_res['preds']))

              precision    recall  f1-score   support

         C++       0.68      0.82      0.74       143
       Other       0.66      0.73      0.70       133
      Python       0.88      0.63      0.73       150

    accuracy                           0.72       426
   macro avg       0.74      0.72      0.72       426
weighted avg       0.74      0.72      0.72       426



# KNN test

In [21]:
# True labels next to the KNN predictions on the held-out split.
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=knn.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
323,Other,Other
404,Python,Python
518,Other,Other
629,C++,Python
132,Python,C++


In [22]:
# Per-class precision/recall/F1 for KNN on the held-out split.
print(class_rep(y_test_res['actual'], y_test_res['preds']))

              precision    recall  f1-score   support

         C++       0.58      0.72      0.64        64
       Other       0.67      0.72      0.69        67
      Python       0.72      0.44      0.55        52

    accuracy                           0.64       183
   macro avg       0.66      0.63      0.63       183
weighted avg       0.65      0.64      0.63       183



# Random Forest Classifier Model

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples and per-split feature
# subsets). Seeding the estimator makes the reported metrics repeatable
# across kernel restarts; 42 matches the seed used for train_test_split.
rf = RandomForestClassifier(random_state=42)

# Train on the TF-IDF features of the training split.
rf.fit(X_train_tfidf, y_train)

In [24]:
# True labels next to the random-forest predictions on the training split.
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=rf.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
102,Other,Other
450,Python,Python
120,Python,Python
306,Other,Other
676,Other,Other


In [25]:
# Per-class precision/recall/F1 for the random forest on the training split.
print(class_rep(y_train_res['actual'], y_train_res['preds']))

              precision    recall  f1-score   support

         C++       1.00      1.00      1.00       143
       Other       0.99      1.00      1.00       133
      Python       1.00      0.99      1.00       150

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



In [26]:
# True labels next to the random-forest predictions on the held-out split.
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=rf.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
323,Other,Other
404,Python,Python
518,Other,Other
629,C++,Python
132,Python,Python


In [27]:
# Per-class precision/recall/F1 for the random forest on the held-out split.
print(class_rep(y_test_res['actual'], y_test_res['preds']))

              precision    recall  f1-score   support

         C++       0.79      0.70      0.74        64
       Other       0.71      0.78      0.74        67
      Python       0.74      0.75      0.74        52

    accuracy                           0.74       183
   macro avg       0.75      0.74      0.74       183
weighted avg       0.75      0.74      0.74       183



# XGBClassifier Model

In [38]:
from sklearn.preprocessing import LabelEncoder

# Features and string labels, taken again from the frame.
X, y = repos_df['text'], repos_df['language']

# Encode the language names as integer class ids; the encoder is kept so the
# ids can be mapped back to names afterwards.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Same split parameters as before (70% train, seed 42), but on the encoded
# labels. This rebinds X_train / X_test / y_train / y_test, so the cells that
# follow see integer labels rather than language names.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, train_size=0.7, random_state=42
)

# Refit TF-IDF on the training texts and reuse it for the test texts.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Small gradient-boosted model: 2 trees of depth 3 with learning rate 1,
# emitting per-class probabilities over every encoded class.
bst = XGBClassifier(
    n_estimators=2,
    max_depth=3,
    learning_rate=1,
    objective='multi:softprob',
    num_class=len(label_encoder.classes_),
)
bst.fit(X_train_tfidf, y_train)

# Predicted class ids for the test split, plus the same predictions mapped
# back to the original language names.
preds = bst.predict(X_test_tfidf)
preds_decoded = label_encoder.inverse_transform(preds)


In [39]:
# Encoded true labels next to the XGBoost predictions on the training split
# (both columns hold integer class ids here).
y_train_res = pd.DataFrame(
    dict(actual=y_train, preds=bst.predict(X_train_tfidf))
)
y_train_res.head()

Unnamed: 0,actual,preds
0,1,1
1,2,2
2,2,2
3,1,0
4,1,1


In [40]:
# Per-class precision/recall/F1 for XGBoost on the training split; the classes
# are reported by their encoded integer ids.
print(class_rep(y_train_res['actual'], y_train_res['preds']))

              precision    recall  f1-score   support

           0       0.70      0.45      0.55       143
           1       0.47      0.84      0.60       133
           2       0.83      0.53      0.65       150

    accuracy                           0.60       426
   macro avg       0.67      0.61      0.60       426
weighted avg       0.68      0.60      0.60       426



In [41]:
# Encoded true labels next to the XGBoost predictions on the held-out split
# (both columns hold integer class ids here).
y_test_res = pd.DataFrame(
    dict(actual=y_test, preds=bst.predict(X_test_tfidf))
)
y_test_res.head()

Unnamed: 0,actual,preds
0,1,1
1,2,2
2,1,1
3,0,2
4,2,2


In [42]:
# Per-class precision/recall/F1 for XGBoost on the held-out split; the classes
# are reported by their encoded integer ids.
print(class_rep(y_test_res['actual'], y_test_res['preds']))

              precision    recall  f1-score   support

           0       0.61      0.39      0.48        64
           1       0.54      0.82      0.65        67
           2       0.71      0.56      0.62        52

    accuracy                           0.60       183
   macro avg       0.62      0.59      0.58       183
weighted avg       0.61      0.60      0.58       183

