# Modeling

In [2]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack




from sklearn.metrics import classification_report as class_rep

In [3]:
# Load the preprocessed repository data; the first CSV column becomes the index.
repos_df = pd.read_csv('processed_repos.csv', index_col=0)

# Preview the frame without the 'repo' column. The result is not assigned,
# so repos_df itself still holds 'repo' after this cell.
repos_df.drop(columns='repo')

Unnamed: 0,language,text,bigrams,trigrams
0,Python,pythonrobotics githubactionlinuxci githubactio...,path_planning grid_based path_tracking steerin...,pythonroboticsfigure1png_master_atsushisakaipy...
2,Python,modern robotics mechanic planning control code...,modern_robotics robotics_mechanic mechanic_pla...,modern_robotics_mechanic robotics_mechanic_pla...
4,C++,cpprobotics cpp implementation pythonrobotics ...,red_circle black_line steering_control point_r...,speed_steering_control black_line_planned line...
5,Other,roboticsacademy learn robotics artificial inte...,learn_robotics mini_radi roboticsacademy_learn...,roboticsacademy_learn_robotics learn_robotics_...
6,C++,probabilisticrobotics working detailed solutio...,probabilistic_robotics victoria_park park_data...,victoria_park_dataset probabilisticrobotics_wo...
...,...,...,...,...
665,Other,foc english solidworks matlab simulink simscap...,2000_2000 solidworks_matlab 15000_2000 foc_eng...,15000_2000_2000 foc_english_solidworks english...
667,Other,bonsai rust implementation behavior tree build...,behavior_tree mut_bt bonsaibtsuccess_dt dt_els...,bonsaibtsuccess_dt_else b_run_b run_b_parallel...
669,Python,easytouse instruction generation framework lar...,easyinstruct_import example_python python_easy...,example_python_easyinstruct python_easyinstruc...
670,Other,compose actor dancer roadmap v030 x let user s...,x_add detail_screen tmdb_api ui_state screen_x...,ha_viewmodel_manage viewmodel_manage_ui manage...


In [4]:
# Remove the 'repo' column and every row that contains a missing value,
# then summarise what is left.
# errors='ignore' keeps the cell re-runnable: on a second run 'repo' is already
# gone and a plain drop would raise KeyError.
repos_df = repos_df.drop(columns='repo', errors='ignore').dropna()
repos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 541 entries, 0 to 674
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   language  541 non-null    object
 1   text      541 non-null    object
 2   bigrams   541 non-null    object
 3   trigrams  541 non-null    object
dtypes: object(4)
memory usage: 21.1+ KB


In [5]:
# Display the frame as it stands after the cleaning cell above (pandas truncates long frames).
repos_df

Unnamed: 0,language,text,bigrams,trigrams
0,Python,pythonrobotics githubactionlinuxci githubactio...,path_planning grid_based path_tracking steerin...,pythonroboticsfigure1png_master_atsushisakaipy...
2,Python,modern robotics mechanic planning control code...,modern_robotics robotics_mechanic mechanic_pla...,modern_robotics_mechanic robotics_mechanic_pla...
4,C++,cpprobotics cpp implementation pythonrobotics ...,red_circle black_line steering_control point_r...,speed_steering_control black_line_planned line...
5,Other,roboticsacademy learn robotics artificial inte...,learn_robotics mini_radi roboticsacademy_learn...,roboticsacademy_learn_robotics learn_robotics_...
6,C++,probabilisticrobotics working detailed solutio...,probabilistic_robotics victoria_park park_data...,victoria_park_dataset probabilisticrobotics_wo...
...,...,...,...,...
665,Other,foc english solidworks matlab simulink simscap...,2000_2000 solidworks_matlab 15000_2000 foc_eng...,15000_2000_2000 foc_english_solidworks english...
667,Other,bonsai rust implementation behavior tree build...,behavior_tree mut_bt bonsaibtsuccess_dt dt_els...,bonsaibtsuccess_dt_else b_run_b run_b_parallel...
669,Python,easytouse instruction generation framework lar...,easyinstruct_import example_python python_easy...,example_python_easyinstruct python_easyinstruc...
670,Other,compose actor dancer roadmap v030 x let user s...,x_add detail_screen tmdb_api ui_state screen_x...,ha_viewmodel_manage viewmodel_manage_ui manage...


In [9]:
# Feature set 1: TF-IDF weights computed from the 'text' column.
X = repos_df['text']
y = repos_df['language']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer on the test split.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Print the sparse TF-IDF entries of the first 10 training rows.
print(X_train_tfidf[:10])

  (0, 7164)	0.022659303985166024
  (0, 21728)	0.022659303985166024
  (0, 7172)	0.022659303985166024
  (0, 7163)	0.022659303985166024
  (0, 10397)	0.008184292619263874
  (0, 10699)	0.008117708248989086
  (0, 17665)	0.022659303985166024
  (0, 12400)	0.030227076091948184
  (0, 21702)	0.019334317321012044
  (0, 6897)	0.011281390741219541
  (0, 6101)	0.022659303985166024
  (0, 20163)	0.01468613328327391
  (0, 9566)	0.04531860797033205
  (0, 16120)	0.009863036043695546
  (0, 9565)	0.04531860797033205
  (0, 21727)	0.04531860797033205
  (0, 21729)	0.04531860797033205
  (0, 5846)	0.010978820030730106
  (0, 2572)	0.010836498010374696
  (0, 11964)	0.014303806694884088
  (0, 9559)	0.060841045277857136
  (0, 20866)	0.022659303985166024
  (0, 21612)	0.022659303985166024
  (0, 15537)	0.006474571148011304
  (0, 21609)	0.022659303985166024
  :	:
  (9, 10441)	0.011077005264371872
  (9, 6986)	0.010992941419332396
  (9, 15054)	0.014401105071371496
  (9, 4199)	0.04930426447828915
  (9, 28359)	0.03133059764

In [10]:
# Feature set 2: TF-IDF weights computed from the 'bigrams' column.
X2 = repos_df['bigrams']
y2 = repos_df['language']

# 70/30 train/test split with a fixed seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    train_size=0.7,
    random_state=42,
)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer on the test split.
tfidf2 = TfidfVectorizer()
X2_train_tfidf = tfidf2.fit_transform(X2_train)
X2_test_tfidf = tfidf2.transform(X2_test)

# Print the sparse TF-IDF entries of the first 10 training rows of this set.
print(X2_train_tfidf[:10])

  (0, 4601)	0.2236067977499789
  (0, 1266)	0.2236067977499789
  (0, 2092)	0.2236067977499789
  (0, 1040)	0.2236067977499789
  (0, 5109)	0.2236067977499789
  (0, 4002)	0.2236067977499789
  (0, 958)	0.2236067977499789
  (0, 1304)	0.2236067977499789
  (0, 4440)	0.2236067977499789
  (0, 1311)	0.2236067977499789
  (0, 4175)	0.2236067977499789
  (0, 5793)	0.2236067977499789
  (0, 5108)	0.2236067977499789
  (0, 4571)	0.2236067977499789
  (0, 1835)	0.2236067977499789
  (0, 3572)	0.2236067977499789
  (0, 4570)	0.2236067977499789
  (0, 1926)	0.2236067977499789
  (0, 1267)	0.2236067977499789
  (0, 554)	0.2236067977499789
  (1, 4890)	0.2236067977499789
  (1, 2228)	0.2236067977499789
  (1, 4905)	0.2236067977499789
  (1, 3327)	0.2236067977499789
  (1, 3056)	0.2236067977499789
  :	:
  (8, 4021)	0.18223577135737892
  (8, 4898)	0.2279720150842213
  (8, 1660)	0.2279720150842213
  (8, 584)	0.2131691741051027
  (8, 1092)	0.2279720150842213
  (9, 4048)	0.22628997366453707
  (9, 3210)	0.22628997366453707
  

In [11]:
# Feature set 3: TF-IDF weights computed from the 'trigrams' column.
X3 = repos_df['trigrams']
y3 = repos_df['language']

# 70/30 train/test split with a fixed seed.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3,
    y3,
    train_size=0.7,
    random_state=42,
)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer on the test split.
tfidf3 = TfidfVectorizer()
X3_train_tfidf = tfidf3.fit_transform(X3_train)
X3_test_tfidf = tfidf3.transform(X3_test)

# Print the sparse TF-IDF entries of the first 10 training rows of the third set.
print(X3_train_tfidf[:10])

  (0, 4872)	0.2236067977499789
  (0, 1992)	0.2236067977499789
  (0, 1427)	0.2236067977499789
  (0, 4216)	0.2236067977499789
  (0, 2015)	0.2236067977499789
  (0, 1858)	0.2236067977499789
  (0, 6481)	0.2236067977499789
  (0, 4870)	0.2236067977499789
  (0, 2310)	0.2236067977499789
  (0, 1179)	0.2236067977499789
  (0, 5693)	0.2236067977499789
  (0, 1461)	0.2236067977499789
  (0, 2111)	0.2236067977499789
  (0, 4905)	0.2236067977499789
  (0, 1475)	0.2236067977499789
  (0, 2112)	0.2236067977499789
  (0, 4590)	0.2236067977499789
  (0, 617)	0.2236067977499789
  (0, 5075)	0.2236067977499789
  (0, 1428)	0.2236067977499789
  (1, 2349)	0.2243125249588985
  (1, 5484)	0.2243125249588985
  (1, 914)	0.2243125249588985
  (1, 5245)	0.2243125249588985
  (1, 965)	0.2243125249588985
  :	:
  (8, 5383)	0.2236067977499789
  (8, 4572)	0.2236067977499789
  (8, 1816)	0.2236067977499789
  (8, 5931)	0.2236067977499789
  (8, 646)	0.2236067977499789
  (9, 6215)	0.2243125249588985
  (9, 4431)	0.2243125249588985
  (9, 

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Combined feature set: a separate TF-IDF matrix for each of 'text', 'bigrams'
# and 'trigrams', stacked side by side into one sparse matrix.
X = repos_df[['text', 'bigrams', 'trigrams']]

# Target variable
y = repos_df.language

# Split the data. NOTE: this rebinds X_train / X_test / y_train / y_test that the
# 'text'-only cell defined earlier; train_size and random_state are the same as there.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# One vectorizer per column so each column gets its own vocabulary
tfidf_text = TfidfVectorizer()
tfidf_bigrams = TfidfVectorizer()
tfidf_trigrams = TfidfVectorizer()

# Fit each vectorizer on the training rows only, then transform the test rows
# with the already-fitted vectorizer
X_train_text_tfidf = tfidf_text.fit_transform(X_train['text'])
X_test_text_tfidf = tfidf_text.transform(X_test['text'])
X_train_bigrams_tfidf = tfidf_bigrams.fit_transform(X_train['bigrams'])
X_test_bigrams_tfidf = tfidf_bigrams.transform(X_test['bigrams'])
X_train_trigrams_tfidf = tfidf_trigrams.fit_transform(X_train['trigrams'])
X_test_trigrams_tfidf = tfidf_trigrams.transform(X_test['trigrams'])

# Stack the three TF-IDF blocks column-wise. X holds only the three text columns,
# so there are no remaining "original" features to append: dropping those columns
# leaves a zero-column frame, which is therefore not passed to hstack.
# format='csr' pins the type of the stacked result.
X_train_combined = hstack(
    (X_train_text_tfidf, X_train_bigrams_tfidf, X_train_trigrams_tfidf),
    format='csr',
)
X_test_combined = hstack(
    (X_test_text_tfidf, X_test_bigrams_tfidf, X_test_trigrams_tfidf),
    format='csr',
)

In [13]:
# X = repos_df.text
# y = repos_df.language

# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
#                                                     random_state=42)

In [14]:
# tfidf = TfidfVectorizer()

# X_train_tfidf = tfidf.fit_transform(X_train)
# X_test_tfidf = tfidf.transform(X_test)

# X_train_tfidf[:10]

In [15]:
# Baseline: share of the training rows (in percent) that belong to the most
# frequent class, i.e. the accuracy of always predicting that class.
baseline_acc = y_train.value_counts().max() / len(y_train) * 100

print(f'baseline: {round(baseline_acc, 2)}.')

baseline: 38.89.


In [16]:
# Dense view of the training TF-IDF matrix: one row per training document,
# one column per term in the vocabulary of the fitted `tfidf` vectorizer.
pd.DataFrame(
    data=X_train_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

Unnamed: 0,00,000,0000,000000,000000006,0000000s,00001,00008,00009,0001,...,ztranslation,zuhlke,zup,zurich,zv,zxvf,zxy,zypper,zyx,zyxconvention
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Logistic Regression Model

In [17]:
# Logistic regression with default hyperparameters, trained on the TF-IDF
# training features.
lm = LogisticRegression()
lm.fit(X=X_train_tfidf, y=y_train)

In [18]:
# Training labels next to the logistic-regression predictions for the same rows.
y_train_res = pd.DataFrame(
    {
        'actual': y_train,
        'preds': lm.predict(X_train_tfidf),
    }
)
y_train_res.head(5)

Unnamed: 0,actual,preds
537,Other,Other
466,C++,C++
79,Python,Python
426,Python,Python
36,C++,C++


In [19]:
# Per-class precision / recall / F1 of logistic regression on the training split.
print(class_rep(y_true=y_train_res['actual'], y_pred=y_train_res['preds']))

              precision    recall  f1-score   support

         C++       0.98      0.99      0.99       147
       Other       1.00      0.95      0.97        91
      Python       0.98      1.00      0.99       140

    accuracy                           0.98       378
   macro avg       0.99      0.98      0.98       378
weighted avg       0.98      0.98      0.98       378



# Logistic Regression Test

In [20]:
# Held-out labels next to the logistic-regression predictions for the same rows.
y_test_res = pd.DataFrame(
    {
        'actual': y_test,
        'preds': lm.predict(X_test_tfidf),
    }
)
y_test_res.head(5)

Unnamed: 0,actual,preds
296,Other,C++
101,C++,C++
447,Python,Python
117,C++,C++
590,C++,C++


In [21]:
# Per-class precision / recall / F1 of logistic regression on the test split.
print(class_rep(y_true=y_test_res['actual'], y_pred=y_test_res['preds']))

              precision    recall  f1-score   support

         C++       0.64      0.87      0.74        60
       Other       0.75      0.35      0.48        43
      Python       0.79      0.82      0.80        60

    accuracy                           0.71       163
   macro avg       0.73      0.68      0.67       163
weighted avg       0.73      0.71      0.69       163



# KNN Classifier Model

In [22]:
# K-nearest-neighbours classifier with default hyperparameters, trained on the
# TF-IDF training features.
knn = KNeighborsClassifier()
knn.fit(X=X_train_tfidf, y=y_train)

In [23]:
# Training labels next to the KNN predictions for the same rows.
y_train_res = pd.DataFrame(
    {
        'actual': y_train,
        'preds': knn.predict(X_train_tfidf),
    }
)
y_train_res.head(5)

Unnamed: 0,actual,preds
537,Other,Python
466,C++,C++
79,Python,Python
426,Python,Python
36,C++,C++


In [24]:
# Per-class precision / recall / F1 of KNN on the training split.
print(class_rep(y_true=y_train_res['actual'], y_pred=y_train_res['preds']))

              precision    recall  f1-score   support

         C++       0.70      0.91      0.79       147
       Other       0.77      0.60      0.68        91
      Python       0.87      0.72      0.79       140

    accuracy                           0.77       378
   macro avg       0.78      0.75      0.75       378
weighted avg       0.78      0.77      0.76       378



# KNN test

In [25]:
# Held-out labels next to the KNN predictions for the same rows.
y_test_res = pd.DataFrame(
    {
        'actual': y_test,
        'preds': knn.predict(X_test_tfidf),
    }
)
y_test_res.head(5)

Unnamed: 0,actual,preds
296,Other,C++
101,C++,C++
447,Python,Python
117,C++,C++
590,C++,C++


In [26]:
# Per-class precision / recall / F1 of KNN on the test split.
print(class_rep(y_true=y_test_res['actual'], y_pred=y_test_res['preds']))

              precision    recall  f1-score   support

         C++       0.56      0.85      0.68        60
       Other       0.67      0.37      0.48        43
      Python       0.75      0.60      0.67        60

    accuracy                           0.63       163
   macro avg       0.66      0.61      0.61       163
weighted avg       0.66      0.63      0.62       163



# Random Forest Classifier Model

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the TF-IDF training features.
# random_state is fixed (same seed as the train/test splits) because forest
# construction is randomized; without a seed the scores reported below change
# from run to run.
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train_tfidf, y_train)

In [28]:
# Training labels next to the random-forest predictions for the same rows.
y_train_res = pd.DataFrame(
    {
        'actual': y_train,
        'preds': rf.predict(X_train_tfidf),
    }
)
y_train_res.head(5)

Unnamed: 0,actual,preds
537,Other,Other
466,C++,C++
79,Python,Python
426,Python,Python
36,C++,C++


In [29]:
# Per-class precision / recall / F1 of the random forest on the training split.
print(class_rep(y_true=y_train_res['actual'], y_pred=y_train_res['preds']))

              precision    recall  f1-score   support

         C++       1.00      1.00      1.00       147
       Other       1.00      1.00      1.00        91
      Python       1.00      1.00      1.00       140

    accuracy                           1.00       378
   macro avg       1.00      1.00      1.00       378
weighted avg       1.00      1.00      1.00       378



In [30]:
# Held-out labels next to the random-forest predictions for the same rows.
y_test_res = pd.DataFrame(
    {
        'actual': y_test,
        'preds': rf.predict(X_test_tfidf),
    }
)
y_test_res.head(5)

Unnamed: 0,actual,preds
296,Other,C++
101,C++,C++
447,Python,Python
117,C++,C++
590,C++,C++


In [31]:
# Per-class precision / recall / F1 of the random forest on the test split.
print(class_rep(y_true=y_test_res['actual'], y_pred=y_test_res['preds']))

              precision    recall  f1-score   support

         C++       0.57      0.88      0.69        60
       Other       0.82      0.33      0.47        43
      Python       0.81      0.72      0.76        60

    accuracy                           0.67       163
   macro avg       0.73      0.64      0.64       163
weighted avg       0.73      0.67      0.66       163



# XGBClassifier Model

In [32]:
from sklearn.preprocessing import LabelEncoder

# Inputs for the XGBoost model: raw 'text' documents and string language labels.
X = repos_df['text']
y = repos_df['language']

# Map the string class labels to integer codes; the fitted encoder is kept so
# the codes can be mapped back to names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Re-split using the encoded target (this rebinds X_train / X_test / y_train / y_test).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    train_size=0.7,
    random_state=42,
)

# Refit a TF-IDF vectorizer on the new training split and apply it to the test split.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# XGBoost multi-class classifier; num_class is taken from the number of
# classes the label encoder saw.
bst = XGBClassifier(
    n_estimators=2,
    max_depth=3,
    learning_rate=1,
    objective='multi:softprob',
    num_class=len(label_encoder.classes_),
)

# Train on the TF-IDF training features.
bst.fit(X_train_tfidf, y_train)

# Predicted integer class codes for the test split ...
preds = bst.predict(X_test_tfidf)

# ... and the same predictions mapped back to the original class names.
preds_decoded = label_encoder.inverse_transform(preds)


In [33]:
# Encoded training labels next to the XGBoost predictions (integer class codes).
y_train_res = pd.DataFrame(
    {
        'actual': y_train,
        'preds': bst.predict(X_train_tfidf),
    }
)
y_train_res.head(5)

Unnamed: 0,actual,preds
0,1,0
1,0,0
2,2,2
3,2,2
4,0,0


In [34]:
# Training-split report for XGBoost, labelled with the original language names
# rather than the integer codes. LabelEncoder assigns codes 0..n-1 in the order
# of label_encoder.classes_, so `labels` lists the codes and `target_names`
# gives the matching name for each.
print(class_rep(
    y_train_res.actual,
    y_train_res.preds,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=list(label_encoder.classes_),
))

              precision    recall  f1-score   support

           0       0.78      0.95      0.86       147
           1       0.95      0.66      0.78        91
           2       0.93      0.90      0.91       140

    accuracy                           0.86       378
   macro avg       0.89      0.84      0.85       378
weighted avg       0.88      0.86      0.86       378



In [35]:
# Encoded held-out labels next to the XGBoost predictions (integer class codes).
y_test_res = pd.DataFrame(
    {
        'actual': y_test,
        'preds': bst.predict(X_test_tfidf),
    }
)
y_test_res.head(5)

Unnamed: 0,actual,preds
0,1,1
1,0,0
2,2,2
3,0,0
4,0,0


In [36]:
# Test-split report for XGBoost, labelled with the original language names
# rather than the integer codes. LabelEncoder assigns codes 0..n-1 in the order
# of label_encoder.classes_, so `labels` lists the codes and `target_names`
# gives the matching name for each.
print(class_rep(
    y_test_res.actual,
    y_test_res.preds,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=list(label_encoder.classes_),
))

              precision    recall  f1-score   support

           0       0.62      0.83      0.71        60
           1       0.80      0.37      0.51        43
           2       0.76      0.80      0.78        60

    accuracy                           0.70       163
   macro avg       0.73      0.67      0.67       163
weighted avg       0.72      0.70      0.68       163

