# Modeling

In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack




from sklearn.metrics import classification_report as class_rep
from prepare import clean, lemmatize

In [2]:
# reading in file to generate dataframe

In [3]:
repos_df = pd.read_csv('processed_repos.csv', index_col=0)
repos_df.drop(columns=('repo'))

Unnamed: 0,language,text,bigrams,trigrams
0,Python,pythonrobotics githubactionlinuxci githubactio...,path_planning grid_based path_tracking steerin...,pythonroboticsfigure1png_master_atsushisakaipy...
2,Python,modern robotics mechanic planning control code...,modern_robotics robotics_mechanic mechanic_pla...,modern_robotics_mechanic robotics_mechanic_pla...
4,C++,cpprobotics cpp implementation pythonrobotics ...,red_circle black_line steering_control point_r...,speed_steering_control black_line_planned line...
5,Other,roboticsacademy learn robotics artificial inte...,learn_robotics mini_radi roboticsacademy_learn...,roboticsacademy_learn_robotics learn_robotics_...
6,C++,probabilisticrobotics working detailed solutio...,probabilistic_robotics victoria_park park_data...,victoria_park_dataset probabilisticrobotics_wo...
...,...,...,...,...
665,Other,foc english solidworks matlab simulink simscap...,2000_2000 solidworks_matlab 15000_2000 foc_eng...,15000_2000_2000 foc_english_solidworks english...
667,Other,bonsai rust implementation behavior tree build...,behavior_tree mut_bt bonsaibtsuccess_dt dt_els...,bonsaibtsuccess_dt_else b_run_b run_b_parallel...
669,Python,easytouse instruction generation framework lar...,easyinstruct_import example_python python_easy...,example_python_easyinstruct python_easyinstruc...
670,Other,compose actor dancer roadmap v030 x let user s...,x_add detail_screen tmdb_api ui_state screen_x...,ha_viewmodel_manage viewmodel_manage_ui manage...


In [4]:
repos_df = pd.DataFrame(repos_df)
repos_df = repos_df.drop(columns=['repo', 'bigrams', 'trigrams'])
repos_df = repos_df.dropna()
repos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 541 entries, 0 to 674
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   language  541 non-null    object
 1   text      541 non-null    object
dtypes: object(2)
memory usage: 12.7+ KB


In [5]:
repos_df

Unnamed: 0,language,text
0,Python,pythonrobotics githubactionlinuxci githubactio...
2,Python,modern robotics mechanic planning control code...
4,C++,cpprobotics cpp implementation pythonrobotics ...
5,Other,roboticsacademy learn robotics artificial inte...
6,C++,probabilisticrobotics working detailed solutio...
...,...,...
665,Other,foc english solidworks matlab simulink simscap...
667,Other,bonsai rust implementation behavior tree build...
669,Python,easytouse instruction generation framework lar...
670,Other,compose actor dancer roadmap v030 x let user s...


In [6]:
from sklearn.model_selection import train_test_split

X = repos_df.text
y = repos_df.language

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.7, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)
X_train_tfidf[:10]

<10x28955 sparse matrix of type '<class 'numpy.float64'>'
	with 2121 stored elements in Compressed Sparse Row format>

In [7]:
train_baseline_acc = y_train.value_counts().max() / y_train.shape[0] * 100

print(f'baseline: {round(train_baseline_acc, 2)}.')

baseline: 38.89.


In [8]:
val_baseline_acc = y_val.value_counts().max() / y_val.shape[0] * 100

print(f'baseline: {round(val_baseline_acc, 2)}.')

baseline: 37.04.


In [9]:
# pd.DataFrame(X_train_tfidf.todense(), columns=tfidf.get_feature_names_out())

# Logistic regression model

In [10]:
lm = LogisticRegression(
    penalty='l2',  # L2 regularization (Ridge)
    C=1.0,  # Inverse of regularization strength
    fit_intercept=False,  # Include an intercept
    class_weight='balanced',  # You can set class weights if needed
    solver='liblinear',  # Choose a solver appropriate for your data
    max_iter=100,  # You may need to increase this if the model doesn't converge
    random_state=42  # For reproducibility
)

lm.fit(X_train_tfidf, y_train)

In [11]:
y_train_res = pd.DataFrame({'actual': y_train,
                            'preds': lm.predict(X_train_tfidf)})
y_train_res.head()

Unnamed: 0,actual,preds
537,Other,Other
466,C++,C++
79,Python,Python
426,Python,Python
36,C++,C++


In [12]:
print(class_rep(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

         C++       0.97      0.98      0.97       147
       Other       0.99      0.95      0.97        91
      Python       0.96      0.98      0.97       140

    accuracy                           0.97       378
   macro avg       0.97      0.97      0.97       378
weighted avg       0.97      0.97      0.97       378



In [13]:
train_accuracy = accuracy_score(y_train_res['actual'], y_train_res['preds'])
print(f'Train Accuracy: {train_accuracy:.2f}')

Train Accuracy: 0.97


# Logistic Regression Validate

In [14]:
y_val_res = pd.DataFrame({'actual': y_val,
                           'preds': lm.predict(X_val_tfidf)})
y_val_res.head()

Unnamed: 0,actual,preds
110,Other,C++
341,C++,C++
120,Python,Python
236,C++,C++
351,Other,Other


In [15]:
print(class_rep(y_val_res.actual, y_val_res.preds))

              precision    recall  f1-score   support

         C++       0.72      0.87      0.79        30
       Other       0.80      0.67      0.73        24
      Python       0.84      0.78      0.81        27

    accuracy                           0.78        81
   macro avg       0.79      0.77      0.77        81
weighted avg       0.78      0.78      0.78        81



In [16]:
val_accuracy = accuracy_score(y_val_res['actual'], y_val_res['preds'])
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.78


# KNN Classifier Model

In [17]:
knn = KNeighborsClassifier(
    n_neighbors=2,  
    weights='distance',  # distance
    p=2,  # Euclidean distance
    algorithm='auto',  # 'ball_tree', 'kd_tree', or 'brute'
    leaf_size=30,  
    metric='euclidean'  # You can choose other metrics or provide custom ones
)

knn.fit(X_train_tfidf, y_train)

In [18]:
y_train_res = pd.DataFrame({'actual': y_train,
                            'preds': knn.predict(X_train_tfidf)})
y_train_res.head()

Unnamed: 0,actual,preds
537,Other,Other
466,C++,C++
79,Python,Python
426,Python,Python
36,C++,C++


In [19]:
print(class_rep(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

         C++       1.00      1.00      1.00       147
       Other       1.00      1.00      1.00        91
      Python       1.00      1.00      1.00       140

    accuracy                           1.00       378
   macro avg       1.00      1.00      1.00       378
weighted avg       1.00      1.00      1.00       378



In [20]:
train_accuracy = accuracy_score(y_train_res['actual'], y_train_res['preds'])
print(f'Train Accuracy: {train_accuracy:.2f}')

Train Accuracy: 1.00


# KNN Val

In [21]:
y_val_res = pd.DataFrame({'actual': y_val,
                           'preds': lm.predict(X_val_tfidf)})
y_val_res.head()

Unnamed: 0,actual,preds
110,Other,C++
341,C++,C++
120,Python,Python
236,C++,C++
351,Other,Other


In [22]:
print(class_rep(y_val_res.actual, y_val_res.preds))

              precision    recall  f1-score   support

         C++       0.72      0.87      0.79        30
       Other       0.80      0.67      0.73        24
      Python       0.84      0.78      0.81        27

    accuracy                           0.78        81
   macro avg       0.79      0.77      0.77        81
weighted avg       0.78      0.78      0.78        81



In [23]:
val_accuracy = accuracy_score(y_val_res['actual'], y_val_res['preds'])
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.78


# Random Forest Classier Model

In [24]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=None, #increasing caused scores to decrease
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    bootstrap=True,
    random_state=42,  # For reproducibility
    class_weight='balanced',  # Handle class imbalance
    criterion='entropy'  # Or 'entropy'
)

rf.fit(X_train_tfidf, y_train)

In [25]:
y_train_res = pd.DataFrame({'actual': y_train,
                            'preds': rf.predict(X_train_tfidf)})
y_train_res.head()

Unnamed: 0,actual,preds
537,Other,Other
466,C++,C++
79,Python,Python
426,Python,Python
36,C++,C++


In [26]:
print(class_rep(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

         C++       1.00      1.00      1.00       147
       Other       1.00      1.00      1.00        91
      Python       1.00      1.00      1.00       140

    accuracy                           1.00       378
   macro avg       1.00      1.00      1.00       378
weighted avg       1.00      1.00      1.00       378



In [27]:
train_accuracy = accuracy_score(y_train_res['actual'], y_train_res['preds'])
print(f'Train Accuracy: {train_accuracy:.2f}')

Train Accuracy: 1.00


# RandomForestClassifier Val

In [28]:
y_val_res = pd.DataFrame({'actual': y_val,
                           'preds': lm.predict(X_val_tfidf)})
y_val_res.head()

Unnamed: 0,actual,preds
110,Other,C++
341,C++,C++
120,Python,Python
236,C++,C++
351,Other,Other


In [29]:
print(class_rep(y_val_res.actual, y_val_res.preds))

              precision    recall  f1-score   support

         C++       0.72      0.87      0.79        30
       Other       0.80      0.67      0.73        24
      Python       0.84      0.78      0.81        27

    accuracy                           0.78        81
   macro avg       0.79      0.77      0.77        81
weighted avg       0.78      0.78      0.78        81



In [30]:
val_accuracy = accuracy_score(y_val_res['actual'], y_val_res['preds'])
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.78


# XGBClassifier Model

In [31]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Encode the target labels
y_encoded = label_encoder.fit_transform(repos_df.language)

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(repos_df.text, y_encoded, train_size=0.7, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize and fit the TfidfVectorizer on the training data
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# Create the XGBoost classifier instance
bst = XGBClassifier(n_estimators=100, max_depth=2, learning_rate=0.25, objective='multi:softprob', num_class=len(label_encoder.classes_))

# Fit the XGBoost model on the training data
bst.fit(X_train_tfidf, y_train)

# Predict the classes on the validation data
preds = bst.predict(X_val_tfidf)

# If you want to decode the predicted labels back to their original class names:
preds_decoded = label_encoder.inverse_transform(preds)

In [32]:
y_train_res = pd.DataFrame({'actual': y_train,
                            'preds': bst.predict(X_train_tfidf)})
y_train_res.head()

Unnamed: 0,actual,preds
0,1,1
1,0,0
2,2,2
3,2,2
4,0,0


In [33]:
print(class_rep(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       147
           1       1.00      1.00      1.00        91
           2       1.00      1.00      1.00       140

    accuracy                           1.00       378
   macro avg       1.00      1.00      1.00       378
weighted avg       1.00      1.00      1.00       378



In [34]:
train_accuracy = accuracy_score(y_train_res['actual'], y_train_res['preds'])
print(f'Train Accuracy: {train_accuracy:.2f}')

Train Accuracy: 1.00


In [35]:
y_val_res = pd.DataFrame({'actual': y_val,
                           'preds': bst.predict(X_val_tfidf)})
y_val_res.head()

Unnamed: 0,actual,preds
0,1,0
1,0,0
2,2,2
3,0,1
4,1,0


In [36]:
print(class_rep(y_val_res.actual, y_val_res.preds))

              precision    recall  f1-score   support

           0       0.70      0.87      0.78        30
           1       0.74      0.58      0.65        24
           2       0.84      0.78      0.81        27

    accuracy                           0.75        81
   macro avg       0.76      0.74      0.74        81
weighted avg       0.76      0.75      0.75        81



In [37]:
val_accuracy = accuracy_score(y_val_res['actual'], y_val_res['preds'])
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.75


# Final Model Logisitic Regression with Hyperparameter tuning   

-----

# Logistic Regression Validate

In [31]:
y_test_res = pd.DataFrame({'actual': y_test,
                           'preds': lm.predict(X_test_tfidf)})
y_test_res.head()

Unnamed: 0,actual,preds
456,Python,Python
392,C++,C++
478,C++,C++
416,Python,C++
173,C++,C++


In [32]:
print(class_rep(y_test_res.actual, y_test_res.preds))

              precision    recall  f1-score   support

         C++       0.69      0.83      0.76        30
       Other       0.65      0.58      0.61        19
      Python       0.86      0.76      0.81        33

    accuracy                           0.74        82
   macro avg       0.73      0.72      0.73        82
weighted avg       0.75      0.74      0.74        82



In [33]:
test_accuracy = accuracy_score(y_test_res['actual'], y_test_res['preds'])
print(f'Test Accuracy: {test_accuracy:.2f}')

Test Accuracy: 0.74


In [1]:
import model as m

In [2]:
m.model_1()



Logisitic Regression Model (Hyperparameters Used)

Train Accuracy: 0.97


Validation Accuracy: 0.78



In [3]:
m.model_2()



KNearest Neighbors (Hyperparameters Used)

Train Accuracy: 1.00


Validation Accuracy: 0.63



In [4]:
m.model_3()



XGBClassifier Model (Hyperparameters Used)

Train Accuracy: 1.00


Validation Accuracy: 0.75



In [5]:
m.model_4()

NameError: name 'lm' is not defined