### Mohammed Abunada
### Modelling

#### 1. Imports

In [1]:
import imblearn

In [2]:
from imblearn.over_sampling import SMOTE

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px

import re
import string
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import download

In [4]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [5]:
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
# Download necessary NLTK data
download('punkt')
download('wordnet')
download('omw-1.4')
download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

#### 2. Data Loading

In [7]:
# I read the java, python & pharo train files
# using pd.read_parquet into 3 different pandas files

java = pd.read_parquet("data/java_train-00000-of-00001.parquet")
python = pd.read_parquet("data/python_train-00000-of-00001.parquet")
pharo = pd.read_parquet("data/pharo_train-00000-of-00001.parquet")

#### 3. Text Cleaning

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def clean_text(x):
    ### convert to lowercase
    s = x.lower()
    
    # remove symbols
    s = re.sub(r"[^a-zA-Z\s]", "", s)
    words = word_tokenize(s)
    
    # lematize words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)

In [10]:
java['clean_text'] = java['combo'].apply(clean_text)
python['clean_text'] = python['combo'].apply(clean_text)
pharo['clean_text'] = pharo['combo'].apply(clean_text)

In [11]:
# Convert labels to single value instead of list

In [12]:
java['labels'].apply(lambda x: x.tolist().index(1))

0       0
1       2
2       0
3       2
4       2
       ..
7609    0
7610    0
7611    0
7612    0
7613    0
Name: labels, Length: 7614, dtype: int64

In [13]:
java['labels'] = java['labels'].apply(lambda x: x.tolist().index(1))
python['labels'] = python['labels'].apply(lambda x: x.tolist().index(1))
pharo['labels'] = pharo['labels'].apply(lambda x: x.tolist().index(1))

#### 4. Test vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tf = TfidfVectorizer()
java_tf = tf.fit(java['clean_text'])
java_X = java_tf.transform(java['clean_text'])
java_y = java['labels']

In [16]:
tf = TfidfVectorizer()
python_tf = tf.fit(python['clean_text'])
python_X = python_tf.transform(python['clean_text'])
python_y = python['labels']

In [17]:
tf = TfidfVectorizer()
pharo_tf = tf.fit(pharo['clean_text'])
pharo_X = pharo_tf.transform(pharo['clean_text'])
pharo_y = pharo['labels']

#### 5. Class rebalancing

In [18]:
# Resampling all three using SMOTE

In [19]:
oversample = SMOTE()
java_X, java_y = oversample.fit_resample(java_X, java_y)

In [20]:
oversample = SMOTE()
python_X, python_y = oversample.fit_resample(python_X, python_y)

In [21]:
oversample = SMOTE()
pharo_X, pharo_y = oversample.fit_resample(pharo_X, pharo_y)

#### 5. Model Generation

#### 5.1 Logistic Regression

In [22]:
param_grid = {
    'penalty': ['l1', 'l2', None],  # Regularization terms
    'C': [0.1, 1.0, 10,],   # Inverse regularization strength
}

In [23]:
logistic_regression = LogisticRegression(multi_class='multinomial')

# Set up the grid search
grid_search = GridSearchCV(
    estimator=logistic_regression,
    param_grid=param_grid,
    scoring='accuracy',       # Use accuracy as the scoring metric
    cv=5,                     # 5-fold cross-validation
    verbose=1,                # Output progress messages
    n_jobs=-1                 # Use all available cores
)



###### 5.1.1 Java

In [24]:
grid_search.fit(java_X, java_y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [25]:
grid_search.best_score_

0.9514444004748714

In [26]:
grid_search.best_params_

{'C': 10, 'penalty': 'l2'}

###### 5.1.2 python

In [27]:
grid_search.fit(python_X, python_y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [28]:
grid_search.best_score_

0.7138408304498269

In [29]:
grid_search.best_params_

{'C': 10, 'penalty': 'l2'}

###### 5.1.3 pharo

In [30]:
grid_search.fit(pharo_X, pharo_y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [31]:
grid_search.best_score_

0.9185593458967327

In [32]:
grid_search.best_params_

{'C': 10, 'penalty': 'l2'}

#### 5.2 Decision Tree

In [33]:
param_grid = {
    'criterion': ['gini', 'entropy'],  # Splitting criteria
    'max_depth': [None, 10, 20, 30],              # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],              # Minimum samples required to split a node
}

In [34]:
decision_tree = DecisionTreeClassifier()

# Set up the grid search
grid_search = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring='accuracy',       # Use accuracy as the scoring metric
    cv=5,                     # 5-fold cross-validation
    verbose=1,                # Output progress messages
    n_jobs=-1                 # Use all available cores
)



###### 5.1.1 Java

In [35]:
grid_search.fit(java_X, java_y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [36]:
grid_search.best_score_

0.9291254451919272

In [37]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 5}

###### 5.1.2 python

In [38]:
grid_search.fit(python_X, python_y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [39]:
grid_search.best_score_

0.6307958477508651

In [40]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}

###### 5.1.3 pharo

In [41]:
grid_search.fit(pharo_X, pharo_y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [42]:
grid_search.best_score_

0.8492524186455584

In [43]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 5}

#### 5.3 Random Forest

In [44]:
param_grid = {
    'n_estimators': [50, 100, 200],               # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],              # Maximum depth of the tree
    'criterion': ['gini', 'entropy'],             # Function to measure the quality of a split
}


In [45]:
rf = RandomForestClassifier()

# Set up the grid search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',       # Use accuracy as the scoring metric
    cv=5,                     # 5-fold cross-validation
    verbose=1,                # Output progress messages
    n_jobs=-1                 # Use all available cores
)



###### 5.1.1 Java

In [46]:
grid_search.fit(java_X, java_y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [47]:
grid_search.best_score_

0.9662841313810844

In [48]:
grid_search.best_params_

{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 200}

###### 5.1.2 python

In [49]:
grid_search.fit(python_X, python_y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [50]:
grid_search.best_score_

0.7467128027681661

In [51]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': None, 'n_estimators': 50}

###### 5.1.3 pharo

In [52]:
grid_search.fit(pharo_X, pharo_y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [53]:
grid_search.best_score_

0.9238405623285744

In [54]:
grid_search.best_params_

{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 200}

#### 6. Model Testing

In [55]:
### Recreating random forest with the best results

In [56]:
rf = RandomForestClassifier(criterion='entropy', max_depth=None, n_estimators=200)

In [57]:
### Reading and processing test data with teh same methodology as train dataset 
test_java = pd.read_parquet("data/java_test-00000-of-00001.parquet")
test_python = pd.read_parquet("data/python_test-00000-of-00001.parquet")
test_pharo = pd.read_parquet("data/pharo_test-00000-of-00001.parquet")

test_java['clean_text'] = test_java['combo'].apply(clean_text)
test_python['clean_text'] = test_python['combo'].apply(clean_text)
test_pharo['clean_text'] = test_pharo['combo'].apply(clean_text)

In [74]:
test_java['labels'] = test_java['labels'].apply(lambda x: x.tolist().index(1))
test_python['labels'] = test_python['labels'].apply(lambda x: x.tolist().index(1))
test_pharo['labels'] = test_pharo['labels'].apply(lambda x: x.tolist().index(1))

In [75]:
test_java_X = java_tf.transform(test_java['clean_text'])
test_java_y = test_java['labels']

In [76]:
test_python_X = python_tf.transform(test_python['clean_text'])
test_python_y = test_python['labels']

In [77]:
test_pharo_X = pharo_tf.transform(test_pharo['clean_text'])
test_pharo_y = test_pharo['labels']

In [83]:
from sklearn.metrics import accuracy_score

In [85]:
### Getting test accuracy for java dataset


java_rf = rf.fit(java_X, java_y)

java_accuracy = accuracy_score(java_rf.predict(test_java_X), test_java_y)

In [86]:
java_accuracy

0.7640579710144928

In [87]:
### Getting test accuracy for python dataset


python_rf = rf.fit(python_X, python_y)

python_accuracy = accuracy_score(python_rf.predict(test_python_X), test_python_y)

In [88]:
python_accuracy

0.5443349753694581

In [89]:
### Getting test accuracy for pharo dataset


pharo_rf = rf.fit(pharo_X, pharo_y)

pharo_accuracy = accuracy_score(pharo_rf.predict(test_pharo_X), test_pharo_y)

In [90]:
pharo_accuracy

0.671280276816609

#### Visualizing final test result

In [91]:
import plotly.express as px

In [100]:
fig = px.bar(x=["Python", "Java", "Pharo"], text=[str(i)+"%" for i in (np.array([python_accuracy, java_accuracy, pharo_accuracy])*100).round(2)], color=["Python", "Java", "Pharo"], y=[python_accuracy, java_accuracy, pharo_accuracy], template='plotly_white')
fig.update_layout(yaxis=dict(tickformat='%', title='Accuracy'), xaxis=dict(title='Language'), title="Test Dataset Performance")