## Can we develop a recommendation model to predict programming language preferences based on individual developer choices and attributes?

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# Turn on the pandas `mode.copy_on_write` option before any frame is created.
pd.set_option("mode.copy_on_write", True)

# Read the survey CSV from the working directory and print a summary
# (row count, column count, dtypes, memory usage).
survey_2024_data = pd.read_csv('survey_results_public_2024.csv')
survey_2024_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65437 entries, 0 to 65436
Columns: 114 entries, ResponseId to JobSat
dtypes: float64(13), int64(1), object(100)
memory usage: 56.9+ MB


In [2]:
# Keep only respondents who are developers by profession and whose employment
# status is one of the three "currently working" answers below.
# NOTE(review): isin() compares the whole cell value. If 'Employment' can hold
# several ';'-separated answers (as the columns split later in this notebook
# do), respondents with more than one status are excluded here - confirm that
# this is intended.
employed_statuses = [
    "Employed, full-time",
    "Independent contractor, freelancer, or self-employed",
    "Employed, part-time",
]
is_professional_dev = survey_2024_data['MainBranch'] == "I am a developer by profession"
is_employed = survey_2024_data['Employment'].isin(employed_statuses)
filtered_data = survey_2024_data.loc[is_professional_dev & is_employed]

# Columns carried forward: the two filter columns, five respondent attributes,
# and the languages each respondent has worked with.
selected_columns = [
    'MainBranch', 'Employment',
    'EdLevel', 'DevType', 'OrgSize', 'Country',
    'OpSysProfessional use',
    'LanguageHaveWorkedWith',
]
refined_data = filtered_data[selected_columns]

# Show the column list and non-null counts of the reduced frame
refined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39543 entries, 0 to 65435
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   MainBranch              39543 non-null  object
 1   Employment              39543 non-null  object
 2   EdLevel                 37030 non-null  object
 3   DevType                 36490 non-null  object
 4   OrgSize                 36092 non-null  object
 5   Country                 35764 non-null  object
 6   OpSysProfessional use   33212 non-null  object
 7   LanguageHaveWorkedWith  36408 non-null  object
dtypes: object(8)
memory usage: 2.7+ MB


In [3]:
# Handle missing values.
#
# Written as a single non-mutating chain rather than three `inplace=True`
# calls so that the cell can be re-run safely: `errors='ignore'` lets the
# column drop succeed even when the columns were already removed by a
# previous run of this cell.
refined_data = (
    refined_data
    # Rows without any language answer are removed.
    .dropna(subset=['LanguageHaveWorkedWith'])
    # These two columns were used to filter respondents earlier and are not
    # kept for the following steps.
    .drop(columns=['MainBranch', 'Employment'], errors='ignore')
    # Every remaining missing value becomes the explicit category 'Unknown'.
    .fillna('Unknown')
)

refined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36408 entries, 1 to 65435
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EdLevel                 36408 non-null  object
 1   DevType                 36408 non-null  object
 2   OrgSize                 36408 non-null  object
 3   Country                 36408 non-null  object
 4   OpSysProfessional use   36408 non-null  object
 5   LanguageHaveWorkedWith  36408 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [4]:
# Both columns hold several answers joined by ';'. Turn each cell into a list
# of answers, then give every answer its own row.
# Exploding the two columns one after the other produces one row per
# (language, operating system) combination for each original row; the original
# index value is repeated on every row that came from the same source row.
multi_answer_columns = ['LanguageHaveWorkedWith', 'OpSysProfessional use']
answer_lists = {
    name: refined_data[name].str.split(';') for name in multi_answer_columns
}
refined_data = (
    refined_data
    .assign(**answer_lists)
    .explode('LanguageHaveWorkedWith')
    .explode('OpSysProfessional use')
)
display(refined_data.head(10))

Unnamed: 0,EdLevel,DevType,OrgSize,Country,OpSysProfessional use,LanguageHaveWorkedWith
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Developer, full-stack",Unknown,United Kingdom of Great Britain and Northern I...,MacOS,Bash/Shell (all shells)
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Developer, full-stack",Unknown,United Kingdom of Great Britain and Northern I...,MacOS,Go
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Developer, full-stack",Unknown,United Kingdom of Great Britain and Northern I...,MacOS,HTML/CSS
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Developer, full-stack",Unknown,United Kingdom of Great Britain and Northern I...,MacOS,Java
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Developer, full-stack",Unknown,United Kingdom of Great Britain and Northern I...,MacOS,JavaScript
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Developer, full-stack",Unknown,United Kingdom of Great Britain and Northern I...,MacOS,Python
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Developer, full-stack",Unknown,United Kingdom of Great Britain and Northern I...,MacOS,TypeScript
2,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Developer Experience,Unknown,United Kingdom of Great Britain and Northern I...,Windows,C#
9,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Developer, full-stack",Unknown,Serbia,Windows,HTML/CSS
9,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Developer, full-stack",Unknown,Serbia,Windows,JavaScript


In [5]:
# Count how often each answer occurs in the two exploded columns and keep the
# labels of the five most frequent answers of each.
language_counts = refined_data['LanguageHaveWorkedWith'].value_counts()
os_counts = refined_data['OpSysProfessional use'].value_counts()
top_languages = language_counts.nlargest(5).index
top_os = os_counts.nlargest(5).index

# Keep a row only when BOTH its language and its operating system are among
# the respective top five.
has_top_language = refined_data['LanguageHaveWorkedWith'].isin(top_languages)
has_top_os = refined_data['OpSysProfessional use'].isin(top_os)
filtered_data = refined_data[has_top_language & has_top_os]

# Rows remaining per language after the filter
filtered_data['LanguageHaveWorkedWith'].value_counts()


LanguageHaveWorkedWith
JavaScript    31502
SQL           27253
HTML/CSS      25539
Python        23683
TypeScript    21216
Name: count, dtype: int64

In [8]:
# Encode every column as integer codes and keep one fitted encoder per column
# so the codes can be mapped back to the original text with
# `label_encoders[column].inverse_transform(...)`.
#
# The codes are written to a separate frame (`encoded_data`) instead of
# overwriting `filtered_data`. Overwriting made this cell non-idempotent: on a
# second run the already-encoded integers were cast to strings and encoded
# again, which reorders the codes ('10' sorts before '2') and replaces the
# original class names stored in the encoders with digit strings.
label_encoders = {}
encoded_data = filtered_data.copy()

for column in filtered_data.columns:
    # A fresh encoder per column; values are cast to str before fitting
    le = LabelEncoder()
    encoded_data[column] = le.fit_transform(filtered_data[column].astype(str))
    # Save the encoder for potential inverse transformation
    label_encoders[column] = le

# Splitting the data into features (X) and target (y)
X = encoded_data.drop(columns=['LanguageHaveWorkedWith'])
y = encoded_data['LanguageHaveWorkedWith']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Build and train the Random Forest model (30 trees, depth capped at 7,
# fixed seed so repeated runs give the same forest).
model = RandomForestClassifier(n_estimators=30, max_depth=7, random_state=42)
model.fit(X_train, y_train)

# Predict the encoded language for every held-out row
y_pred = model.predict(X_test)

# Recover the language names from the fitted target encoder so the report is
# labelled with them instead of the bare integer codes 0..4. Passing `labels`
# explicitly keeps the rows aligned with `target_names` even if a class were
# missing from both y_test and y_pred.
language_names = [str(name) for name in label_encoders['LanguageHaveWorkedWith'].classes_]

# Evaluating the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_metrics = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(language_names))),
    target_names=language_names,
    zero_division=0,  # report 0 instead of warning when a class is never predicted
)

print("Accuracy:", accuracy)
print("Classification Metrics:\n", classification_metrics)

Accuracy: 0.2738496071829405
Classification Metrics:
               precision    recall  f1-score   support

           0       0.03      0.00      0.00      5062
           1       0.26      0.70      0.38      6297
           2       0.33      0.34      0.34      4775
           3       0.26      0.20      0.22      5413
           4       0.00      0.00      0.00      4292

    accuracy                           0.27     25839
   macro avg       0.18      0.25      0.19     25839
weighted avg       0.19      0.27      0.20     25839

