In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.model_selection as model_selection
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Location of the survey export inside the Kaggle input directory.
url = "/kaggle/input/stack-overflow-annual-developer-survey-2024/survey_results_public.csv"
# Extra strings to treat as missing, in addition to pandas' built-in NA markers.
missing_values = ["n.a.", "?", "NA", "n/a", "na", "--"]
# Only these columns are loaded from the CSV.
column_names = ['YearsCodePro','EdLevel','DevType','Country','Employment','LanguageHaveWorkedWith','OrgSize','Age','RemoteWork','LearnCodeOnline','YearsCode','ConvertedCompYearly']
# Pass the named list instead of repeating the literal, so there is one place to edit.
survey = pd.read_csv(url, usecols=column_names, na_values=missing_values)
pd.set_option('display.max_columns', None)

# Missing-value count for each loaded column.
survey.isnull().sum()

EDA and Data Cleaning - here we explore the data and handle any missing values, filling numeric columns with the mean and categorical columns with the mode.

In [None]:
# Work on a copy so the raw `survey` frame stays intact.
stack_survey_cleaned = survey.copy()
number_columns = ['YearsCodePro', 'YearsCode']
categorical_columns = ['EdLevel', 'DevType', 'Country', 'LanguageHaveWorkedWith', 'OrgSize', 'RemoteWork', 'LearnCodeOnline']

# NOTE(review): the experience questions are believed to use these two text
# answers for the ends of the range. Without this mapping they are coerced to
# NaN and then replaced by the column mean. The numeric stand-ins are a
# judgement call - confirm against the survey schema.
experience_text_to_years = {'Less than 1 year': 0.5, 'More than 50 years': 51}

for column in number_columns:
    raw_values = stack_survey_cleaned[column]
    numeric_values = pd.to_numeric(raw_values, errors='coerce')
    for label, years in experience_text_to_years.items():
        numeric_values = numeric_values.mask(raw_values == label, years)
    # Assign the result back instead of calling fillna(inplace=True) on a column
    # selection: the chained in-place form is deprecated and does not update the
    # frame under pandas Copy-on-Write.
    stack_survey_cleaned[column] = numeric_values.fillna(numeric_values.mean())

for column in categorical_columns:
    most_common = stack_survey_cleaned[column].mode()[0]
    stack_survey_cleaned[column] = stack_survey_cleaned[column].fillna(most_common)

# Rows without the compensation value cannot be labelled, so they are removed
# rather than imputed.
stack_survey_cleaned = stack_survey_cleaned.dropna(subset=['ConvertedCompYearly'])


print(stack_survey_cleaned.isnull().sum())

We can now see that the cleaned data set has 0 missing values.

In [None]:
# Summary statistics of the cleaned frame.
stack_survey_cleaned.describe()

In [None]:
# Column dtypes after cleaning, printed as plain text.
print(stack_survey_cleaned.dtypes)

In [None]:
# 2x2 overview of the cleaned data: age, coding experience, income and education.
fig, axes = plt.subplots(2, 2, figsize=(30, 10))
(age_ax, years_ax), (income_ax, education_ax) = axes

# Top-left: age histogram with a KDE overlay.
sns.histplot(stack_survey_cleaned['Age'], bins=30, kde=True, ax=age_ax, color='blue')
age_ax.set_title('Age Distribution')

# Top-right: total years of coding.
sns.histplot(stack_survey_cleaned['YearsCode'], bins=30, kde=True, ax=years_ax, color='green')
years_ax.set_title('Years of Coding Distribution')

# Bottom-left: yearly compensation as a violin plot.
sns.violinplot(y=stack_survey_cleaned['ConvertedCompYearly'], ax=income_ax, color='purple')
income_ax.set_title('Income Distribution (ConvertedCompYearly)')

# Bottom-right: education levels, most frequent first.
education_order = stack_survey_cleaned['EdLevel'].value_counts().index
sns.countplot(y=stack_survey_cleaned['EdLevel'], order=education_order, ax=education_ax, palette='coolwarm')
education_ax.set_title('Education Level Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Distribution of professional coding experience.
# Only one chart is drawn here, so a single axes is created rather than a 2x2
# grid that would leave three empty panels in the figure.
fig, ax = plt.subplots(figsize=(7, 5))

sns.histplot(stack_survey_cleaned['YearsCodePro'], bins=30, kde=True, ax=ax, color='blue')
ax.set_title('Years Pro Coding Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Number of respondents per country, most frequent first.
countries = stack_survey_cleaned['Country'].value_counts()
countries

In [None]:
# Column dtypes followed by summary statistics, both printed as plain text.
print(stack_survey_cleaned.dtypes)
print(stack_survey_cleaned.describe())

In [None]:
# Distinct country names present in the cleaned data.
stack_survey_cleaned['Country'].unique()

In [None]:
def select_countries(x):
    """Collapse a survey country name into one of six reporting labels.

    The United States and the United Kingdom are shortened to "USA" and "UK";
    "India", "Germany" and "Ukraine" are returned unchanged; every other value
    becomes "Others".
    """
    kept_countries = (
        ("United States of America", "USA"),
        ("United Kingdom of Great Britain and Northern Ireland", "UK"),
        ("India", "India"),
        ("Germany", "Germany"),
        ("Ukraine", "Ukraine"),
    )
    for full_name, label in kept_countries:
        if x == full_name:
            return label
    return "Others"
    
# Derive the grouped country label from the string form of 'Country'.
country_as_text = stack_survey_cleaned['Country'].astype(str)
stack_survey_cleaned['themostCountries'] = country_as_text.apply(select_countries)
stack_survey_cleaned['themostCountries'].head()

In [None]:
# Share of respondents per grouped country label.
country_shares = stack_survey_cleaned['themostCountries'].value_counts()
country_shares.plot.pie(title='Country', autopct='%2.0f%%')

In [None]:
# Median yearly compensation; the income-class cell uses it as the high/low threshold.
median_income = stack_survey_cleaned['ConvertedCompYearly'].median()
print(median_income)

In [None]:
# Rows strictly above the median are 'High Earner'; all others are 'Low Earner'.
above_median = stack_survey_cleaned['ConvertedCompYearly'].gt(median_income)
stack_survey_cleaned['IncomeClass'] = above_median.map({True: 'High Earner', False: 'Low Earner'})
income_counts = stack_survey_cleaned['IncomeClass'].value_counts()
income_counts

In [None]:
# Proportion of high versus low earners.
income_shares = stack_survey_cleaned['IncomeClass'].value_counts()
income_shares.plot.pie(title='High income vs Low income', autopct='%2.0f%%')

In [None]:
# Proportion of each remote-work arrangement.
remote_shares = stack_survey_cleaned['RemoteWork'].value_counts()
remote_shares.plot.pie(title='Work Types', autopct='%2.0f%%')

In [None]:
# Proportion of each organisation-size answer.
orgsize_shares = stack_survey_cleaned['OrgSize'].value_counts()
orgsize_shares.plot.pie(title='Organisation Size', autopct='%2.0f%%')

In [None]:
# EDA: median yearly compensation for each grouped country label.
group = stack_survey_cleaned.groupby('themostCountries')
median_comp = group['ConvertedCompYearly'].median()
median_comp.plot(kind='bar');

In [None]:
# EDA: median yearly compensation for each age answer.
group = stack_survey_cleaned.groupby('Age')
median_comp = group['ConvertedCompYearly'].median()
median_comp.plot(kind='bar');

In [None]:
# EDA: median yearly compensation for each value of total coding years.
group = stack_survey_cleaned.groupby('YearsCode')
median_comp = group['ConvertedCompYearly'].median()
median_comp.plot(kind='bar');

In [None]:
# EDA: median yearly compensation for each value of professional coding years.
group = stack_survey_cleaned.groupby('YearsCodePro')
median_comp = group['ConvertedCompYearly'].median()
median_comp.plot(kind='bar');

In [None]:
# EDA: median yearly compensation for each organisation-size answer.
group = stack_survey_cleaned.groupby('OrgSize')
median_comp = group['ConvertedCompYearly'].median()
median_comp.plot(kind='bar');

In [None]:
# EDA: median yearly compensation for each developer type.
group = stack_survey_cleaned.groupby('DevType')
median_comp = group['ConvertedCompYearly'].median()
median_comp.plot(kind='bar');

In [None]:
# Columns explored above that are not carried into the modelling steps.
columnsToDrop = ['YearsCode', 'RemoteWork', 'LearnCodeOnline']
stack_survey_cleaned = stack_survey_cleaned.drop(columnsToDrop, axis=1)

In the cell above we drop the features that will not be used moving forward.

In [None]:
# First rows of the frame after the unused columns were removed.
stack_survey_cleaned.head()

In [None]:
# Summary statistics of the remaining columns.
stack_survey_cleaned.describe()

In [None]:
# Dtypes of the remaining columns, printed as plain text.
print(stack_survey_cleaned.dtypes)

Cluster Analysis

In [None]:
# Standardise the two numeric features into new '*_s' columns; the unscaled
# originals are kept alongside them.
scaler = StandardScaler()
stack_survey_cleaned[['YearsCodePro_s', 'ConvertedCompYearly_s']] = scaler.fit_transform(stack_survey_cleaned[['YearsCodePro', 'ConvertedCompYearly']])
# Preview a few rows instead of echoing the whole frame into the cell output.
stack_survey_cleaned.head()

Now we display the unique values of each categorical column to decide which ones are suitable for one-hot encoding.

In [None]:
# Distinct values of 'Age'.
stack_survey_cleaned['Age'].unique()

In [None]:
# Distinct values of 'Employment'.
stack_survey_cleaned['Employment'].unique()

In [None]:
# Distinct values of 'EdLevel'.
stack_survey_cleaned['EdLevel'].unique()

In [None]:
# Distinct values of 'DevType'.
stack_survey_cleaned['DevType'].unique()

In [None]:
# Distinct values of 'OrgSize'.
stack_survey_cleaned['OrgSize'].unique()

In [None]:
# Distinct values of 'Country'.
stack_survey_cleaned['Country'].unique()

In [None]:
# Distinct values of 'LanguageHaveWorkedWith'.
stack_survey_cleaned['LanguageHaveWorkedWith'].unique()

In [None]:

# Replace the education-level labels with numeric codes, in place.
# NOTE(review): the encoder is built with default settings, which presumably
# number the labels in sorted order rather than by level of education -
# confirm the codes mean what later steps assume.
encoder = OrdinalEncoder()
edlevel_codes = encoder.fit_transform(stack_survey_cleaned[['EdLevel']])
stack_survey_cleaned["EdLevel"] = edlevel_codes

stack_survey_cleaned.head()

In [None]:
# Replace the organisation-size labels with numeric codes, in place.
# The codes follow the sorted order of the label strings, not company size
# (e.g. '1,000 to 4,999 employees' is 0 and '10 to 19 employees' is 1), so they
# should be read as identifiers rather than as a size ranking.
encoder = OrdinalEncoder()
orgsize_codes = encoder.fit_transform(stack_survey_cleaned[['OrgSize']])
stack_survey_cleaned["OrgSize"] = orgsize_codes

stack_survey_cleaned.head()

We can now drop Country, since the grouped themostCountries column is used in its place; the remaining categorical columns still need to be encoded.

In [None]:
# 'Country' is superseded by the grouped 'themostCountries' column.
countryToDrop = ['Country']
stack_survey_cleaned = stack_survey_cleaned.drop(columns = countryToDrop)
# The preview goes last: only a cell's final expression is displayed, and it
# should show the frame after the drop.
stack_survey_cleaned.head()

In [None]:
# One-hot encode the remaining text columns.
# Multi-select answers are split into one indicator per option. Passing them to
# pd.get_dummies as-is would create a separate column for every distinct
# combination of answers.
# NOTE(review): assumes multi-select answers are ';'-separated - confirm against
# the survey schema. If they are not, the result matches plain one-hot encoding.
multi_select_columns = [
    column
    for column in ('Employment', 'LanguageHaveWorkedWith')
    if column in stack_survey_cleaned.columns
]
multi_hot_frames = [
    stack_survey_cleaned[column].str.get_dummies(sep=';').add_prefix(f'{column}_')
    for column in multi_select_columns
]
single_value_frame = pd.get_dummies(stack_survey_cleaned.drop(columns=multi_select_columns))
stack_survey_cleaned = pd.concat([single_value_frame, *multi_hot_frames], axis=1)

In [None]:
# Summary statistics of the encoded frame.
stack_survey_cleaned.describe()

In [None]:
def elbow_plot(data, max_k, random_state=91):
    """Plot K-Means inertia against the number of clusters (elbow method).

    Parameters
    ----------
    data : array-like
        Feature matrix passed unchanged to ``KMeans.fit``.
    max_k : int
        Exclusive upper bound: one model is fitted for each k in
        ``1 .. max_k - 1``.
    random_state : int, default 91
        Seed passed to every ``KMeans`` so that repeated runs draw the same
        curve.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    cluster_counts = list(range(1, max_k))
    inertias = [
        KMeans(n_clusters=k, random_state=random_state).fit(data).inertia_
        for k in cluster_counts
    ]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(cluster_counts, inertias, 'o-')
    ax.set_xlabel('number of clusters')
    ax.set_ylabel('inertia')
    ax.grid(True)
    plt.show()


# Run the elbow analysis on the same features the final K-Means model is fitted
# on. The unscaled 'YearsCodePro' and 'ConvertedCompYearly' columns are removed
# before that fit, and left in here their large raw values would dominate the
# inertia. errors='ignore' lets the cell be re-run after those columns are gone.
elbow_features = stack_survey_cleaned.drop(
    columns=['YearsCodePro', 'ConvertedCompYearly'], errors='ignore'
)
elbow_plot(elbow_features, 10)

We use 4 clusters, as the slope of the elbow plot becomes more gradual after this point.

In [None]:
# Remove the unscaled numeric columns; their standardised '*_s' versions remain.
finalFeaturesToDrop = ['YearsCodePro', 'ConvertedCompYearly']
stack_survey_cleaned = stack_survey_cleaned.drop(finalFeaturesToDrop, axis='columns')

In [None]:
# Fit the final 4-cluster model and store each row's cluster id.
kmeans = KMeans(n_clusters = 4, random_state=91)
# Exclude a 'Cluster' column left by an earlier run of this cell, so re-running
# it does not feed the previous labels back into the fit.
cluster_features = stack_survey_cleaned.drop(columns=['Cluster'], errors='ignore')
y_cluster = kmeans.fit_predict(cluster_features)
stack_survey_cleaned['Cluster'] = y_cluster
stack_survey_cleaned.head()

In [None]:
# Cluster sizes split by the high-earner indicator.
# seaborn is already imported as `sns` in the notebook's import cell.
sns.countplot(data=stack_survey_cleaned, x="Cluster", hue="IncomeClass_High Earner")

In [None]:
# Cluster sizes split by the "Others" country indicator.
# The grouped country column is named 'themostCountries', so its one-hot
# indicator is 'themostCountries_Others'; 'mostCountry_Others' does not exist.
# seaborn is already imported as `sns` in the notebook's import cell.
sns.countplot(data=stack_survey_cleaned, x="Cluster", hue="themostCountries_Others")

In [None]:
# Cluster sizes split by the low-earner indicator.
# seaborn is already imported as `sns` in the notebook's import cell.
sns.countplot(data=stack_survey_cleaned, x = "Cluster", hue = "IncomeClass_Low Earner" )

In [None]:
# Cluster sizes split by the encoded organisation size.
# seaborn is already imported as `sns` in the notebook's import cell.
sns.countplot(data=stack_survey_cleaned, x="Cluster", hue="OrgSize")

OrgSize codes used in the plot above: {'1,000 to 4,999 employees': 0, '10 to 19 employees': 1, '10,000 or more employees': 2, '100 to 499 employees': 3, '2 to 9 employees': 4, '20 to 99 employees': 5, '5,000 to 9,999 employees': 6, '500 to 999 employees': 7, 'I don’t know': 8, 'Just me - I am a freelancer, sole proprietor, etc.': 9}

In [None]:
# Cluster sizes split by the encoded education level.
# seaborn is already imported as `sns` in the notebook's import cell.
sns.countplot(data=stack_survey_cleaned, x="Cluster", hue="EdLevel")

In [None]:
# Cluster sizes split by the 18-24 age indicator.
# seaborn is already imported as `sns` in the notebook's import cell.
sns.countplot(data=stack_survey_cleaned, x="Cluster", hue="Age_18-24 years old")

In [None]:
# Cluster sizes split by the 35-44 age indicator.
# seaborn is already imported as `sns` in the notebook's import cell.
sns.countplot(data=stack_survey_cleaned, x="Cluster", hue="Age_35-44 years old")

KNN Implementation

In [None]:
# Target: the high-earner indicator.
target_columns = ['IncomeClass_High Earner', 'IncomeClass_Low Earner']
# Columns that reveal the target and so must not be used as features:
# - 'ConvertedCompYearly_s' is the scaled income the classes were derived from;
# - 'Cluster' was assigned by a K-Means fit whose inputs included the income
#   class indicators and the scaled income.
leaky_columns = ['ConvertedCompYearly_s', 'Cluster']
X = stack_survey_cleaned.drop(columns=target_columns + leaky_columns)
y = stack_survey_cleaned['IncomeClass_High Earner']

# Stratified 80/20 split so both sets keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=91, stratify=y)

We build the feature matrix X from stack_survey_cleaned by removing the high-income and low-income indicator columns (along with the scaled income), and use the high-income indicator as the target variable y.

In [None]:
# k-nearest-neighbours classifier with 7 neighbours, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7)  
knn.fit(X_train, y_train)

In [None]:
# KNN predictions for the held-out test split.
y_pred = knn.predict(X_test)

In [None]:
# Evaluate the KNN predictions on the test split.
knn_accuracy = accuracy_score(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)
knn_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", knn_accuracy)
print("Classification Report:\n", knn_report)
print("Confusion Matrix:\n", knn_confusion)

Logistic Regression Implementation

In [None]:

# Fit a logistic regression on the training split, then predict the test split.
# fit() returns the estimator itself, so construction and fitting are chained.
logistic_reg = LogisticRegression(max_iter=1000, random_state=91).fit(X_train, y_train)
y_pred2 = logistic_reg.predict(X_test)

In [None]:
# Evaluate the logistic-regression predictions on the test split.
logistic_accuracy = accuracy_score(y_test, y_pred2)
logistic_report = classification_report(y_test, y_pred2)
logistic_confusion = confusion_matrix(y_test, y_pred2)

print("Accuracy:", logistic_accuracy)
print("Classification Report:\n", logistic_report)
print("Confusion Matrix:\n", logistic_confusion)

Decision Tree Implementation

In [None]:
# Decision tree with balanced class weights, fitted on the training split.
# NOTE(review): this cell used to build an entropy / max_depth=9 /
# ccp_alpha=0.004 tree first and overwrite it on the next line, so that
# configuration never took effect. Only the model that was actually trained is
# kept; reinstate those settings here if they were the intended ones.
DecTree = DecisionTreeClassifier(class_weight="balanced", random_state=91)
DecTree = DecTree.fit(X_train,y_train)

In [None]:
# Decision-tree predictions for the held-out test split.
y_pred3 = DecTree.predict(X_test)

In [None]:
# Evaluate the decision-tree predictions on the test split.
tree_accuracy = accuracy_score(y_test, y_pred3)
tree_report = classification_report(y_test, y_pred3)
tree_confusion = confusion_matrix(y_test, y_pred3)

print("Accuracy:", tree_accuracy)
print("Classification Report:\n", tree_report)
print("Confusion Matrix:\n", tree_confusion)

Ensemble Modelling - Random Forest

In [None]:
# Random forest of 150 trees; a node is split only if it holds at least 10 samples.
# RandomForestClassifier is imported in the notebook's import cell.
# NOTE(review): random_state is 10 here while the other models use 91 - confirm
# whether the difference is intentional.
R_forest = RandomForestClassifier(n_estimators=150, random_state=10, min_samples_split=10)

R_forest = R_forest.fit(X_train, y_train)
y_train_pred = R_forest.predict(X_train)
y_test_pred = R_forest.predict(X_test)
# Train and test accuracy are reported together so the gap between them is visible.
train_score = accuracy_score(y_train, y_train_pred)
test_score = accuracy_score(y_test, y_test_pred)

print(f'Train/Test Accuracies: {train_score:.3f} / {test_score:.3f}')