In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('job-data.csv')

In [3]:
df.head(2)

Unnamed: 0,joining_date,job_title,required_experience,Overall_Satisfication,Working_model,Considering_Job_Switching_in_Future,Main_Reasons,Salary
0,12/10/2023,Junior Software Developer (Backend),Fresh graduate,Satisfied,On Site,No,I am happy with my work role,19000
1,03/01/2022,Data Scientist,More than 2 years,Satisfied,Remote,Yes,Better salary/benefits,50000


In [4]:
# Define salary ranges for each category
salary_ranges = [(0, 30000), (30001, 50000), (50001, 70000), (70001, 90000), (90001, float('inf'))]
categories = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']

# Create a new categorical column based on the salary ranges
def categorize_salary(salary):
    for i, (lower, upper) in enumerate(salary_ranges):
        if lower <= salary <= upper:
            return categories[i]

df['Salary_Category'] = df['Salary'].apply(categorize_salary)

df.tail(3)

Unnamed: 0,joining_date,job_title,required_experience,Overall_Satisfication,Working_model,Considering_Job_Switching_in_Future,Main_Reasons,Salary,Salary_Category
246,02/01/2022,Software Developer (Frontend),More than 3 years,Satisfied,On Site,Yes,Better salary/benefits,26000,Low
247,01/02/2023,Full Stack Developer,Less than 2 years,Satisfied,Remote,Yes,Better salary/benefits,75000,Medium-High
248,01/02/2022,Full Stack Developer,More than 5 years,Satisfied,Remote,Yes,Better salary/benefits,110000,High


In [5]:
df.drop(columns=['Salary'], inplace=True)
df.head(4)

Unnamed: 0,joining_date,job_title,required_experience,Overall_Satisfication,Working_model,Considering_Job_Switching_in_Future,Main_Reasons,Salary_Category
0,12/10/2023,Junior Software Developer (Backend),Fresh graduate,Satisfied,On Site,No,I am happy with my work role,Low
1,03/01/2022,Data Scientist,More than 2 years,Satisfied,Remote,Yes,Better salary/benefits,Medium-Low
2,02/06/2022,Data Scientist,More than 2 years,Satisfied,On Site,Yes,"Career Development, Salary andBenefits",High
3,01/03/2021,Senior Software Engineer (Backend),More than 2 years,Very Satisfied,On Site,No,I am happy with my work role,High


In [6]:
X = df.drop('Salary_Category', axis=1)
X

Unnamed: 0,joining_date,job_title,required_experience,Overall_Satisfication,Working_model,Considering_Job_Switching_in_Future,Main_Reasons
0,12/10/2023,Junior Software Developer (Backend),Fresh graduate,Satisfied,On Site,No,I am happy with my work role
1,03/01/2022,Data Scientist,More than 2 years,Satisfied,Remote,Yes,Better salary/benefits
2,02/06/2022,Data Scientist,More than 2 years,Satisfied,On Site,Yes,"Career Development, Salary andBenefits"
3,01/03/2021,Senior Software Engineer (Backend),More than 2 years,Very Satisfied,On Site,No,I am happy with my work role
4,03/08/2021,Software Developer (Backend),More than 2 years,Satisfied,On Site,Yes,Seeking better work-life balance
...,...,...,...,...,...,...,...
244,01/02/2024,Software Quality Assurance Engineer,Fresh graduate,Neutral,On Site,Yes,"Career Development, Salary and Benefits"
245,01/02/2023,Software Developer (Backend),More than 3 years,Satisfied,On Site,No,I am happy with my work role
246,02/01/2022,Software Developer (Frontend),More than 3 years,Satisfied,On Site,Yes,Better salary/benefits
247,01/02/2023,Full Stack Developer,Less than 2 years,Satisfied,Remote,Yes,Better salary/benefits


In [7]:
y = df.Salary_Category

In [8]:
y

0              Low
1       Medium-Low
2             High
3             High
4       Medium-Low
          ...     
244            Low
245    Medium-High
246            Low
247    Medium-High
248           High
Name: Salary_Category, Length: 249, dtype: object

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
le_x = LabelEncoder()
X = X.apply(le_x.fit_transform)

In [11]:
X

Unnamed: 0,joining_date,job_title,required_experience,Overall_Satisfication,Working_model,Considering_Job_Switching_in_Future,Main_Reasons
0,88,14,0,2,1,1,4
1,25,3,2,2,2,2,0
2,21,3,2,2,1,2,2
3,9,20,2,3,1,1,4
4,28,23,2,2,1,2,6
...,...,...,...,...,...,...,...
244,8,28,0,1,1,2,1
245,7,23,3,2,1,1,4
246,15,24,3,2,1,2,0
247,7,9,1,2,2,2,0


In [12]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state= 2)

In [13]:
from sklearn.tree import DecisionTreeClassifier
dect = DecisionTreeClassifier()
dect.fit(xtrain, ytrain)

In [14]:
dect.score(xtest, ytest)

0.4

<h2 style="color:green">Performance Using Corss_val_score<h2>

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
result = cross_val_score(DecisionTreeClassifier(), X, y, cv=5)
result

array([0.44      , 0.4       , 0.44      , 0.4       , 0.46938776])

In [17]:
result.mean()

0.42987755102040814

In [None]:
from sklearn.linear_model import LogisticRegression
result = cross_val_score(LogisticRegression(), X, y, cv=5)
result

In [19]:
result.mean()

0.43763265306122445

In [20]:
from sklearn.neighbors import KNeighborsClassifier
result = cross_val_score(KNeighborsClassifier(n_neighbors=3), X, y, cv=5)

In [21]:
result

array([0.18     , 0.22     , 0.36     , 0.44     , 0.3877551])

In [22]:
result.mean()

0.31755102040816324

In [23]:
y_predict = dect.predict(xtest)

In [24]:
y_predict

array(['Medium', 'Medium', 'Medium-Low', 'High', 'High', 'Medium',
       'Medium-High', 'Medium-Low', 'Medium', 'Medium-Low', 'Medium-Low',
       'Low', 'Medium-Low', 'Medium', 'Medium', 'Medium-High',
       'Medium-Low', 'High', 'High', 'Medium-Low', 'Low', 'High',
       'Medium-Low', 'Medium-Low', 'High', 'High', 'Low', 'Medium-High',
       'Medium-High', 'Medium', 'Low', 'High', 'Medium', 'Medium-High',
       'Medium-Low', 'Low', 'Medium-Low', 'Low', 'Low', 'Medium-High',
       'Medium-Low', 'Medium-Low', 'Medium-High', 'Medium-Low',
       'Medium-High', 'Medium-High', 'Medium', 'Low', 'Medium-High',
       'Medium-Low'], dtype=object)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
print(classification_report(ytest, y_predict))

              precision    recall  f1-score   support

        High       0.88      0.37      0.52        19
         Low       0.62      0.50      0.56        10
      Medium       0.22      0.29      0.25         7
 Medium-High       0.10      0.33      0.15         3
  Medium-Low       0.33      0.45      0.38        11

    accuracy                           0.40        50
   macro avg       0.43      0.39      0.37        50
weighted avg       0.57      0.40      0.44        50

