In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the raw job survey data
df = pd.read_csv('job-data-LinerR.csv')

# One-hot encode the categorical survey answers so the linear model receives numeric inputs
df_encoded = pd.get_dummies(df, columns=["required_experience", "Overall_Satisfication", "Working_model"])

# Separate features (X) and target variable (y).
# "Salary" is the target and "joining_date" is left out of the model entirely.
X = df_encoded.drop(columns=["joining_date", "Salary"])
# pandas loads the CSV's unnamed first column as "Unnamed: 0". In this file it
# appears to be just a row counter (0, 1, 2, ...), so it carries no real signal
# and must not be used as a predictor. errors="ignore" keeps the cell working
# if the file is ever re-exported without that column.
X = X.drop(columns=["Unnamed: 0"], errors="ignore")
y = df_encoded["Salary"]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the multivariable regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict salaries for the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics (the MSE used to be computed but never shown).
# Note: R-squared is the coefficient of determination of a regression, not a
# classification accuracy, despite the wording of the label below.
print("Mean squared error", mse)
print("R-squared or accuracy", r2)


R-squared or accuracy 0.5387180182620748


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report

# Read the raw job survey CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='job-data-LinerR.csv')

# Salary bands as (lower, upper) bounds, listed in ascending order.
salary_ranges = [(0, 30000), (30001, 50000), (50001, 70000), (70001, 90000), (90001, float('inf'))]
# Band labels; categories[i] is the label for salary_ranges[i].
categories = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']


def categorize_salary(salary):
    """Return the salary band label for ``salary``.

    The bands in ``salary_ranges`` are written as integer-inclusive pairs
    (..., 30000), (30001, ...), which leaves a gap between consecutive bands
    for fractional salaries such as 30000.5. To close those gaps a salary is
    assigned to the first band whose upper bound it does not exceed, so every
    value at or above the lowest bound gets a label. Whole-number salaries
    are classified exactly as the explicit (lower, upper) pairs describe.

    Parameters
    ----------
    salary : int or float
        The salary to classify.

    Returns
    -------
    str or None
        The matching entry of ``categories``, or ``None`` when ``salary`` is
        NaN or lies below the lowest band's lower bound.
    """
    lowest_bound = salary_ranges[0][0]
    # NaN is the only value that compares unequal to itself; it belongs to no band.
    if salary != salary or salary < lowest_bound:
        return None
    for (_, upper), label in zip(salary_ranges, categories):
        if salary <= upper:
            return label
    # Only reachable if the last band is ever given a finite upper bound.
    return None

# Label every row with the band that its 'Salary' value falls into
df['Salary_Category'] = df['Salary'].map(categorize_salary)

# Preview the first five rows (head() defaults to n=5)
df.head()


Unnamed: 0,joining_date,required_experience,Overall_Satisfication,Working_model,Salary,Salary_Category
0,12/10/2023,Fresh graduate,Satisfied,On Site,19000,Low
1,03/01/2022,More than 2 years,Satisfied,Remote,50000,Medium-Low
2,02/06/2022,More than 2 years,Satisfied,On Site,115000,High
3,01/03/2021,More than 2 years,Very Satisfied,On Site,100000,High
4,03/08/2021,More than 2 years,Satisfied,On Site,47000,Medium-Low


In [3]:

# Separate features (X) and target variable (y).
# "Salary" must stay out of X: "Salary_Category" is derived directly from it,
# so keeping it would leak the answer into the features.
X = df.drop(columns=["joining_date", "Salary", "Salary_Category"])
# pandas loads the CSV's unnamed first column as "Unnamed: 0". In this file it
# appears to be just a row counter, and remainder='passthrough' below would
# otherwise hand it to the classifier as a numeric feature. errors="ignore"
# keeps the cell working if the file is re-exported without that column.
X = X.drop(columns=["Unnamed: 0"], errors="ignore")
y = df['Salary_Category']

# One-hot encode the categorical columns; any other remaining column passes through unchanged
categorical_cols = ["required_experience", "Overall_Satisfication", "Working_model"]
preprocessor = ColumnTransformer(transformers=[('cat', OneHotEncoder(), categorical_cols)], remainder='passthrough')
X_encoded = preprocessor.fit_transform(X)

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Build and train the logistic regression model (max_iter raised to 1000 iterations)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the salary band of the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5
Classification Report:
              precision    recall  f1-score   support

        High       0.60      0.50      0.55        12
         Low       1.00      0.62      0.76        13
      Medium       0.00      0.00      0.00         8
 Medium-High       0.17      0.25      0.20         4
  Medium-Low       0.43      0.77      0.56        13

    accuracy                           0.50        50
   macro avg       0.44      0.43      0.41        50
weighted avg       0.53      0.50      0.49        50

