In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the data
df = pd.read_csv('job-data.csv')

# Exclude "joining_date" column
df_exclude_date = df.drop(columns=["joining_date"])

# Perform one-hot encoding for categorical variables
df_encoded = pd.get_dummies(df_exclude_date, columns=["job_title", "required_experience", "Industary_Sector", "Demanding_Of_the_Job", "Overall_Satisfication", "Working_model", "Considering_Job_Switching_in_Future", "Main_Reasons"])

# Separate features (X) and target variable (y)
X = df_encoded.drop(columns=["Salary"])
y = df_encoded["Salary"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# print(model.score(X_test, y_test))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)


Mean Squared Error: 1.691231985004573e+34
Mean Absolute Error: 2.8351644207944856e+16
R-squared: -9.520035293065258e+24


In [6]:
# Define tolerance level
tolerance = 0.05  # 5% tolerance

# Calculate absolute percentage error
abs_percentage_error = abs((y_pred - y_test) / y_test)

# Calculate accuracy within tolerance level
accuracy = (abs_percentage_error <= tolerance).mean() * 100

print("Accuracy within ±{}%: {:.2f}%".format(tolerance * 100, accuracy))


Accuracy within ±5.0%: 0.00%


In [12]:
model.score(X_test, y_test)

-9.520035293065258e+24