In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing  import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer

In [62]:
def is_line_valid(line):
    # A simple heuristic: check if quotes are balanced
    return line.count('"') % 2 == 0

input_file = "job_descriptions.csv"
output_file = "job_descriptions_cleaned.csv"
max_lines = 10000

written_lines = 0

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        if is_line_valid(line):
            outfile.write(line)
            written_lines += 1
        if written_lines >= max_lines:
            break

print(f"Cleaning done. {written_lines} valid lines written to {output_file}.")


Cleaning done. 10000 valid lines written to job_descriptions_cleaned.csv.


In [63]:
import pandas as pd
df = pd.read_csv("job_descriptions_cleaned.csv")
selected_columns = ['Job Id', 'Experience',"Qualifications",'Company','Preference', 'Salary Range', 'Country', 'Work Type', 'Job Title']
df = df[selected_columns]
print(df.head())

             Job Id     Experience Qualifications  \
0  1089843540111562  5 to 15 Years         M.Tech   
1   398454096642776  2 to 12 Years            BCA   
2   481640072963533  0 to 12 Years            PhD   
3   688192671473044  4 to 11 Years            PhD   
4   117057806156508  1 to 12 Years            MBA   

                            Company Preference Salary Range           Country  \
0                 Icahn Enterprises     Female    $59K-$99K       Isle of Man   
1      PNC Financial Services Group     Female   $56K-$116K      Turkmenistan   
2  United Services Automobile Assn.       Male   $61K-$104K  Macao SAR, China   
3                              Hess     Female    $65K-$91K             Benin   
4                      Cairn Energy     Female    $64K-$87K             Chile   

   Work Type                     Job Title  
0     Intern  Digital Marketing Specialist  
1     Intern                 Web Developer  
2  Temporary            Operations Manager  
3  Full-Time  

In [64]:
cols = df.columns
df.head()

Unnamed: 0,Job Id,Experience,Qualifications,Company,Preference,Salary Range,Country,Work Type,Job Title
0,1089843540111562,5 to 15 Years,M.Tech,Icahn Enterprises,Female,$59K-$99K,Isle of Man,Intern,Digital Marketing Specialist
1,398454096642776,2 to 12 Years,BCA,PNC Financial Services Group,Female,$56K-$116K,Turkmenistan,Intern,Web Developer
2,481640072963533,0 to 12 Years,PhD,United Services Automobile Assn.,Male,$61K-$104K,"Macao SAR, China",Temporary,Operations Manager
3,688192671473044,4 to 11 Years,PhD,Hess,Female,$65K-$91K,Benin,Full-Time,Network Engineer
4,117057806156508,1 to 12 Years,MBA,Cairn Energy,Female,$64K-$87K,Chile,Intern,Event Manager


In [65]:
# imputer= SimpleImputer(strategy='most_frequent')
# imputer.fit(df.iloc[:, 0:3])
# df.iloc[:, 0:3] = imputer.transform(df.iloc[:, 0:3])
# # df.head()

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')

# Select all columns except the first one (index 0)
imputer.fit(df.iloc[:, 1:])
df.iloc[:, 1:] = imputer.transform(df.iloc[:, 1:])


In [66]:
imputer= SimpleImputer(strategy='mean')
imputer.fit(df.iloc[:, 0:1])
df.iloc[:, 0:1] = imputer.transform(df.iloc[:, 0:1])
df.head()

Unnamed: 0,Job Id,Experience,Qualifications,Company,Preference,Salary Range,Country,Work Type,Job Title
0,1089843540111562,5 to 15 Years,M.Tech,Icahn Enterprises,Female,$59K-$99K,Isle of Man,Intern,Digital Marketing Specialist
1,398454096642776,2 to 12 Years,BCA,PNC Financial Services Group,Female,$56K-$116K,Turkmenistan,Intern,Web Developer
2,481640072963533,0 to 12 Years,PhD,United Services Automobile Assn.,Male,$61K-$104K,"Macao SAR, China",Temporary,Operations Manager
3,688192671473044,4 to 11 Years,PhD,Hess,Female,$65K-$91K,Benin,Full-Time,Network Engineer
4,117057806156508,1 to 12 Years,MBA,Cairn Energy,Female,$64K-$87K,Chile,Intern,Event Manager


In [67]:
df["Work Type"] = (df["Work Type"] == 'Intern').astype(int)

# Move "Work Type" to the end
cols = [col for col in df.columns if col != "Work Type"] + ["Work Type"]
df = df[cols]


df.head()

Unnamed: 0,Job Id,Experience,Qualifications,Company,Preference,Salary Range,Country,Job Title,Work Type
0,1089843540111562,5 to 15 Years,M.Tech,Icahn Enterprises,Female,$59K-$99K,Isle of Man,Digital Marketing Specialist,1
1,398454096642776,2 to 12 Years,BCA,PNC Financial Services Group,Female,$56K-$116K,Turkmenistan,Web Developer,1
2,481640072963533,0 to 12 Years,PhD,United Services Automobile Assn.,Male,$61K-$104K,"Macao SAR, China",Operations Manager,0
3,688192671473044,4 to 11 Years,PhD,Hess,Female,$65K-$91K,Benin,Network Engineer,0
4,117057806156508,1 to 12 Years,MBA,Cairn Energy,Female,$64K-$87K,Chile,Event Manager,1


In [50]:
# import matplotlib.pyplot as plt

# for label in cols[:-1]:  # Assuming cols is your list of columns

#         # Plot histogram for Experience_Years > 20 (blue for 'gamma')
#         plt.hist(df[df["Work Type"] == 'Intern'][label],
#                  color='blue', label='Intern', alpha=0.7, density=True)

#         # Plot histogram for Experience_Years <= 20 (red for 'hadron')
#         plt.hist(df[df["Work Type"] != 'Intern'][label],
#                  color='red', label='!Intern', alpha=0.7, density=True)

#         # Set the title, labels, and legend
#         plt.title(f"Distribution of {label}")
#         plt.ylabel("Probability")
#         plt.xlabel(label)
#         plt.legend()  # Call legend after both histograms are plotted

#         # Show the plot
#         plt.show()


In [68]:
#Catgorical data
from sklearn.preprocessing import LabelEncoder
label_encoder_x= LabelEncoder()
df.iloc[:, 3]= label_encoder_x.fit_transform(df.iloc[:, 3])
df.iloc[:, 1]= label_encoder_x.fit_transform(df.iloc[:, 1])
df.iloc[:, 2]= label_encoder_x.fit_transform(df.iloc[:, 2])
df.iloc[:, 4]= label_encoder_x.fit_transform(df.iloc[:, 4])
df.iloc[:, 5]= label_encoder_x.fit_transform(df.iloc[:, 5])
df.iloc[:, 6]= label_encoder_x.fit_transform(df.iloc[:, 6])
df.iloc[:, 7]= label_encoder_x.fit_transform(df.iloc[:, 7])
df.head()






Unnamed: 0,Job Id,Experience,Qualifications,Company,Preference,Salary Range,Country,Job Title,Work Type
0,1089843540111562,45,6,427,1,254,92,31,1
1,398454096642776,18,4,619,1,67,198,145,1
2,481640072963533,2,9,819,2,310,114,90,0
3,688192671473044,33,9,404,1,552,20,83,0
4,117057806156508,10,7,174,1,497,39,38,1


In [69]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define categorical columns
categorical_columns = ["Experience","Qualifications","Company","Preference", "Salary Range", "Country", "Job Title"]

# Create ColumnTransformer
ct = ColumnTransformer(
    transformers=[("onehot", OneHotEncoder(sparse_output=False), categorical_columns)],
    remainder='passthrough'
)

# Fit and transform the data
df_encoded = ct.fit_transform(df)

# Get new column names from encoder
onehot_feature_names = ct.named_transformers_["onehot"].get_feature_names_out(categorical_columns)
other_columns = [col for col in df.columns if col not in categorical_columns]
final_columns = list(onehot_feature_names) + other_columns

# Create final DataFrame
df = pd.DataFrame(df_encoded, columns=final_columns)

# Show result
print(df.head())


   Experience_0  Experience_1  Experience_2  Experience_3  Experience_4  \
0           0.0           0.0           0.0           0.0           0.0   
1           0.0           0.0           0.0           0.0           0.0   
2           0.0           0.0           1.0           0.0           0.0   
3           0.0           0.0           0.0           0.0           0.0   
4           0.0           0.0           0.0           0.0           0.0   

   Experience_5  Experience_6  Experience_7  Experience_8  Experience_9  ...  \
0           0.0           0.0           0.0           0.0           0.0  ...   
1           0.0           0.0           0.0           0.0           0.0  ...   
2           0.0           0.0           0.0           0.0           0.0  ...   
3           0.0           0.0           0.0           0.0           0.0  ...   
4           0.0           0.0           0.0           0.0           0.0  ...   

   Job Title_139  Job Title_140  Job Title_141  Job Title_142  Job T

In [70]:
train, valid , test = np.split(df.sample(frac = 1), [int(0.6 * len(df)) , int(0.8 * len(df))])
df.head()

  return bound(*args, **kwds)


Unnamed: 0,Experience_0,Experience_1,Experience_2,Experience_3,Experience_4,Experience_5,Experience_6,Experience_7,Experience_8,Experience_9,...,Job Title_139,Job Title_140,Job Title_141,Job Title_142,Job Title_143,Job Title_144,Job Title_145,Job Title_146,Job Id,Work Type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1089844000000000.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,398454100000000.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,481640100000000.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,688192700000000.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117057800000000.0,1.0


In [72]:
def scale_dataset(dataframe, oversample = False):
  x = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  scaler = StandardScaler()
  x = scaler.fit_transform(x)
  if oversample:
    ros = RandomOverSampler()
    x,y =ros.fit_resample(x,y)
  data = np.hstack((x,np.reshape(y,(-1,1))))
  return data,x,y

In [73]:
train, X_train, y_train = scale_dataset(train, oversample = False)
valid, X_valid, y_valid = scale_dataset(valid, oversample = False)
test, X_test, y_test = scale_dataset(test, oversample = False)

In [74]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
knn_model = KNeighborsClassifier(n_neighbors = 3)
knn_model.fit(X_train,y_train)

In [75]:
y_pred = knn_model.predict(X_test)

In [76]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.91      0.85      1604
         1.0       0.20      0.09      0.12       396

    accuracy                           0.75      2000
   macro avg       0.50      0.50      0.49      2000
weighted avg       0.68      0.75      0.71      2000



In [None]:
from sklearn.svm import SVC
svm_model = SVC()
svm_model = svm_model.fit(X_train,y_train)
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train,y_train)

y_pred = lr_model.predict(X_test)
print(classification_report(y_test, y_pred))