In [15]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

In [4]:
# Read the user/task preference table from a path relative to the notebook,
# report how many rows were loaded, and preview the first rows.
file_path = "data/user_task_preferences.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.shape[0])
df.head()

100


Unnamed: 0,Task Type,Field,Difficulty,Current Skills,Skills to Learn,Available Time per Day (hrs),Suggested Task
0,Administrative,Healthcare,Hard,Photoshop,"Communication, Google Ads",5,Design Marketing Posters
1,Volunteer,Design,Medium,"Data Analysis, Python, Google Ads",Data Analysis,7,Optimize Ad Campaigns
2,Creative,Design,Medium,"Excel, Communication","Communication, Team Management",7,General Assistant Task
3,Volunteer,Marketing,Hard,"Photoshop, Communication, Google Ads","Photoshop, Communication",6,Design Marketing Posters
4,Volunteer,Research,Easy,"Team Management, Python",Photoshop,5,Statistical Report Automation


In [5]:
def _split_skills(value):
    """Turn one skills cell into a list of trimmed skill names.

    * A comma-separated string is split on ',' and each piece is stripped;
      empty pieces (e.g. from a trailing comma) are dropped.
    * A list/tuple is treated as already split, so re-running this cell is a
      no-op instead of raising AttributeError on ``list.split``.
    * A missing value (None/NaN) becomes an empty list.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if pd.isna(value):
        return []
    return [skill.strip() for skill in str(value).split(',') if skill.strip()]


# Both skill columns hold the same comma-separated format, so they share one parser.
for skills_column in ('Current Skills', 'Skills to Learn'):
    df[skills_column] = df[skills_column].apply(_split_skills)

In [None]:
# Learn the mapping from task names to integer class ids, then store the
# encoded target next to the original text column. The fitted encoder is kept
# so predictions can be mapped back to task names later.
label_encoder = LabelEncoder().fit(df['Suggested Task'])
df['Suggested Task Encoded'] = label_encoder.transform(df['Suggested Task'])

In [7]:
# Features: every column except the target in its text and encoded forms.
X = df.drop(['Suggested Task', 'Suggested Task Encoded'], axis=1)
# Target: the integer-encoded suggested task.
y = df.loc[:, 'Suggested Task Encoded']

In [40]:
def skills_tokenizer(x):
    """Return ``x`` unchanged.

    Used as both the ``tokenizer`` and the ``preprocessor`` of the
    ``CountVectorizer`` instances for the skill columns, so that a cell that
    is already a list of skill names is taken as the token sequence as-is.

    It is a named module-level function rather than a lambda, which lets the
    pipeline that references it be pickled (by reference) with joblib.
    """
    skill_tokens = x  # identity: the input already is the token sequence
    return skill_tokens
# Column-wise feature extraction.
# * The three single-string categorical columns use CountVectorizer's default
#   word tokenization.
# * The two skill columns hold lists of skill names; the identity
#   tokenizer/preprocessor makes each list element one token.
#   token_pattern=None is passed because the pattern is unused once a custom
#   tokenizer is supplied; leaving the default in place makes scikit-learn
#   emit a "token_pattern will not be used" UserWarning on fit.
# * Columns are selected with a plain string (not a list) so each vectorizer
#   receives a 1-D sequence of documents.
preprocessor = ColumnTransformer(transformers=[
    ('task_type', CountVectorizer(), 'Task Type'),
    ('field', CountVectorizer(), 'Field'),
    ('difficulty', CountVectorizer(), 'Difficulty'),
    ('current_skills', CountVectorizer(tokenizer=skills_tokenizer, preprocessor=skills_tokenizer, token_pattern=None), 'Current Skills'),
    ('skills_to_learn', CountVectorizer(tokenizer=skills_tokenizer, preprocessor=skills_tokenizer, token_pattern=None), 'Skills to Learn'),
], remainder='passthrough')  # Every column not listed above (e.g. the numeric hours column) is kept as-is


In [43]:
# Preprocessing and classifier chained into one estimator, so the same
# feature extraction is applied at fit time and at predict time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [44]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)




In [46]:
# Predicted (integer-encoded) task ids for the held-out rows.
y_pred = pipeline.predict(X=X_test)


In [None]:
# Fraction of held-out rows whose predicted task id equals the true one.
# accuracy_score is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.96


In [51]:
# One raw request, in the same shape as a row of the CSV (skills as
# comma-separated text).
example_input = {
    'Task Type': 'Volunteer',
    'Field': 'Design',
    'Difficulty': 'Medium',
    'Current Skills': 'Photoshop, Excel',
    'Skills to Learn': 'Communication, Ads',
    'Available Time per Day (hrs)': 9
}

# The skill vectorizers use an identity tokenizer and were fitted on lists of
# skill names. A raw string would be iterated character by character, so no
# token would match the vocabulary and both skill feature blocks would be all
# zeros. Split the strings exactly as the training data was split.
# NOTE(review): 'Ads' does not appear among the skills in the sample rows shown
# by df.head() ('Google Ads' does); tokens outside the fitted vocabulary are
# ignored by CountVectorizer -- confirm the intended skill name.
example_row = dict(example_input)
for skills_column in ('Current Skills', 'Skills to Learn'):
    example_row[skills_column] = [
        skill.strip() for skill in example_row[skills_column].split(',')
    ]

# Build the frame from a list holding one record: each list-valued entry then
# stays a single cell instead of being broadcast against the index.
example_frame = pd.DataFrame([example_row])

recommended = pipeline.predict(example_frame)
task = label_encoder.inverse_transform(recommended)
print(f"Recommended Task: {task[0]}")

Recommended Task: Community Awareness Campaign


In [49]:
# Persist the fitted pipeline and the label encoder next to the notebook.
# joblib is imported in the notebook's top import cell.
# NOTE: the pickled pipeline references skills_tokenizer by name, so whatever
# process loads 'task_recommendation_model.pkl' must make a function with that
# name available in the same module it was defined in here.
joblib.dump(pipeline, 'task_recommendation_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']