In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' resource rather than 'punkt', so fetch both to keep this
# cell working across NLTK versions. quiet=True keeps the download progress
# lines out of the cell output.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Shared Porter stemmer instance used by stem_words.
stemmer = PorterStemmer()
def stem_words(text):
    """Lower-case, tokenize and Porter-stem a piece of text.

    Args:
        text: Raw text to normalise. Missing values (None / NaN / pd.NA, as
            pandas produces for empty CSV cells) are treated as empty text.

    Returns:
        The stemmed tokens joined by single spaces, or '' for a missing value.
    """
    # An empty CSV cell arrives as float NaN, which has no .lower(); map it to
    # an empty document instead of raising AttributeError in the middle of
    # Series.apply.
    if pd.isna(text):
        return ''
    tokens = word_tokenize(str(text).lower())
    return ' '.join(stemmer.stem(token) for token in tokens)

df = pd.read_csv('student_project_data.csv')

# Stem the titles so inflected forms of a word share one TF-IDF feature.
df['Past Project Titles'] = df['Past Project Titles'].apply(stem_words)

X = df['Past Project Titles']  # Input: project titles
y = df['Skills']  # Target: corresponding skills

# Vectorization + Naive Bayes Pipeline
vectorizer = TfidfVectorizer(stop_words='english')  # Vectorization
model = MultinomialNB()  # Naive Bayes model

# Build pipeline: Vectorizer + Naive Bayes
pipeline = make_pipeline(vectorizer, model)

# Split data for training/testing (fixed seed keeps the 80/20 split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Test the model with a new project title.
# The pipeline was fitted on stemmed titles, so the query has to go through the
# same stem_words step; otherwise its words may not match the learned
# vocabulary.
predicted_skills = pipeline.predict([stem_words("Desktop App")])
print(predicted_skills)  # Output: predicted skills based on the title

# Evaluate model accuracy
# NOTE(review): the recorded run of this cell printed 0.00%. Each 'Skills'
# value appears to be a whole comma-separated skill list used as one class
# label, so held-out labels may never occur in the training split -- confirm
# before trusting this score.
accuracy = pipeline.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


['PyQt5, .Net, Python, Flask, Atom']
Model Accuracy: 0.00%


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import joblib

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' resource rather than 'punkt', so fetch both to keep this
# cell working across NLTK versions. quiet=True keeps the download progress
# lines out of the cell output.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Initialize stemmer (shared by stem_words)
stemmer = PorterStemmer()

def stem_words(text):
    """Lower-case, tokenize and Porter-stem a piece of text.

    Args:
        text: Raw text to normalise. Missing values (None / NaN / pd.NA, as
            pandas produces for empty CSV cells) are treated as empty text.

    Returns:
        The stemmed tokens joined by single spaces, or '' for a missing value.
    """
    # An empty CSV cell arrives as float NaN, which has no .lower(); map it to
    # an empty document instead of raising AttributeError in the middle of
    # Series.apply.
    if pd.isna(text):
        return ''
    tokens = word_tokenize(str(text).lower())
    return ' '.join(stemmer.stem(token) for token in tokens)

def load_data(file_path):
    """Read the CSV file at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file_path)

def preprocess_data(df):
    """Return a copy of ``df`` with the 'Past Project Titles' column stemmed.

    Args:
        df: DataFrame containing a 'Past Project Titles' text column.

    Returns:
        A new DataFrame with the same columns, where every title has been
        passed through ``stem_words``. The input frame is not modified.
    """
    # Work on a copy: assigning into the caller's frame would make a second
    # call on the same DataFrame stem already-stemmed text.
    stemmed_df = df.copy()
    stemmed_df['Past Project Titles'] = stemmed_df['Past Project Titles'].apply(stem_words)
    return stemmed_df

def train_model(X, y):
    """Fit a TF-IDF + multinomial Naive Bayes text classifier.

    Args:
        X: Training documents handed to the pipeline's ``fit``.
        y: Target label for each training document.

    Returns:
        The fitted scikit-learn pipeline (vectorizer followed by classifier).
    """
    text_classifier = make_pipeline(
        TfidfVectorizer(stop_words='english'),  # English stop words are dropped
        MultinomialNB(),
    )
    text_classifier.fit(X, y)
    return text_classifier

def evaluate_model(pipeline, X_test, y_test):
    """Score a fitted pipeline on held-out data.

    Args:
        pipeline: Fitted object exposing ``score(X, y)``.
        X_test: Held-out inputs.
        y_test: Expected labels for ``X_test``.

    Returns:
        Whatever ``pipeline.score(X_test, y_test)`` returns; the caller in this
        cell reports it as the model accuracy.
    """
    return pipeline.score(X_test, y_test)

def save_model(pipeline, file_name):
    """Serialise ``pipeline`` to disk with joblib.

    Args:
        pipeline: The object to persist (the fitted pipeline in this notebook).
        file_name: Destination path handed to ``joblib.dump``.

    Returns:
        None.
    """
    joblib.dump(value=pipeline, filename=file_name)

def predict_skills(pipeline, project_title):
    """Predict the skills label for a single project title.

    The training titles are stemmed by ``preprocess_data`` before the pipeline
    is fitted, so the query is passed through the same ``stem_words`` step.
    Without it a raw title can contain word forms that never appear in the
    fitted vocabulary.

    Args:
        pipeline: Fitted pipeline exposing ``predict``.
        project_title: Raw (unstemmed) project title.

    Returns:
        The result of ``pipeline.predict`` for the one-element batch.
    """
    normalized_title = stem_words(project_title)
    return pipeline.predict([normalized_title])

def main():
    """Train the title -> skills classifier, then print a sample prediction
    and the accuracy on the held-out split."""
    dataset = preprocess_data(load_data('student_project_data.csv'))

    titles = dataset['Past Project Titles']  # model input: project titles
    skills = dataset['Skills']  # prediction target: corresponding skills

    # 80/20 split; the fixed random_state makes it reproducible between runs.
    titles_train, titles_test, skills_train, skills_test = train_test_split(
        titles, skills, test_size=0.2, random_state=42
    )

    pipeline = train_model(titles_train, skills_train)

    print(predict_skills(pipeline, "website "))

    # NOTE(review): the recorded run of this cell printed 0.00%. Each 'Skills'
    # value appears to be a whole comma-separated skill list used as one class
    # label, so held-out labels may never occur in the training split --
    # confirm before trusting this score.
    accuracy = evaluate_model(pipeline, titles_test, skills_test)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # Persisting the model is currently disabled:
    # save_model(pipeline, 'naive_bayes_model.pkl')


if __name__ == "__main__":
    main()


['Figma, Adobe XD, Sketch, HTML/CSS']
Model Accuracy: 0.00%


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import joblib
import numpy as np

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' resource rather than 'punkt', so fetch both to keep this
# cell working across NLTK versions. quiet=True keeps the download progress
# lines out of the cell output.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Initialize stemmer (shared by stem_words)
stemmer = PorterStemmer()

def stem_words(text):
    """Lower-case, tokenize and Porter-stem a piece of text.

    Args:
        text: Raw text to normalise. Missing values (None / NaN / pd.NA, as
            pandas produces for empty CSV cells) are treated as empty text.

    Returns:
        The stemmed tokens joined by single spaces, or '' for a missing value.
    """
    # An empty CSV cell arrives as float NaN, which has no .lower(); map it to
    # an empty document instead of raising AttributeError in the middle of
    # Series.apply.
    if pd.isna(text):
        return ''
    tokens = word_tokenize(str(text).lower())
    return ' '.join(stemmer.stem(token) for token in tokens)

def load_data(file_path):
    """Read the CSV file at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file_path)

def preprocess_data(df):
    """Return a copy of ``df`` with the 'Past Project Titles' column stemmed.

    Args:
        df: DataFrame containing a 'Past Project Titles' text column.

    Returns:
        A new DataFrame with the same columns, where every title has been
        passed through ``stem_words``. The input frame is not modified.
    """
    # Work on a copy: assigning into the caller's frame would make a second
    # call on the same DataFrame stem already-stemmed text.
    stemmed_df = df.copy()
    stemmed_df['Past Project Titles'] = stemmed_df['Past Project Titles'].apply(stem_words)
    return stemmed_df

def train_model(X, y):
    """Fit a TF-IDF + multinomial Naive Bayes text classifier.

    Args:
        X: Training documents handed to the pipeline's ``fit``.
        y: Target label for each training document.

    Returns:
        The fitted scikit-learn pipeline (vectorizer followed by classifier).
    """
    text_classifier = make_pipeline(
        TfidfVectorizer(stop_words='english'),  # English stop words are dropped
        MultinomialNB(),
    )
    text_classifier.fit(X, y)
    return text_classifier

def evaluate_model(pipeline, X_test, y_test):
    """Score a fitted pipeline on held-out data.

    Args:
        pipeline: Fitted object exposing ``score(X, y)``.
        X_test: Held-out inputs.
        y_test: Expected labels for ``X_test``.

    Returns:
        Whatever ``pipeline.score(X_test, y_test)`` returns; the caller in this
        cell reports it as the model accuracy.
    """
    return pipeline.score(X_test, y_test)

def save_model(pipeline, file_name):
    """Serialise ``pipeline`` to disk with joblib.

    Args:
        pipeline: The object to persist (the fitted pipeline in this notebook).
        file_name: Destination path handed to ``joblib.dump``.

    Returns:
        None.
    """
    joblib.dump(value=pipeline, filename=file_name)

def predict_top_n_skills(pipeline, project_title, n=2):
    """Return the ``n`` most probable classes for one project title.

    Args:
        pipeline: Fitted classifier exposing ``predict_proba`` and
            ``classes_``.
        project_title: Title text, handed to ``predict_proba`` unchanged.
            Callers should apply the same preprocessing that was used on the
            training text before calling.
        n: How many classes to return. Values <= 0 give an empty result; a
            value larger than the number of classes returns every class.

    Returns:
        Array of class labels ordered from most to least probable.
    """
    class_probabilities = pipeline.predict_proba([project_title])[0]

    # Rank all classes from most to least probable, then keep the first n.
    # Taking the tail with [-n:] instead would return *every* class for
    # n == 0, because -0 == 0.
    ranked_indices = np.argsort(class_probabilities)[::-1]
    top_n_indices = ranked_indices[:max(n, 0)]

    return pipeline.classes_[top_n_indices]

def main():
    """Train the title -> skills classifier, print the top-3 predictions for a
    sample title and the held-out accuracy, then save the fitted pipeline."""
    df = load_data('student_project_data.csv')

    df = preprocess_data(df)

    X = df['Past Project Titles']  # Input: project titles
    y = df['Skills']  # Target: corresponding skills
    # 80/20 split; the fixed random_state makes it reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline = train_model(X_train, y_train)

    # The pipeline is fitted on stemmed titles, so the query must be stemmed
    # the same way (the Porter stem of "website" is "websit"); a raw title may
    # not match any word in the learned vocabulary.
    query_title = stem_words("Website")
    top_skills = predict_top_n_skills(pipeline, query_title, n=3)
    print(top_skills)

    # NOTE(review): the recorded run of this cell printed 0.00%. Each 'Skills'
    # value appears to be a whole comma-separated skill list used as one class
    # label, so held-out labels may never occur in the training split --
    # confirm before trusting this score.
    accuracy = evaluate_model(pipeline, X_test, y_test)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # Save the model
    save_model(pipeline, 'naive_bayes_model.pkl')


if __name__ == "__main__":
    main()


['WordPress, PHP, HTML, CSS' 'Shopify' 'React, Angular, Javascript']
Model Accuracy: 0.00%


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
