In [7]:
# ✅ Step 1: Install and download spaCy model
!pip install -q spacy scikit-learn pandas
!python -m spacy download en_core_web_sm

# ✅ Step 2: Imports
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

nlp = spacy.load("en_core_web_sm")

# ✅ Step 3: Synthetic Dataset (balanced and larger)
data = {
    "resume_text": [
        # Data Science
        "Experienced data scientist with Python, pandas, and scikit-learn.",
        "Built ML pipelines and visualizations using matplotlib and seaborn.",
        "Expert in machine learning, NLP, and deep learning models.",
        "Used TensorFlow for building CNNs and RNNs.",
        "Worked on data preprocessing and model evaluation.",
        # Web Dev
        "Full-stack developer with JavaScript, React, and Node.js experience.",
        "Built and deployed web apps using HTML, CSS, and MongoDB.",
        "Worked with Express and REST APIs for backend services.",
        "Integrated OAuth and user authentication features.",
        "Skilled in modern web design and responsive UI.",
        # Mobile Dev
        "Developed cross-platform mobile apps using Flutter and Dart.",
        "Android app developer with Java and Kotlin expertise.",
        "Built iOS apps using Swift and Xcode.",
        "Created mobile UIs and integrated APIs.",
        "Published apps to Google Play and App Store.",
        # Data Analyst
        "Analyzed datasets using Excel and Power BI.",
        "Created dashboards and reports for business intelligence.",
        "Worked with SQL and relational databases.",
        "Performed statistical analysis and hypothesis testing.",
        "Cleaned and processed data for reporting tasks.",
        # AI Engineer
        "Implemented deep learning models using PyTorch.",
        "Worked on computer vision projects using OpenCV.",
        "Built generative AI apps with transformers.",
        "Experience with LLMs and text summarization.",
        "Used GANs for synthetic image generation.",
        # Marketing
        "Managed SEO and SEM strategies for campaigns.",
        "Used Google Analytics and Ads for performance tracking.",
        "Created content for social media marketing.",
        "Analyzed customer trends using CRM tools.",
        "Worked on brand campaigns and A/B testing."
    ],
    "category": (
        ["Data Science"] * 5 +
        ["Web Development"] * 5 +
        ["Mobile Development"] * 5 +
        ["Data Analyst"] * 5 +
        ["AI Engineer"] * 5 +
        ["Marketing"] * 5
    )
}

df = pd.DataFrame(data)

# ✅ Step 4: Preprocessing with spaCy
def preprocess(text):
    doc = nlp(text)
    return " ".join([
        token.lemma_.lower() for token in doc
        if not token.is_stop and not token.is_punct and token.lemma_ != "-PRON-"
    ])

df["cleaned_text"] = df["resume_text"].apply(preprocess)

# ✅ Step 5: TF-IDF + Train/Test Split
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["cleaned_text"])
y = df["category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# ✅ Step 6: Train Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# ✅ Step 7: Evaluation
y_pred = model.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
✅ Accuracy: 0.0

📄 Classification Report:
                     precision    recall  f1-score   support

       AI Engineer       0.00      0.00      0.00       1.0
      Data Analyst       0.00      0.00      0.00       1.0
      Data Science       0.00      0.00      0.00       2.0
         Marketing       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
