<a href="https://colab.research.google.com/github/Mouni-thebeginner/team3/blob/main/Milestone_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only helper: `files.upload()` shows a file-picker in the notebook and
# blocks until the user finishes; its return value is kept in `uploaded`.
# NOTE(review): per the Colab docs the return value is presumably a dict of
# {filename: file contents} — confirm before relying on its keys.
from google.colab import files
uploaded = files.upload()

# Import libraries
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Download the NLTK resources used by the preprocessing step:
# "punkt"/"punkt_tab" back word_tokenize (newer NLTK releases raise
# LookupError without "punkt_tab"), "wordnet" backs the lemmatizer and
# "stopwords" provides the English stop-word list.
for nltk_resource in ("punkt", "punkt_tab", "wordnet", "stopwords"):
    nltk.download(nltk_resource)

# Load dataset.
# Prefer the bytes returned by the upload widget: when a file with the same
# name already exists in the runtime, Colab saves the new upload under a
# renamed path (e.g. "fake_job_postings (5).csv"), so reading the hard-coded
# name would silently pick up an older copy.  Fall back to the on-disk name
# when nothing was uploaded in this session.
import io

if uploaded:
    csv_source = io.BytesIO(next(iter(uploaded.values())))
else:
    csv_source = "fake_job_postings.csv"

# Decode as UTF-8: reading with latin-1 turns multi-byte punctuation such as
# en dashes into mojibake ("ESRI â ...").  Undecodable bytes are replaced
# rather than aborting the whole load.
df = pd.read_csv(csv_source, encoding="utf-8", encoding_errors="replace")

# Function to preprocess text
# Lazily-built NLTK helpers shared by every preprocess_text call; building
# them once avoids re-reading the stop-word corpus for each row.
_STOP_WORDS = None
_LEMMATIZER = None


def _get_text_tools():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Normalize one free-text value for vectorization.

    The text is lower-cased, tokenized with NLTK's ``word_tokenize``, stripped
    of non-alphabetic tokens and English stop words, and lemmatized with
    ``WordNetLemmatizer``.

    Args:
        text: The raw value of a text cell.  Missing values (``None``, NaN,
            ``pd.NA``) are accepted; other non-string scalars are converted
            with ``str()`` instead of failing on ``.lower()``.

    Returns:
        The surviving lemmas joined by single spaces, or ``""`` for a
        missing value.
    """
    if pd.isna(text):
        return ""

    stop_words, lemmatizer = _get_text_tools()
    tokens = word_tokenize(str(text).lower())
    # Single pass: keep purely alphabetic, non-stop-word tokens and lemmatize.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

print("\n🔄 Preprocessing data...")
df["description_clean"] = df["description"].apply(preprocess_text)
df = df.dropna(subset=["fraudulent"])  # Rows without a label cannot be used
print("✅ Preprocessing complete!")

# Show first 5 preprocessed rows
print("\n📊 First 5 Preprocessed Rows:")
print(df[["title", "description", "description_clean"]].head(5))

# Features (X) and Target (y)
X = df["description_clean"]
y = df["fraudulent"]

# Train-Test Split on the raw text.
# The split happens BEFORE vectorizing so that the TF-IDF vocabulary and IDF
# weights are learned from the training portion only; fitting on the full
# dataset would leak test-set statistics into training.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert text into TF-IDF features (fit on train, only transform test)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-dataset matrix, kept under its original name for any later cell that
# uses it; it is produced with the train-fitted vectorizer.
X_tfidf = vectorizer.transform(X)

print("\n🔄 Training model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("✅ Training complete!")

print("\n🔄 Testing model...")
y_pred = model.predict(X_test)
print("✅ Testing complete!")

# Show first 5 actual vs predicted results.
# y_test keeps the DataFrame index labels, so they can be used to look up
# the matching job titles.
results = pd.DataFrame({
    "Job Title": df.loc[y_test.index, "title"].values[:5],
    "Actual": y_test.values[:5],
    "Predicted": y_pred[:5],
})

print("\n📊 First 5 Test Results:")
print(results.to_string(index=False))

print("\n✅ All steps complete! (Preprocessing + Training + Testing)")


Saving fake_job_postings.csv to fake_job_postings (5).csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



🔄 Preprocessing data...
✅ Preprocessing complete!

📊 First 5 Preprocessed Rows:
                                       title  \
0                           Marketing Intern   
1  Customer Service - Cloud Video Production   
2    Commissioning Machinery Assistant (CMA)   
3          Account Executive - Washington DC   
4                        Bill Review Manager   

                                         description  \
0  Food52, a fast-growing, James Beard Award-winn...   
1  Organised - Focused - Vibrant - Awesome!Do you...   
2  Our client, located in Houston, is actively se...   
3  THE COMPANY: ESRI â Environmental Systems Re...   
4  JOB TITLE: Itemization Review ManagerLOCATION:...   

                                   description_clean  
0  james beard online food community curated reci...  
1  organised focused vibrant awesome passion cust...  
2  client located houston actively seeking experi...  
3  company esri environmental system research ins...  
4  job title itemi