In [62]:
# 1. 导入库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/zhangtang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhangtang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhangtang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
# 2. 读取数据
df = pd.read_csv('/Users/zhangtang/Documents/Social Media and Text Analytics/Group Project/All combined data.csv')

In [64]:
# 3. 文本清洗
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)

df['cleaned_description'] = df['description'].fillna('').apply(clean_text)

In [65]:
# 4. 随机划分 Validation Set (100条)
df_train_test, df_val = train_test_split(df, test_size=150, random_state=42, stratify=df['label'])

In [66]:
# 5. 特征工程 (fit on train_test, transform on both)

# 文本TF-IDF特征
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3)
X_text_train_test = vectorizer.fit_transform(df_train_test['cleaned_description'])
X_text_val = vectorizer.transform(df_val['cleaned_description'])

# employment_type OneHot特征
ohe = OneHotEncoder(sparse_output=True)
X_employment_type_train_test = ohe.fit_transform(df_train_test['employment_type'].fillna('unknown').values.reshape(-1, 1))
X_employment_type_val = ohe.transform(df_val['employment_type'].fillna('unknown').values.reshape(-1, 1))

# 手工数值特征

def build_numeric_features(sub_df):
    has_contact_info = sub_df['contact_info'].notna().astype(int)

    def is_company_email(contact):
        if pd.isna(contact):
            return 0
        contact = contact.lower()
        if any(domain in contact for domain in ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com']):
            return 0
        if '@' in contact:
            return 1
        return 0

    company_email = sub_df['contact_info'].apply(is_company_email)

    def has_phone(contact):
        if pd.isna(contact):
            return 0
        numbers = re.findall(r'\d{7,}', contact)
        return int(len(numbers) > 0)

    has_phone_number = sub_df['contact_info'].apply(has_phone)
    has_compensation = sub_df['compensation'].apply(lambda x: int(str(x).strip().lower() != 'compensation:' and pd.notna(x)))
    has_company_name = sub_df['company'].notna().astype(int)
    word_count = sub_df['description'].fillna('').apply(lambda x: len(str(x).split()))
    exclamation_count = sub_df['description'].fillna('').apply(lambda x: str(x).count('!'))

    X_numeric = np.vstack([
        has_contact_info,
        company_email,
        has_phone_number,
        has_compensation,
        has_company_name,
        word_count,
        exclamation_count
    ]).T

    return X_numeric

X_numeric_train_test = build_numeric_features(df_train_test)
X_numeric_val = build_numeric_features(df_val)

In [67]:
# 6. 特征组合与标签
df_train_test.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)

X_train_test = hstack([X_text_train_test, X_numeric_train_test, X_employment_type_train_test])
X_val = hstack([X_text_val, X_numeric_val, X_employment_type_val])

X_train_test = csr_matrix(X_train_test)
X_val = csr_matrix(X_val)

y_train_test = df_train_test['label']
y_val = df_val['label']

In [68]:
# 7. 再将 train_test 分成 Train 和 Test
X_train, X_test, y_train, y_test = train_test_split(X_train_test, y_train_test, test_size=0.2, random_state=42, stratify=y_train_test)

In [69]:
# 8. 定义模型
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'SVM (Linear Kernel)': SVC(kernel='linear', probability=True),
    'Naive Bayes': MultinomialNB()
}

In [70]:
# 9. 训练与验证
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_val = model.predict(X_val)

    results.append({
        'Model': name,
        'Test Accuracy': accuracy_score(y_test, y_pred_test),
        'Test Precision': precision_score(y_test, y_pred_test),
        'Test F1': f1_score(y_test, y_pred_test),
        'Validation Accuracy': accuracy_score(y_val, y_pred_val)
    })

In [71]:
# 10. 输出比较
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Validation Accuracy', ascending=False)
print(results_df)

                 Model  Test Accuracy  Test Precision   Test F1  \
2  SVM (Linear Kernel)           0.99        0.985294  0.992593   
0  Logistic Regression           0.96        0.943662  0.971014   
1        Random Forest           0.95        0.930556  0.964029   
3          Naive Bayes           0.74        0.720430  0.837500   

   Validation Accuracy  
2             0.986667  
0             0.980000  
1             0.973333  
3             0.793333  
