## Naive Bayes 朴素贝叶斯法

In [None]:
pip install --upgrade pandas openpyxl

In [None]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the labelled training workbook and the unlabelled workbook to score.
data, predict_data = (
    pd.read_excel(workbook)
    for workbook in ('Training_set.xlsx', 'Predict.xlsx')
)

In [None]:
# Normalise the comment column to text before tokenising. Empty spreadsheet
# cells are read as NaN, and a bare astype(str) would turn them into the
# literal word 'nan', which would then be tokenised and counted as if it were
# real comment text; blank them out first.
data['comment'] = data['comment'].fillna('').astype(str)
predict_data['comment'] = predict_data['comment'].fillna('').astype(str)

In [None]:
def tokenize(text):
    """Segment *text* with jieba.cut and return the pieces joined by spaces."""
    segments = jieba.cut(text)
    return ' '.join(segments)


# Store the space-delimited segmentation of every comment in a new
# 'tokenized' column on both frames.
for frame in (data, predict_data):
    frame['tokenized'] = frame['comment'].apply(tokenize)

In [None]:
# Features are the tokenised comments, targets the sentiment labels; 10% of
# the rows are held out for validation with a fixed seed.
X, y = data['tokenized'], data['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Learn the bag-of-words vocabulary from the training split only, then reuse
# it for the validation split. CountVectorizer's default token_pattern keeps
# only tokens of two or more characters, which silently discards the
# single-character Chinese words produced by jieba (for example the words for
# "good", "bad" or "not"); the pattern below keeps every word token.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
# Train a multinomial Naive Bayes classifier on the training count matrix.
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

In [None]:
# Label the held-out validation split.
y_pred = clf.predict(X_val_vec)

# Precision, recall and F1 are macro-averaged, matching the K-NN and Random
# Forest sections of this notebook so the numbers are comparable across
# models. The default average='binary' would also raise if 'sentiment' has
# more than two classes or does not use 1 as its positive label.
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred, average='macro'))
print("Recall:", recall_score(y_val, y_pred, average='macro'))
print("F1-score:", f1_score(y_val, y_pred, average='macro'))

In [None]:
# Vectorise the unlabelled comments with the already-fitted vocabulary and
# store the predicted label in a 'sentiment' column.
X_predict = vectorizer.transform(predict_data['tokenized'])
predict_data['sentiment'] = clf.predict(X_predict)

In [None]:
# Write only the comment text and its predicted label, without the index.
# NOTE(review): the Decision Tree, K-NN and Random Forest sections write to
# this same file name, so running them replaces this output.
predict_data.loc[:, ['comment', 'sentiment']].to_excel(
    'Predict_with_sentiment.xlsx',
    index=False,
)

## Decision Tree 决策树

In [None]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the labelled training workbook and the unlabelled workbook to score.
data, predict_data = (
    pd.read_excel(workbook)
    for workbook in ('Training_set.xlsx', 'Predict.xlsx')
)

In [None]:
# Normalise the comment column to text before tokenising. Empty spreadsheet
# cells are read as NaN, and a bare astype(str) would turn them into the
# literal word 'nan', which would then be tokenised and counted as if it were
# real comment text; blank them out first.
data['comment'] = data['comment'].fillna('').astype(str)
predict_data['comment'] = predict_data['comment'].fillna('').astype(str)


def tokenize(text):
    """Segment *text* with jieba.cut and return the pieces joined by spaces."""
    return ' '.join(jieba.cut(text))


# Store the space-delimited segmentation of every comment in a new
# 'tokenized' column on both frames.
data['tokenized'] = data['comment'].apply(tokenize)
predict_data['tokenized'] = predict_data['comment'].apply(tokenize)

In [None]:
# Features are the tokenised comments, targets the sentiment labels; 10% of
# the rows are held out for validation with a fixed seed.
X, y = data['tokenized'], data['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Learn the bag-of-words vocabulary from the training split only, then reuse
# it for the validation split. CountVectorizer's default token_pattern keeps
# only tokens of two or more characters, which silently discards the
# single-character Chinese words produced by jieba (for example the words for
# "good", "bad" or "not"); the pattern below keeps every word token.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
# Train an entropy-criterion decision tree on the training count matrix.
# random_state is pinned to the same seed as the train/validation split
# because scikit-learn permutes candidate features randomly at each split, so
# an unseeded tree (and its reported metrics) can differ from run to run.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X_train_vec, y_train)

* 你可以将 `criterion` 参数由 'entropy' 改为其他划分标准（如：'gini'）来构建不同的决策树

In [None]:
# Label the held-out validation split.
y_pred = clf.predict(X_val_vec)

# Precision, recall and F1 are macro-averaged, matching the K-NN and Random
# Forest sections of this notebook so the numbers are comparable across
# models. The default average='binary' would also raise if 'sentiment' has
# more than two classes or does not use 1 as its positive label.
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred, average='macro'))
print("Recall:", recall_score(y_val, y_pred, average='macro'))
print("F1-score:", f1_score(y_val, y_pred, average='macro'))

In [None]:
# Vectorise the unlabelled comments with the already-fitted vocabulary and
# store the predicted label in a 'sentiment' column.
X_predict = vectorizer.transform(predict_data['tokenized'])
predict_data['sentiment'] = clf.predict(X_predict)

In [None]:
# Write only the comment text and its predicted label, without the index.
# NOTE(review): the Naive Bayes, K-NN and Random Forest sections write to
# this same file name, so each run replaces the previous section's output.
predict_data.loc[:, ['comment', 'sentiment']].to_excel(
    'Predict_with_sentiment.xlsx',
    index=False,
)

## K-NN

In [None]:
pip install --upgrade pandas openpyxl

In [None]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the labelled training workbook and the unlabelled workbook to score.
data, predict_data = (
    pd.read_excel(workbook)
    for workbook in ('Training_set.xlsx', 'Predict.xlsx')
)

In [None]:
# Normalise the comment column to text before tokenising. Empty spreadsheet
# cells are read as NaN, and a bare astype(str) would turn them into the
# literal word 'nan', which would then be tokenised and counted as if it were
# real comment text; blank them out first.
data['comment'] = data['comment'].fillna('').astype(str)
predict_data['comment'] = predict_data['comment'].fillna('').astype(str)

In [None]:
def tokenize(text):
    """Segment *text* with jieba.cut and return the pieces joined by spaces."""
    segments = jieba.cut(text)
    return ' '.join(segments)


# Store the space-delimited segmentation of every comment in a new
# 'tokenized' column on both frames.
for frame in (data, predict_data):
    frame['tokenized'] = frame['comment'].apply(tokenize)

In [None]:
# Features are the tokenised comments, targets the sentiment labels; 10% of
# the rows are held out for validation with a fixed seed.
X, y = data['tokenized'], data['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Learn the bag-of-words vocabulary from the training split only, then reuse
# it for the validation split. CountVectorizer's default token_pattern keeps
# only tokens of two or more characters, which silently discards the
# single-character Chinese words produced by jieba (for example the words for
# "good", "bad" or "not"); the pattern below keeps every word token.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
# Fit a 3-nearest-neighbours classifier on the training count matrix, then
# label the held-out validation split with it.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_vec, y_train)
y_pred = knn.predict(X_val_vec)

In [None]:
# Validation report: accuracy first, then precision, recall and F1, each
# macro-averaged over the classes.
print("Accuracy:", accuracy_score(y_val, y_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(metric_label, metric_fn(y_val, y_pred, average='macro'))

In [None]:
# Vectorise the unlabelled comments with the already-fitted vocabulary and
# store the predicted label in a 'sentiment' column.
X_predict = vectorizer.transform(predict_data['tokenized'])
predict_data['sentiment'] = knn.predict(X_predict)

In [None]:
# Write only the comment text and its predicted label, without the index.
# NOTE(review): the Naive Bayes, Decision Tree and Random Forest sections
# write to this same file name, so each run replaces the previous output.
predict_data.loc[:, ['comment', 'sentiment']].to_excel(
    'Predict_with_sentiment.xlsx',
    index=False,
)

## Random Forest 随机森林

In [None]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the labelled training workbook and the unlabelled workbook to score.
data, predict_data = (
    pd.read_excel(workbook)
    for workbook in ('Training_set.xlsx', 'Predict.xlsx')
)

In [None]:
# Normalise the comment column to text before tokenising. Empty spreadsheet
# cells are read as NaN, and a bare astype(str) would turn them into the
# literal word 'nan', which would then be tokenised and counted as if it were
# real comment text; blank them out first.
data['comment'] = data['comment'].fillna('').astype(str)
predict_data['comment'] = predict_data['comment'].fillna('').astype(str)

In [None]:
def tokenize(text):
    """Segment *text* with jieba.cut and return the pieces joined by spaces."""
    segments = jieba.cut(text)
    return ' '.join(segments)


# Store the space-delimited segmentation of every comment in a new
# 'tokenized' column on both frames.
for frame in (data, predict_data):
    frame['tokenized'] = frame['comment'].apply(tokenize)

In [None]:
# Features are the tokenised comments, targets the sentiment labels; 10% of
# the rows are held out for validation with a fixed seed.
X, y = data['tokenized'], data['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Learn the bag-of-words vocabulary from the training split only, then reuse
# it for the validation split. CountVectorizer's default token_pattern keeps
# only tokens of two or more characters, which silently discards the
# single-character Chinese words produced by jieba (for example the words for
# "good", "bad" or "not"); the pattern below keeps every word token.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
# Fit a seeded 100-tree random forest on the training count matrix, then
# label the held-out validation split with it.
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_vec, y_train)
y_pred = random_forest.predict(X_val_vec)

In [None]:
# Validation report: accuracy first, then precision, recall and F1, each
# macro-averaged over the classes.
print("Accuracy:", accuracy_score(y_val, y_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(metric_label, metric_fn(y_val, y_pred, average='macro'))

In [None]:
# Vectorise the unlabelled comments with the already-fitted vocabulary and
# store the predicted label in a 'sentiment' column.
X_predict = vectorizer.transform(predict_data['tokenized'])
predict_data['sentiment'] = random_forest.predict(X_predict)

In [None]:
# Write only the comment text and its predicted label, without the index.
# NOTE(review): the Naive Bayes, Decision Tree and K-NN sections write to
# this same file name, so this run replaces their output.
predict_data.loc[:, ['comment', 'sentiment']].to_excel(
    'Predict_with_sentiment.xlsx',
    index=False,
)

## 对比四种不同方法

In [None]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the labelled training workbook and the unlabelled workbook to score.
data, predict_data = (
    pd.read_excel(workbook)
    for workbook in ('Training_set.xlsx', 'Predict.xlsx')
)

In [None]:
# Normalise the comment column to text before tokenising. Empty spreadsheet
# cells are read as NaN, and a bare astype(str) would turn them into the
# literal word 'nan', which would then be tokenised and counted as if it were
# real comment text; blank them out first.
data['comment'] = data['comment'].fillna('').astype(str)
predict_data['comment'] = predict_data['comment'].fillna('').astype(str)

In [None]:
def tokenize(text):
    """Segment *text* with jieba.cut and return the pieces joined by spaces."""
    segments = jieba.cut(text)
    return ' '.join(segments)


# Store the space-delimited segmentation of every comment in a new
# 'tokenized' column on both frames.
for frame in (data, predict_data):
    frame['tokenized'] = frame['comment'].apply(tokenize)

In [None]:
# Features are the tokenised comments, targets the sentiment labels; 10% of
# the rows are held out for validation with a fixed seed.
X, y = data['tokenized'], data['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Learn the bag-of-words vocabulary from the training split only, then reuse
# it for the validation split. CountVectorizer's default token_pattern keeps
# only tokens of two or more characters, which silently discards the
# single-character Chinese words produced by jieba (for example the words for
# "good", "bad" or "not"); the pattern below keeps every word token.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
# (display name, unfitted estimator) pairs to compare on the same split.
# - LogisticRegression gets a larger iteration budget: the default of 100
#   iterations commonly stops before convergence on raw count features.
# - The tree-based models are seeded with the same value used for the
#   train/validation split so the comparison table is reproducible.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Naive Bayes', MultinomialNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
]

In [None]:
# One [name, accuracy, precision, recall, f1] row per model, plus the fitted
# estimators themselves so they can be reused for the final predictions.
results = []
trained_models = []

for name, model in models:
    model.fit(X_train_vec, y_train)
    trained_models.append((name, model))

    y_pred = model.predict(X_val_vec)

    # Macro-averaged precision/recall/F1, matching the K-NN and Random Forest
    # sections of this notebook. The default average='binary' would raise if
    # 'sentiment' has more than two classes or does not use 1 as its positive
    # label.
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='macro')
    recall = recall_score(y_val, y_pred, average='macro')
    f1 = f1_score(y_val, y_pred, average='macro')

    results.append([name, accuracy, precision, recall, f1])

In [None]:
# Tabulate the per-model validation metrics and print them without the
# DataFrame index.
results_df = pd.DataFrame(
    data=results,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
print(results_df.to_string(index=False))

In [None]:
# Vectorise the unlabelled comments once with the already-fitted vocabulary.
X_predict = vectorizer.transform(predict_data['tokenized'])

# Add one 'sentiment_<model name>' column per fitted model.
for name, model in trained_models:
    predict_data[f'sentiment_{name}'] = model.predict(X_predict)

In [None]:
# Write the whole prediction frame (every column it currently holds,
# including the per-model sentiment columns) without the index.
predict_data.to_excel(
    excel_writer='Predict_with_sentiment1.xlsx',
    index=False,
)