# 1. 与 GoogleDrive 链接，轻松导入文件

In [None]:
# 下面两步是如何链接colab
from google.colab import drive
drive.mount('/content/drive/')

# 通常，直接给出路径然后导入即可
# 另外，%cd命令到该路径，然后按文件名导入

dir_path = '/content/drive/MyDrive/2023NLPCourse/Assignment1/Part_A'
file_name = 'IMDB Dataset.csv'

# cd 到该路径下
%cd /content/drive/MyDrive/2023NLPCourse/Assignment1/Part_A

Mounted at /content/drive/
/content/drive/MyDrive/2023NLPCourse/Assignment1/Part_A


# 2. 数据导入和数据标签处理

In [None]:
import pandas as pd

# Load the review CSV from the Data/ folder of the current working directory.
# `on_bad_lines='warn'` skips malformed rows while reporting them; it replaces
# the deprecated `error_bad_lines=False`, which newer pandas no longer accepts.
data = pd.read_csv("Data/"+file_name, on_bad_lines='warn')
# Encode the sentiment column as string class labels:
# 'positive' -> '1', 'negative' -> '0'. Any other value is left unchanged.
data['label'] = data['sentiment'].replace(['positive', 'negative'],['1', '0'])



  data = pd.read_csv("Data/"+file_name, error_bad_lines=False)


# 3. 定义预处理函数

In [None]:
!pip install emoji
# Text preprocessing for sentiment analysis
import string
import emoji
import re
import nltk
# Fetch the NLTK stop-word corpus that the `stopwords` list below is read from.
nltk.download('stopwords')
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
# English stop-word list; preprocess() drops every token found in it.
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):
    """Convert one raw text string into a list of stemmed tokens.

    Pipeline, in order: TextBlob word extraction, digit removal, lower-casing,
    emoji-to-text conversion, punctuation stripping, tokenisation, alphabetic
    filtering, "not X" antonym substitution, stop-word removal, Porter stemming.

    Args:
        text: The raw document (a string).

    Returns:
        list: The processed tokens, in document order.
    """
    # 1. Keep only what TextBlob reports as words and re-join them with single
    #    spaces (the original author relies on this to drop most symbols).
    words_only = ' '.join(TextBlob(text).words)

    # 2-3. Remove every ASCII digit, then lower-case the whole string.
    cleaned = re.sub(r'[0-9]', '', words_only).lower()

    # 4. Replace emoji characters with their textual names.
    cleaned = emoji.demojize(cleaned)

    # 5. Delete every character listed in string.punctuation in a single pass.
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    # 6-8. Tokenise and keep purely alphabetic tokens. str.isalpha() is False
    #      for the empty string, so empty tokens are discarded here as well.
    tokens = [tok for tok in word_tokenize(cleaned) if tok.isalpha()]

    # 9. Collapse "not <word>" into an antonym where one can be found. This
    #    runs before stop-word removal so the pair is still adjacent.
    tokens = AntonymReplacer().replace_negations(tokens)

    # 10. Drop stop words; a set gives constant-time membership checks.
    stop_set = set(stopwords)
    tokens = [tok for tok in tokens if tok not in stop_set]

    # 11. Reduce every remaining token to its Porter stem.
    stemmer = PorterStemmer()
    return [stemmer.stem(tok) for tok in tokens]

class AntonymReplacer(object):
    """Rewrite "not <word>" pairs as a single antonym looked up in WordNet."""

    def replace(self, word, pos=None):
        """Return the antonym of ``word`` if WordNet yields exactly one.

        Antonym lemma names are gathered over every synset of ``word``
        (optionally restricted to part of speech ``pos``). When the distinct
        names number exactly one, that name is returned; otherwise None.
        """
        candidates = {
            opposite.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for opposite in lemma.antonyms()
        }
        return candidates.pop() if len(candidates) == 1 else None

    def replace_negations(self, sent):
        """Return a copy of token sequence ``sent`` with negations collapsed.

        Each 'not' that is followed by a token for which ``replace`` returns a
        truthy antonym is emitted as that antonym, and the following token is
        consumed. Every other token is copied through unchanged.
        """
        result = []
        last_index = len(sent) - 1
        consumed = False  # True when the current token was absorbed by a preceding 'not'

        for position, token in enumerate(sent):
            if consumed:
                consumed = False
                continue

            # A trailing 'not' has nothing to negate and is kept as-is.
            if token == 'not' and position < last_index:
                opposite = self.replace(sent[position + 1])
                if opposite:
                    result.append(opposite)
                    consumed = True
                    continue

            result.append(token)

        return result

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 4. 导入 CountVectorizer & TfidfVectorizer 模型来生成对应特征表示向量

In [None]:
# preprocess() calls word_tokenize and AntonymReplacer queries WordNet, so the
# corresponding NLTK resources must be present before any vectoriser is fitted.
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Both vectorisers delegate all text handling to preprocess() via `analyzer`.
Count_vectorizer = CountVectorizer(analyzer=preprocess)
Tfidf_vectorizer = TfidfVectorizer(analyzer=preprocess, min_df=2, max_df=0.9, sublinear_tf=True, use_idf=True)

# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass. A separate fit() followed by transform() would run the costly
# preprocess() analyzer over every review twice per vectoriser.
Count_data = Count_vectorizer.fit_transform(data['review'])
Tfidf_data = Tfidf_vectorizer.fit_transform(data['review'])

# fit() returns the estimator itself, so the *_model names are kept as aliases
# of the fitted vectorisers for the cells that save them.
Count_model = Count_vectorizer
Tfidf_model = Tfidf_vectorizer

# 5. 保存模型与数据到对应的文件夹

In [None]:
import pickle

# Persist the fitted vectorisers and the feature matrices they produced.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
for _path, _obj in (
    ('Models/Count_model.save', Count_model),
    ('Models/Tfidf_model.save', Tfidf_model),
    ('Data/Count_data.save', Count_data),
    ('Data/Tfidf_data.save', Tfidf_data),
):
    with open(_path, 'wb') as _fh:
        pickle.dump(_obj, _fh)

# 6. 分割数据集并训练两个SVM模型

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def train(Data, model, labels=None, test_size=0.2, seed=4000):
    """Fit ``model`` on a train split of ``Data`` and print hold-out metrics.

    Args:
        Data: Feature matrix with one row per sample.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        labels: Targets aligned with the rows of ``Data``. When omitted, the
            notebook-level ``data.label`` column is used.
        test_size: Share of the samples held out for evaluation.
        seed: ``random_state`` handed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        None. The classification report, confusion matrix and accuracy of the
        hold-out predictions are printed.
    """
    if labels is None:
        labels = data.label

    X_train, X_test, y_train, y_test = train_test_split(
        Data, labels, test_size=test_size, random_state=seed)
    model.fit(X_train, y_train.values.ravel())

    # Fill the placeholder with the model's repr; concatenating the repr after
    # the template left a literal "{}" in the output.
    print('Result of {}\n'.format(model))
    predictions = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order. Passing them
    # reversed swaps precision with recall, reports the support of the
    # predictions and transposes the confusion matrix.
    print(classification_report(y_test, predictions))
    print('\n')
    print('Confusion matrix: \n', confusion_matrix(y_test, predictions))
    print('\n')
    print('Accuracy score: ', accuracy_score(y_test, predictions))
    print('\n\n\n')

# One default-configured SVC per feature representation.
SVM_CountVectorizer, SVM_TfidfVectorizer = svm.SVC(), svm.SVC()

# Train and evaluate on the count features first, then on the TF-IDF features.
for _features, _classifier in (
    (Count_data, SVM_CountVectorizer),
    (Tfidf_data, SVM_TfidfVectorizer),
):
    train(_features, _classifier)

Result of {}
SVC()
              precision    recall  f1-score   support

           0       0.85      0.90      0.88      4761
           1       0.90      0.86      0.88      5239

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



Confusion matrix: 
 [[4280  481]
 [ 726 4513]]


Accuracy score:  0.8793




Result of {}
SVC()
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      4875
           1       0.91      0.89      0.90      5125

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



Confusion matrix: 
 [[4435  440]
 [ 571 4554]]


Accuracy score:  0.8989






# 7. 保存两个 SVM 模型

In [None]:
# Persist both trained classifiers. Each file is opened in a `with` block so
# the handle is closed as soon as the dump completes.
with open('Models/SVM_CountVectorizer.save', 'wb') as _fh:
    pickle.dump(SVM_CountVectorizer, _fh)
with open('Models/SVM_TfidfVectorizer.save', 'wb') as _fh:
    pickle.dump(SVM_TfidfVectorizer, _fh)