# 数据挖掘作业

### 张贤益 控制2班 Y30180696

### 1.读取数据

使用[**`pandas`**](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)中的`pandas.read_table`函数进行数据读取。

In [15]:
import pandas as pd
import numpy as np
from collections import defaultdict
# Load the tab-separated SMS corpus; the file has no header row, so the two
# columns are named explicitly.
train = pd.read_table(
    './smsspamcollection/SMSSpamCollection',
    names=['Label', 'Text'],
)
# Number the rows from 1 instead of the default 0-based index.
train.index = np.arange(1, len(train) + 1)
train.head()

Unnamed: 0,Label,Text
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


对于其Label，我们需要把ham定义成0，把spam定义成1，并把读入的数据以数组的形式保存，程序如下：

In [16]:
# Encode the textual labels as integers: ham -> 0, spam -> 1.
class_mapping = {'ham': 0, 'spam': 1}
train['Label'] = train['Label'].map(class_mapping)
# Keep the labels and the message text as separate arrays; the text is
# processed further in the cleaning step.
train_label = train['Label'].values
train_data = train['Text'].values
train.head()

Unnamed: 0,Label,Text
1,0,"Go until jurong point, crazy.. Available only ..."
2,0,Ok lar... Joking wif u oni...
3,1,Free entry in 2 a wkly comp to win FA Cup fina...
4,0,U dun say so early hor... U c already then say...
5,0,"Nah I don't think he goes to usf, he lives aro..."


### 2.清洗数据

初步的清洗数据，包括

- 把所有的单词都转换成小写

- 删除除了英文之外的字符

- 把所有的单词恢复成词干

  使用 [`nltk.stem.snowball module`](https://www.nltk.org/api/nltk.stem.html?highlight=nltk%20stem%20snowball#module-nltk.stem.snowball) 用于词干处理

In [17]:
import re
import nltk
# nltk.download('punkt') # 没有下载的话要先下载
from nltk.tokenize import word_tokenize

def clean_text(comment_text):
    """Normalise raw messages into space-separated word stems.

    Each message is lowercased, every character other than ``a-z`` and the
    apostrophe is replaced by a space (so digits and punctuation are removed),
    common English contractions are expanded, and every token is reduced to
    its stem with the NLTK Snowball English stemmer.

    Args:
        comment_text: Iterable of message strings.

    Returns:
        A list with one cleaned string per input message. Each stem is
        preceded by a single space, so non-empty results start with a space;
        a message without tokens yields the empty string.
    """
    # Order matters: the specific "...n't" forms must be expanded before the
    # generic "n't" rule, otherwise the generic rule consumes them first and
    # the specific rules can never match.
    contraction_rules = [
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"cannot", "can not "),
        (r"ain\'t", " are not "),
        (r"aren't", " are not "),
        (r"couldn\'t", " can not "),
        (r"didn't", " do not "),
        (r"doesn't", " do not "),
        (r"don't", " do not "),
        (r"hadn't", " have not "),
        (r"hasn't", " have not "),
        (r"n't", " not "),
        (r"\'m", " am "),
        (r"\'re", " are "),
        (r"\'d", " will "),
        (r"\'ll", " will "),
    ]
    compiled_rules = [(re.compile(pattern), repl) for pattern, repl in contraction_rules]
    non_letter = re.compile(r"[^a-z']")
    # One stemmer instance is enough for the whole corpus.
    stemmer = nltk.stem.snowball.EnglishStemmer()

    comment_list = []
    for text in comment_text:
        # Lowercase, then blank out everything except letters and apostrophes.
        text = non_letter.sub(" ", text.lower())
        # Expand contractions in rule order.
        for pattern, repl in compiled_rules:
            text = pattern.sub(repl, text)
        # Stem every token; each stem is prefixed with one space.
        stems = (stemmer.stem(word) for word in word_tokenize(text))
        comment_list.append("".join(" " + stem for stem in stems))
    return comment_list

# Replace the raw messages with their cleaned, stemmed form; the later cells
# read train_data and therefore work on the cleaned strings.
train_data = clean_text(train_data)

### 3.训练集划分出验证集

因为本题目的测试集中没有label，不便于我们自己验证算法的好坏，因此我们把训练集中70%的数据作为训练集，30%的数据作为验证集

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled data for validation and train on the other 70%;
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    train_label,
    test_size=0.3,
    random_state=0,
)

### 4. 朴素贝叶斯程序编写

In [19]:
class NaiveBayes:
    """Multinomial Naive Bayes classifier over whitespace-separated tokens.

    Usage: construct with the array of distinct class labels, call ``train``
    with the examples and their labels, then call ``test`` to predict labels
    for new examples. Examples are expected to be preprocessed strings; they
    are tokenised with ``str.split()`` only.
    """

    def __init__(self, unique_classes):
        """Store the distinct class labels (e.g. ``np.unique(labels)``)."""
        self.classes = unique_classes

    def addToBow(self, example, dict_index):
        """Add the token counts of one example to the bag-of-words of a class.

        Args:
            example: A string, or a NumPy array whose first element is the string.
            dict_index: Index into ``self.bow_dicts`` of the class to update.
        """
        if isinstance(example, np.ndarray):
            example = example[0]

        bow = self.bow_dicts[dict_index]
        for token_word in example.split():
            bow[token_word] += 1

    def train(self, dataset, labels):
        """Collect per-class word counts, class priors and smoothing denominators.

        After the call the instance exposes:
            bow_dicts: one ``defaultdict(int)`` of token counts per class.
            vocab / vocab_length: sorted unique tokens over all classes and their number.
            cats_info: object array with one row ``(bow_dict, prior, denominator)``
                per class, in the order of ``self.classes``.

        Args:
            dataset: Sequence of preprocessed example strings.
            labels: Sequence of class labels aligned with ``dataset``.
        """
        self.examples = dataset
        self.labels = labels

        # Only convert when the caller did not already pass NumPy arrays.
        if not isinstance(self.examples, np.ndarray):
            self.examples = np.array(self.examples)
        if not isinstance(self.labels, np.ndarray):
            self.labels = np.array(self.labels)

        n_classes = len(self.classes)
        self.bow_dicts = [defaultdict(int) for _ in range(n_classes)]

        # Build the bag-of-words of every class. A plain loop (instead of
        # DataFrame + np.apply_along_axis) also copes with a class that has
        # no examples.
        for cat_index, cat in enumerate(self.classes):
            for example in self.examples[self.labels == cat]:
                self.addToBow(example, cat_index)

        prob_classes = np.empty(n_classes)
        cat_word_counts = np.empty(n_classes)
        all_words = []
        n_examples = float(self.labels.shape[0])
        for cat_index, cat in enumerate(self.classes):
            bow = self.bow_dicts[cat_index]
            # Prior p(c): fraction of training examples labelled with this class.
            prob_classes[cat_index] = np.sum(self.labels == cat) / n_examples
            # Total token count of the class plus one.
            cat_word_counts[cat_index] = sum(bow.values()) + 1
            all_words += bow.keys()

        # Vocabulary: unique tokens seen in any class.
        self.vocab = np.unique(np.array(all_words))
        self.vocab_length = self.vocab.shape[0]

        # Smoothing denominator per class. NOTE: together with the "+1" above
        # this evaluates to (token total + |V| + 2); it is kept unchanged so
        # that probabilities match the previous implementation exactly.
        denoms = cat_word_counts + self.vocab_length + 1

        self.cats_info = np.array(
            [
                (self.bow_dicts[cat_index], prob_classes[cat_index], denoms[cat_index])
                for cat_index in range(n_classes)
            ],
            dtype=object,
        )

    def getExampleProb(self, test_example):
        """Return the unnormalised log-posterior of one example for every class.

        Args:
            test_example: Preprocessed example string.

        Returns:
            1-D float array aligned with ``self.classes``; entry ``i`` is the sum
            of the log-likelihoods of the tokens under class ``i`` plus the log
            of the class prior.
        """
        tokens = test_example.split()
        post_prob = np.empty(len(self.classes))

        for cat_index in range(len(self.classes)):
            bow, prior, denom = self.cats_info[cat_index]

            # Sum of logs instead of a product of probabilities to avoid underflow.
            log_likelihood = 0.0
            for test_token in tokens:
                # Add-one smoothing: unseen tokens get a count of 1.
                test_token_counts = bow.get(test_token, 0) + 1
                log_likelihood += np.log(test_token_counts / float(denom))

            # A class without training examples has prior 0; its log is -inf,
            # which is the intended score, so the warning is silenced.
            with np.errstate(divide="ignore"):
                post_prob[cat_index] = log_likelihood + np.log(prior)

        return post_prob

    def test(self, test_set):
        """Predict a class label for every example in ``test_set``.

        Args:
            test_set: Iterable of preprocessed example strings.

        Returns:
            NumPy array with the predicted label of each example, in order.
        """
        predictions = []
        for example in test_set:
            post_prob = self.getExampleProb(example)
            # The class with the highest log-posterior wins.
            predictions.append(self.classes[np.argmax(post_prob)])

        return np.array(predictions)

### 5.训练及测试

In [20]:
# -------------------------- training ----------------------
# One class slot per distinct label found in the training split.
nb = NaiveBayes(np.unique(y_train))
# Fit the model on the training split.
nb.train(x_train, y_train)

# -------------------------- testing -----------------------
# Predicted label for every held-out message.
pclasses = nb.test(x_test)
# Accuracy = number of correct predictions / number of held-out messages.
test_acc = np.sum(pclasses == y_test) / float(y_test.shape[0])

print("Test Set Examples: ", y_test.shape[0])
print("Test Set Accuracy: %f" % test_acc)

Test Set Examples:  1672
Test Set Accuracy: 0.977871


### 以下使用TF-IDF处理数据并使用sklearn的BernoulliNB模型
### ⚠️运行以下第6～8部分之前，只需运行前面的第1、2部分；第3～5部分不要运行
---

### 6.TF-IDF计算

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Compute TF-IDF features for the cleaned messages.
all_comment_list = list(train_data)
text_vector = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=5000,
    strip_accents='unicode',
    sublinear_tf=True,
)
# Fit on the full corpus (this happens before the train/validation split).
text_vector.fit(all_comment_list)

# Replace the text with its dense TF-IDF matrix.
train_data = text_vector.transform(train_data).toarray()

### 7.训练集划分出验证集

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the TF-IDF rows for validation and train on the other 70%;
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    train_label,
    test_size=0.3,
    random_state=0,
)

### 8.sklearn模型预测
采用BernoulliNB模型 使用sklearn

In [14]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
# Bernoulli Naive Bayes from scikit-learn, fitted on the training split.
clf = BernoulliNB(fit_prior=True)
clf.fit(x_train, y_train)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that
# order (the value is the same either way for plain accuracy).
test_acc = metrics.accuracy_score(y_test, clf.predict(x_test))
print("BernoulliNB acc: %f "% test_acc)

BernoulliNB acc: 0.982656 
