In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the weather table; later cells treat its last column as the class label.
train = pd.read_csv("weather.csv")

In [3]:
# Shape of the loaded table as (rows, columns).
train.shape

(14, 5)

In [4]:
# One row per training sample.
numOfExample = len(train.index)
# Every column except the last one (the class label) counts as a feature.
numOfFeature = len(train.columns) - 1

In [5]:
# Display the training table.
train

Unnamed: 0,天气,气温,湿度,风,外出
0,晴朗,高温,高,无风,no
1,晴朗,高温,高,有风,no
2,多云,高温,高,无风,yes
3,下雨,温暖,高,无风,yes
4,下雨,寒冷,正常,无风,yes
5,下雨,寒冷,正常,有风,no
6,多云,寒冷,正常,有风,yes
7,晴朗,温暖,高,无风,no
8,晴朗,寒冷,正常,无风,yes
9,下雨,温暖,正常,无风,yes


In [6]:
# Feature column names: every column except the last one.
train.columns.values[:-1]

array(['天气', '气温', '湿度', '风'], dtype=object)

# P(yes|E) = P(E|yes)*P(yes)/P(E)
# P(no|E) = P(E|no)*P(no)/P(E)

In [7]:
"""先求P(yes)和P(no)"""
# Class labels taken from the "外出" column.
yes_no_set = np.array(train["外出"].values)

# Tally each class separately; entries that are neither "yes" nor "no" are ignored.
numOfYes = sum(1 for outcome in yes_no_set if outcome == "yes")
numOfNo = sum(1 for outcome in yes_no_set if outcome == "no")

# Priors are relative to the samples labelled "yes" or "no".
P_yes = numOfYes/(numOfYes + numOfNo)
P_no = numOfNo/(numOfYes + numOfNo)

In [8]:
# Show both class priors, one per line.
print(P_yes, P_no, sep="\n")

0.6428571428571429
0.35714285714285715


In [9]:
"""
    然后求P(E)
    P(E) = P(E1)*P(E2)*P(E3)......
"""
# P_E[feature][value] = fraction of all training rows in which `feature` takes `value`.
P_E = {}
labels = train.columns.values[:-1]
for label in labels[:numOfFeature]:
    # Count occurrences of every distinct value, in order of first appearance.
    occurrences = {}
    for fea_name in train[label].values:
        occurrences[fea_name] = occurrences.get(fea_name, 0) + 1
    # Turn the raw counts into relative frequencies over all rows.
    P_E[label] = {
        key: count / numOfExample for key, count in occurrences.items()
    }

In [10]:
# Inspect the per-feature relative frequency of each value.
P_E

{'天气': {'下雨': 0.35714285714285715,
  '多云': 0.2857142857142857,
  '晴朗': 0.35714285714285715},
 '气温': {'寒冷': 0.2857142857142857,
  '温暖': 0.42857142857142855,
  '高温': 0.2857142857142857},
 '湿度': {'正常': 0.5, '高': 0.5},
 '风': {'无风': 0.5714285714285714, '有风': 0.42857142857142855}}

In [11]:
"""
    最后求P(E|yes)和P(E|no)
    P(E|yes) = P(E1|yes)*P(E2|yes)*P(E3|yes)......

"""
# P_E_yes["<feature>_yes"][value] / P_E_no["<feature>_no"][value]:
# relative frequency of `value` among the rows labelled yes / no.
# Only (value, class) pairs that actually occur get an entry.
P_E_yes = {}
P_E_no = {}
labels = train.columns.values[:-1]
for col in range(numOfFeature):
    label = labels[col]
    yes_counts = {}
    no_counts = {}

    # Pair the feature column with the class column (last column) once per feature.
    pairs = train.iloc[:, [col, -1]].values
    for row in pairs:
        fea_name = row[0]
        outcome = row[-1]
        if outcome == 'yes':
            yes_counts[fea_name] = yes_counts.get(fea_name, 0) + 1
        elif outcome == 'no':
            no_counts[fea_name] = no_counts.get(fea_name, 0) + 1

    # Normalise each class's counts by that class's total for this feature.
    yes_total = sum(yes_counts.values())
    no_total = sum(no_counts.values())
    P_E_yes[label + "_yes"] = {
        key: count / yes_total for key, count in yes_counts.items()
    }
    P_E_no[label + "_no"] = {
        key: count / no_total for key, count in no_counts.items()
    }

In [12]:
# Inspect the per-feature value frequencies among the "yes" rows.
P_E_yes

{'天气_yes': {'下雨': 0.3333333333333333,
  '多云': 0.4444444444444444,
  '晴朗': 0.2222222222222222},
 '气温_yes': {'寒冷': 0.3333333333333333,
  '温暖': 0.4444444444444444,
  '高温': 0.2222222222222222},
 '湿度_yes': {'正常': 0.6666666666666666, '高': 0.3333333333333333},
 '风_yes': {'无风': 0.6666666666666666, '有风': 0.3333333333333333}}

In [13]:
# Inspect the per-feature value frequencies among the "no" rows.
P_E_no

{'天气_no': {'下雨': 0.4, '晴朗': 0.6},
 '气温_no': {'寒冷': 0.2, '温暖': 0.4, '高温': 0.4},
 '湿度_no': {'正常': 0.2, '高': 0.8},
 '风_no': {'无风': 0.4, '有风': 0.6}}

In [14]:
def split_feature(example):
    """Return the first ``numOfFeature`` entries of ``example`` as strings.

    The input must be exactly a ``list`` of the form
    [weather, temperature, humidity, wind]; for any other type a hint is
    printed and ``None`` is returned.
    """
    if type(example) is not list:
        print("please enter data whose type is list")
        return None
    return [str(example[idx]) for idx in range(numOfFeature)]

In [15]:
"""
    P(yes|E) = P(E|yes)*P(yes)/P(E) =  ∏P(Ei|yes) * P(yes) / ∏P(Ei)
    P(no|E) = P(E|no)*P(no)/P(E) = ∏P(Ei|no) * P(no) / ∏P(Ei)
"""

def predict(example):
    pieces = split_feature(example)
    pro_bayes_yes = P_yes
    pro_bayes_no = P_no

    labels = train.columns.values[:-1]
    for i in range(numOfFeature):
        label = labels[i]
        # 对于每一个feature
        pro_bayes_yes *= P_E_yes[label +"_yes"][pieces[i]]
        pro_bayes_yes /= P_E[label][pieces[i]]
        
        pro_bayes_no *= P_E_no[label +"_no"][pieces[i]]
        pro_bayes_no /= P_E[label][pieces[i]]
        
    print("The probability of yes is " + str(pro_bayes_yes))
    print("The probability of no is " + str(pro_bayes_no))
    
    
    if pro_bayes_yes > pro_bayes_no:
        print("The result is yes")
    else:
        print("The result is no")
        

In [16]:
# Classify one sample; values are listed in the order of the feature columns.
predict(['晴朗', '寒冷', '高', '有风'])

The probability of yes is 0.24197530864197522
The probability of no is 0.9408000000000001
The result is no
