In [1]:
from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

## 1.0  k近邻算法 对鸢尾花数据进行预测

In [2]:
def knn_iris_demo():
    '''
    使用 k近邻算法 对鸢尾花数据进行预测
    :return: None
    '''

    # 准备数据集
    iris = load_iris()
    #     特征值和目标值分清楚
    #     切分训练集和测试集
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, train_size=0.8)
    #     特征工程：
    #         标准化
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    # 使用k近邻算法进行训练
    #     实例化KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=5)
    #     训练 fit 训练集
    knn.fit(x_train, y_train)
    #     评估 score  测试集 n准确率
    score = knn.score(x_test, y_test)
    print('score', score)
    #     预测 predict 测试集
    y_predict = knn.predict(x_test)
    print(y_predict == y_test)

    return None

## 1.1    使用 k近邻算法 对鸢尾花数据进行预测  增加交叉验证网格搜索

In [3]:
def knn_iris_gridsearch_cv_demo():
    '''
    使用 k近邻算法 对鸢尾花数据进行预测
    增加交叉验证网格搜索
    :return: None
    '''

    # 准备数据集
    iris = load_iris()
    #     特征值和目标值分清楚
    #     切分训练集和测试集
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, train_size=0.8)
    #     特征工程：
    #         标准化
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    # 使用k近邻算法进行训练
    #     实例化KNeighborsClassifier
    knn = KNeighborsClassifier()
    # 增加交叉验证网格搜索
    # 构建参数字典
    param_dict = {'n_neighbors':[1,3,5,7,9,11,13]}
    gscv = GridSearchCV(knn, param_grid=param_dict, cv=3)
    #     训练 fit 训练集
    gscv.fit(x_train, y_train)
    #     评估 score  测试集 n准确率
    score = gscv.score(x_test, y_test)
    print('score', score)
    #     预测 predict 测试集
    y_predict = gscv.predict(x_test)
    print(y_predict == y_test)


    print('交叉验证最好的结果：', gscv.best_score_)
    print('最好的参数模型', gscv.best_estimator_)
    print('每次交叉验证的准确率结果', gscv.cv_results_)

    return None

## 2.0  新闻分类  :使用朴素贝叶斯方法

In [4]:
def fetch_20new_demo():
    '''
    新闻分类
    使用朴素贝叶斯方法
    :return: None
    '''
    # 获取数据集
    news = fetch_20newsgroups(subset='all')

    # 数据集合进行分割， 训练集合  测试集合
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, train_size=0.8)

    # 特征抽取 tfidf
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    x_test = tfidf.transform(x_test)

    # 使用朴素贝叶斯进行 训练 评估 预测
    mn = MultinomialNB(alpha=1)
    mn.fit(x_train, y_train)

    # 评估
    score = mn.score(x_test, y_test)
    print('score', score)

    y_predict = mn.predict(x_test)

    print(y_predict[0:100] == y_test[0:100])

    return None

## 3.1 决策树预测泰坦尼克乘客生存状况

In [5]:
def titanic_demo():
    '''
    决策树预测泰坦尼克乘客生存状况
    :return: None
    '''
    # 准备数据
    titan = pd.read_csv('titanic.txt')
    # 筛选特征值和目标值
    x = titan[['pclass', 'age', 'sex']]
    y = titan['survived']

    # 处理年龄缺失值
    x['age'].fillna(x['age'].mean(), inplace=True)
    # print(x['age'])
    # print(x['sex'])
    # 针对性别（类别型特征） 需要 one_hot 编码
    dict = DictVectorizer(sparse=False)
    x = dict.fit_transform(x.to_dict(orient='records'))

    # print(x)
    # print(dict.get_feature_names())

    # 数据集的划分
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

    # 使用决策树进行训练评估预测
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=5)

    dt.fit(x_train, y_train)

    score = dt.score(x_test, y_test)

    print('score', score)

    # 把图结构保存成dot文件
    export_graphviz(dt,out_file='./tree.dot')


    return None

## 3.1 随机森林预测泰坦尼克乘客生存状况

In [6]:
def titanic_random_forest_demo():
    '''
    随机森林预测泰坦尼克乘客生存状况
    :return: None
    '''
    # 准备数据
    titan = pd.read_csv('titanic.txt')
    # 筛选特征值和目标值
    x = titan[['pclass', 'age', 'sex']]
    y = titan['survived']

    # 处理年龄缺失值
    x['age'].fillna(x['age'].mean(), inplace=True)
    # print(x['age'])
    # print(x['sex'])
    # 针对性别（类别型特征） 需要 one_hot 编码
    dict = DictVectorizer(sparse=False)
    x = dict.fit_transform(x.to_dict(orient='records'))

    # print(x)
    # print(dict.get_feature_names())

    # 数据集的划分
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

    # 使用随机森林进行训练评估预测
    rf = RandomForestClassifier()

    # 构建参数字典
    param_dict = {'n_estimators':[120, 200], 'max_depth':[5, 7, 10]}

    # 交叉验证网格搜索
    gscv = GridSearchCV(rf, param_grid=param_dict, cv=3)

    gscv.fit(x_train, y_train)
    score = gscv.score(x_test, y_test)
    print('score', score)

    print('最好的参数', gscv.best_estimator_)
    print('验证集上最好表现', gscv.best_score_)
    print('交叉验证的细节', gscv.cv_results_)
    return None

In [7]:
if __name__ == '__main__':
    titanic_random_forest_demo()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


score 0.8022813688212928
最好的参数 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
验证集上最好表现 0.8285714285714286
交叉验证的细节 {'mean_fit_time': array([0.09314807, 0.14096085, 0.08615502, 0.15935858, 0.09676544,
       0.15847731]), 'std_fit_time': array([7.01864519e-03, 5.15759597e-04, 2.12282430e-03, 1.64897065e-03,
       4.54491659e-05, 3.11645515e-03]), 'mean_score_time': array([0.00913119, 0.01392754, 0.00882165, 0.01650143, 0.01011864,
       0.01597436]), 'std_score_time': array([1.02105708e-04, 1.97986529e-04, 6.44772279e-05, 1.82994401e-04,
       1.43648903e-04, 7.62610422e-04]), 'param_max_depth': masked_array(data=[5,

In [8]:
knn_iris_demo()

score 0.9666666666666667
[ True  True  True False  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]




In [9]:
knn_iris_gridsearch_cv_demo()

score 0.9666666666666667
[ True  True  True  True  True  True  True  True  True  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
交叉验证最好的结果： 0.95
最好的参数模型 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
每次交叉验证的准确率结果 {'mean_fit_time': array([0.00052468, 0.00039593, 0.000542  , 0.00038298, 0.00036391,
       0.00033816, 0.00035532]), 'std_fit_time': array([1.11315408e-04, 5.65413513e-05, 1.42318297e-04, 2.47537228e-05,
       1.76551422e-05, 4.74580085e-06, 1.99858047e-05]), 'mean_score_time': array([0.00081094, 0.00096297, 0.00084805, 0.00084861, 0.00071605,
       0.00073338, 0.00073703]), 'std_score_time': array([9.85316681e-05, 1.93125623e-04, 8.35370526e-05, 5.49544338e-05,
       1.47875054e-05, 2.94373737e-05, 4.49511590e-05]), 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9, 11, 1



In [10]:
titanic_demo()

score 0.8174904942965779


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [None]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
fetch_20new_demo()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
