In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import jieba
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr
from sklearn.decomposition import PCA

def datasets_demo():
    """Load the iris dataset, print an overview of it, and split it.

    Prints the dataset object, its description, the feature names and the
    feature matrix together with its shape. The data is then split with
    ``test_size=0.2`` and ``random_state=22`` and the training features
    (with their shape) are printed.
    """
    iris = load_iris()
    print('鸢尾花数据集:\n', iris)
    print('查看数据集描述:\n', iris['DESCR'])
    print('查看特征值的名字:\n', iris.feature_names)
    print('查看特征值:\n', iris.data, iris.data.shape)

    # Split the dataset; only the training features are displayed below.
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22
    )
    print('训练集的特征值:\n', x_train, x_train.shape)
def dict_demo():
    """Demonstrate dictionary feature extraction with ``DictVectorizer``.

    Vectorizes a small list of ``{'city': ..., 'temperature': ...}`` records
    into a dense array (``sparse=False``) and prints the array followed by
    the generated feature names.
    """
    data = [
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ]
    transfer = DictVectorizer(sparse=False)

    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old installs.
    if hasattr(transfer, 'get_feature_names_out'):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print('特征名字：\n', feature_names)
    return None
def count_demo():
    """Demonstrate text feature extraction with ``CountVectorizer``.

    Fits the vectorizer on two short English sentences and prints the
    resulting count matrix (as a dense array) and the feature names.
    """
    data = [
        'life is short,i like like python',
        'life is too long,i dislike python',
    ]
    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)
    # fit_transform returns a sparse matrix; densify it for display.
    print('data_new:\n', data_new.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old installs.
    if hasattr(transfer, 'get_feature_names_out'):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print('特征名字:\n', feature_names)
    return None

def count_chinese_demo():
    """Demonstrate ``CountVectorizer`` on pre-segmented Chinese text.

    The input sentences already have their words separated by spaces.
    Prints the count matrix (as a dense array) and the feature names.
    """
    data = ['我 爱 北京 天安门', '天安门 上 太阳 升']
    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)
    # fit_transform returns a sparse matrix; densify it for display.
    print('data_new:\n', data_new.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old installs.
    if hasattr(transfer, 'get_feature_names_out'):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print('特征名字:\n', feature_names)
    return None

def count_chinese_demo2():
    """Demonstrate Chinese text feature extraction with word segmentation.

    Each raw sentence is first segmented with ``cut_word`` and the segmented
    sentences are then vectorized by a ``CountVectorizer`` configured with
    two stop words. Prints the count matrix (as a dense array) and the
    feature names.
    """
    # Step 1: segment the raw Chinese sentences into space-separated words.
    data = ['你就收到了的萨芬发过去文件.', '阿斯顿法国红酒看来去微软.']
    data_new = [cut_word(sent) for sent in data]

    # Step 2: count word occurrences, ignoring the listed stop words.
    transfer = CountVectorizer(stop_words=['微软', '法国'])
    data_final = transfer.fit_transform(data_new)
    print('data_new:\n', data_final.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old installs.
    if hasattr(transfer, 'get_feature_names_out'):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print('特征名字:\n', feature_names)

    return None

def cut_word(text):
    """Segment Chinese text with jieba and join the pieces with spaces.

    Example: '我爱北京天安门' --> '我 爱 北京 天安门'
    """
    tokens = jieba.cut(text)
    return ' '.join(tokens)

def tfidf_demo():
    """Demonstrate TF-IDF text feature extraction on Chinese sentences.

    Each raw sentence is first segmented with ``cut_word`` and the segmented
    sentences are then vectorized by a ``TfidfVectorizer`` configured with
    two stop words. Prints the TF-IDF matrix (as a dense array) and the
    feature names.
    """
    # Step 1: segment the raw Chinese sentences into space-separated words.
    data = ['你就收到了的萨芬发过去文件.', '阿斯顿法国红酒看来去微软.']
    data_new = [cut_word(sent) for sent in data]

    # Step 2: compute TF-IDF weights, ignoring the listed stop words.
    transfer = TfidfVectorizer(stop_words=['微软', '法国'])
    data_final = transfer.fit_transform(data_new)
    print('data_new:\n', data_final.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old installs.
    if hasattr(transfer, 'get_feature_names_out'):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print('特征名字:\n', feature_names)

    return None

def minmax_demo(path="F:/机器学习/Python3天快速入门机器学习项目资料/机器学习day1资料/02-代码/dating.txt"):
    """Demonstrate min-max normalization with ``MinMaxScaler``.

    Reads a CSV file, keeps its first three columns, rescales them with a
    default-configured ``MinMaxScaler`` and prints the data before and after.

    Args:
        path: Location of the CSV file to load. Defaults to the original
            hard-coded course-material path so existing calls keep working;
            pass another path to run the demo on a different machine.
    """
    data = pd.read_csv(path)
    # Only the first three columns are scaled.
    data = data.iloc[:, :3]
    print('data:\n', data)

    transfer = MinMaxScaler()
    data_new = transfer.fit_transform(data)
    print('data_new:\n', data_new)

    return None

def stand_demo(path="F:/机器学习/Python3天快速入门机器学习项目资料/机器学习day1资料/02-代码/dating.txt"):
    """Demonstrate standardization with ``StandardScaler``.

    Reads a CSV file, keeps its first three columns, standardizes them with
    a default-configured ``StandardScaler`` and prints the data before and
    after.

    Args:
        path: Location of the CSV file to load. Defaults to the original
            hard-coded course-material path so existing calls keep working;
            pass another path to run the demo on a different machine.
    """
    data = pd.read_csv(path)
    # Only the first three columns are scaled.
    data = data.iloc[:, :3]
    print('data:\n', data)

    transfer = StandardScaler()
    data_new = transfer.fit_transform(data)
    print('data_new:\n', data_new)

    return None

def variance_demo(
    path='F:/机器学习/Python3天快速入门机器学习项目资料/机器学习day1资料/02-代码/factor_returns.csv',
    threshold=10,
):
    """Demonstrate low-variance feature filtering and Pearson correlation.

    Reads a CSV file, drops its first column and its last two columns,
    applies ``VarianceThreshold`` to the remaining columns and prints the
    data before and after (with the resulting shape). It then prints the
    ``pearsonr`` result for the ``pe_ratio``/``pb_ratio`` pair and for the
    ``revenue``/``total_expense`` pair.

    Args:
        path: Location of the CSV file to load. Defaults to the original
            hard-coded course-material path so existing calls keep working.
        threshold: Variance threshold handed to ``VarianceThreshold``.
            Defaults to 10, the value the demo originally hard-coded.
    """
    data = pd.read_csv(path)

    # Skip the first column and the last two columns before filtering.
    data = data.iloc[:, 1:-2]
    print('data\n', data)

    transfer = VarianceThreshold(threshold=threshold)
    data_new = transfer.fit_transform(data)
    print('data_new\n', data_new, data_new.shape)

    # Compute the correlation between two pairs of columns.
    r1 = pearsonr(data['pe_ratio'], data['pb_ratio'])
    print('相关系数：\n', r1)
    r2 = pearsonr(data['revenue'], data['total_expense'])
    print('revenue与total_expense之间的相关性：\n', r2)

    return None

def pca_demo():
    """Run PCA dimensionality reduction on a small sample and print it."""
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # A float n_components is used here; the alternative the author noted
    # was the integer form n_components=2.
    reducer = PCA(n_components=0.95)
    reduced = reducer.fit_transform(samples)
    print('data_new:\n', reduced)

if __name__=='__main__':
    # Script entry point: run one demo at a time by uncommenting the call you
    # want and commenting out the others. Only tfidf_demo() is enabled here.
#     datasets_demo()
#     dict_demo()
#     count_demo()
#     count_chinese_demo()
#     count_chinese_demo2()
#     print(cut_word('我爱北京天安门'))
    tfidf_demo()
#     minmax_demo()
#     stand_demo()
#     variance_demo()
#     pca_demo()

data_new:
 ['你 就 收到 了 的 萨芬发 过去 文件 .', '阿斯顿 法国 红酒 看来 去 微软 .']
data_new:
 [[0.5        0.5        0.         0.         0.5        0.5
  0.        ]
 [0.         0.         0.57735027 0.57735027 0.         0.
  0.57735027]]
特征名字:
 ['收到', '文件', '看来', '红酒', '萨芬发', '过去', '阿斯顿']
