In [1]:
import pandas as pd
df = pd.DataFrame({"gene_segA": [1, 0, 0, 1, 1, 1, 0, 0, 1, 0],
                  "gene_segB": [1, 0, 1, 0, 1, 1, 0, 0, 1, 0],
                  "hypertension": ["Y", "N", "N", "N", "N",
                                  "N", "Y", "N", "Y", "N"],
                  "Gallstones": ["Y", "N", "N", "N", "Y",
                                "Y", "Y", "N", "N", "Y"]})
df

Unnamed: 0,gene_segA,gene_segB,hypertension,Gallstones
0,1,1,Y,Y
1,0,0,N,N
2,0,1,N,N
3,1,0,N,N
4,1,1,N,Y
5,1,1,N,Y
6,0,0,Y,Y
7,0,0,N,N
8,1,1,Y,N
9,0,0,N,Y


In [2]:
# 将数据集中的Y/N替换为数字，用1替换Y，用0替换N
df.replace({"N": 0, "Y": 1})

Unnamed: 0,gene_segA,gene_segB,hypertension,Gallstones
0,1,1,1,1
1,0,0,0,0
2,0,1,0,0
3,1,0,0,0
4,1,1,0,1
5,1,1,0,1
6,0,0,1,1
7,0,0,0,0
8,1,1,1,0
9,0,0,0,1


In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(df['hypertension'])

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [4]:
le.fit_transform([1, 3, 3, 7, 3, 1])

array([0, 1, 1, 2, 1, 0])

In [5]:
le.inverse_transform([0, 1, 1, 2, 1, 0])

array([1, 3, 3, 7, 3, 1])

# 项目案例

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(['white', 'green', 'red', 'green', 'white'])
le.classes_

array(['green', 'red', 'white'], dtype='<U5')

In [7]:
le.transform(['green', 'green', 'green', 'white'])

array([0, 0, 0, 2])

In [8]:
le.transform(['green', 'green', 'green', 'blue'])

ValueError: y contains previously unseen labels: ['blue']

# 动手练习

In [9]:
# 第一题
df = pd.read_csv("./data/cwurData.csv")
df

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.00,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.50,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,996,University of the Algarve,Portugal,7,367,567,218,926,845,812,969.0,816,44.03,2015
2196,997,Alexandria University,Egypt,4,236,566,218,997,908,645,981.0,871,44.03,2015
2197,998,Federal University of Ceará,Brazil,18,367,549,218,830,823,812,975.0,824,44.03,2015
2198,999,University of A Coruña,Spain,40,367,567,218,886,974,812,975.0,651,44.02,2015


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(df['country'])

array([54, 54, 54, ...,  4, 48,  8])

In [11]:
# 第二题
d1 = "I am Laoqi. I am a programmer."
d2 = "Laoqi is in Soochow. It is a beautiful city."
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words="english", decode_error='ignore')
count_vect.fit_transform(d1)

ValueError: Iterable over raw text documents expected, string object received.

In [12]:
count_vect.fit_transform([d1])

<1x2 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [13]:
df = count_vect.fit_transform([d1, d2]).toarray()

In [14]:
df

array([[0, 0, 1, 1, 0],
       [1, 1, 1, 0, 1]])

In [15]:
# 参考答案
import re
d1 = "I am Laoqi. I am a programmer."
d2 = "Laoqi is in Soochow. It is a beautiful city."
words = re.findall(r"\w+", d1+d2)  # 以正则表达式提炼单词，不用split()，这样就避免了句点问题
words

['I',
 'am',
 'Laoqi',
 'I',
 'am',
 'a',
 'programmer',
 'Laoqi',
 'is',
 'in',
 'Soochow',
 'It',
 'is',
 'a',
 'beautiful',
 'city']

In [16]:
words = list(set(words))  # 唯一单词保存为列表
[w.lower() for w in words]
words

['in',
 'a',
 'beautiful',
 'Laoqi',
 'is',
 'city',
 'I',
 'am',
 'Soochow',
 'programmer',
 'It']

In [17]:
# 为每句话中的单词出现次数计数
def count_word(document, unique_words):
    count_doc = []
    for word in unique_words:
        n = document.lower().count(word)
        count_doc.append(n)
    return count_doc

count1 = count_word(d1, words)
count2 = count_word(d2, words)
print(count1)
print(count2)

[0, 5, 0, 0, 0, 0, 0, 3, 0, 1, 0]
[1, 3, 1, 0, 2, 1, 0, 0, 0, 0, 0]


In [18]:
# 保存为DataFrame
df = pd.DataFrame([count1, count2], columns=words, index=['d1', 'd2'])
df

Unnamed: 0,in,a,beautiful,Laoqi,is,city,I,am,Soochow,programmer,It
d1,0,5,0,0,0,0,0,3,0,1,0
d2,1,3,1,0,2,1,0,0,0,0,0


In [19]:
# 方法2
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
tf1 = count_vect.fit_transform([d1, d2])
tf1.shape

(2, 9)

In [20]:
count_vect.get_feature_names()

['am', 'beautiful', 'city', 'in', 'is', 'it', 'laoqi', 'programmer', 'soochow']

In [21]:
tf1

<2x9 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [22]:
tf1.toarray()

array([[2, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 1, 1, 1, 2, 1, 1, 0, 1]])