In [18]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
source = "./breast-cancer-wisconsin.data"
column_names = ["Sampel code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", 
                "Normal Nucleoli", "Mitoses", "Class"]
data = pd.read_csv(source, names=column_names)
data.shape

(699, 11)

In [3]:
data = data.replace(to_replace="?", value=np.nan)
data = data.dropna(how="any")
data.shape

(683, 11)

In [4]:
X_train, X_test, y_train, y_test = \
    train_test_split(data[column_names[1:10]], data[column_names[10]], test_size=0.25, random_state=33)

In [5]:
y_train.value_counts()

2    344
4    168
Name: Class, dtype: int64

In [6]:
y_test.value_counts()

2    100
4     71
Name: Class, dtype: int64

In [7]:
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.linear_model import LogisticRegression, SGDClassifier
std = StandardScaler()
# 正则化数据
X_train, X_test = std.fit_transform(X_train), std.fit_transform(X_test)
lr, sgdc = LogisticRegression(), SGDClassifier()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
sgdc.fit(X_train, y_train)
sgdc_y_pred = sgdc.predict(X_test)



In [8]:
from sklearn.metrics import classification_report
print("Acc of LR Classifier:", lr.score(X_test, y_test))
print(classification_report(y_test, lr_pred, target_names=['Benign', 'Malignant']))

Acc of LR Classifier: 0.9707602339181286
             precision    recall  f1-score   support

     Benign       0.96      0.99      0.98       100
  Malignant       0.99      0.94      0.96        71

avg / total       0.97      0.97      0.97       171



In [9]:
print("Acc of SGD Classifier:", sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_pred, target_names=['Benign', 'Malignant']))

Acc of SGD Classifier: 0.9239766081871345
             precision    recall  f1-score   support

     Benign       0.88      1.00      0.94       100
  Malignant       1.00      0.82      0.90        71

avg / total       0.93      0.92      0.92       171



In [10]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')

In [11]:
print(len(news.data))

18846


In [12]:
print(news.data[0], news.target[0])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!

 10


In [13]:
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X_train, X_test = vec.fit_transform(X_train), vec.transform(X_test)

In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb_y_pred = mnb.predict(X_test)

In [17]:
from sklearn.metrics import classification_report
print("Acc of NB Classifier:", mnb.score(X_test, y_test))
print(classification_report(y_test, mnb_y_pred, target_names=news.target_names))

Acc of NB Classifier: 0.8397707979626485
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.9

In [22]:
titanic = pd.read_csv('./titanic.txt')
titanic.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [35]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 112.9+ KB


In [42]:
X = titanic[['pclass', 'age', 'sex']]
y = titanic[['survived']]
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       633 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB


In [43]:
X['age'].fillna(X['age'].mean(), inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       1313 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

from sklearn.feature_extraction import DictVectorizer # 把文本项转为数值项
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train.to_dict(orient='record'))
print(vec.feature_names_)
X_test = vec.fit_transform(X_test.to_dict(orient='record'))

['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']


In [90]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifiersifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [91]:
from sklearn.metrics import classification_report
print('Acc of dtc:', dtc.score(X_test, y_test))
print(classification_report(y_test, y_pred, target_names=['died', 'survived']))

Acc of dtc: 0.7811550151975684
             precision    recall  f1-score   support

       died       0.78      0.91      0.84       202
   survived       0.80      0.58      0.67       127

avg / total       0.78      0.78      0.77       329



In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

  after removing the cwd from sys.path.
  y = column_or_1d(y, warn=True)


In [99]:
print("Acc of decision is", dtc.score(X_test, y_test))
print(classification_report(y_pred, y_test))

Acc of decision is 0.7811550151975684
             precision    recall  f1-score   support

          0       0.91      0.78      0.84       236
          1       0.58      0.80      0.67        93

avg / total       0.81      0.78      0.79       329



In [100]:
print("Acc of rfc is", rfc.score(X_test, y_test))
print(classification_report(rfc_pred, y_test))

Acc of rfc is 0.7781155015197568
             precision    recall  f1-score   support

          0       0.91      0.77      0.83       237
          1       0.57      0.79      0.67        92

avg / total       0.81      0.78      0.79       329



In [101]:
print("Acc of bgc is", gbc.score(X_test, y_test))
print(classification_report(gbc_y_pred, y_test))

Acc of bgc is 0.790273556231003
             precision    recall  f1-score   support

          0       0.92      0.78      0.84       239
          1       0.58      0.82      0.68        90

avg / total       0.83      0.79      0.80       329

