# 第14章 DataFrameを用いたデータクレンジング

## 14.1 CSV

### 14.1.1 Pandaを用いたCSVの読み込み

In [189]:
import pandas as pd

csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

df = pd.read_csv(csv_path, header=None)

df.columns = [
    '', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', '0D280/0D315 of diluted wines', 'Proline'
]

print(df)

        Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0    1    14.23        1.71  2.43               15.6        127   
1    1    13.20        1.78  2.14               11.2        100   
2    1    13.16        2.36  2.67               18.6        101   
3    1    14.37        1.95  2.50               16.8        113   
4    1    13.24        2.59  2.87               21.0        118   
..  ..      ...         ...   ...                ...        ...   
173  3    13.71        5.65  2.45               20.5         95   
174  3    13.40        3.91  2.48               23.0        102   
175  3    13.27        4.28  2.26               20.0        120   
176  3    13.17        2.59  2.37               20.0        120   
177  3    14.13        4.10  2.74               24.5         96   

     Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0             2.80        3.06                  0.28             2.29   
1             2.65        2.76                  0

In [190]:
import pandas as pd

csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

df = pd.read_csv(csv_path, header=None)

df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']

print(df)

     sepal length  sepal width  petal length  petal width           class
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]


### 14.1.2 CSVライブラリを用いたCSVの作成

In [191]:
import csv

with open('csv00.csv', 'w') as csvfile:

    writer = csv.writer(csvfile, lineterminator='¥n')
    
    writer.writerow(['city', 'year', 'season'])
    writer.writerow(['Nagano', '1998', 'winter'])


### 14.1.3 Pandasを用いたCSVの作成

In [192]:
#
# DataFrame型データをCSVに保存する
#
import pandas as pd

data = {
    'city': ['Nagano', 'Sydney', 'Salt Lake City', 'Athens',],
    'year': [1998, 2000, 2002, 2004,],
    'season': ['winter', 'summer', 'winter', 'summer',],
}

df = pd.DataFrame(data)

print(df)

df.to_csv('csv01.csv')

             city  year  season
0          Nagano  1998  winter
1          Sydney  2000  summer
2  Salt Lake City  2002  winter
3          Athens  2004  summer


## 14.2 DataFrameの復習

### 14.2.1 DataFrameの復習

In [193]:
import pandas as pd
from pandas import Series, DataFrame

data1 = {
    'ID': [100, 101, 102, 103, 104, 106, 108, 110, 111, 113],
    'city': ['Tokyo', 'Osaka', 'Kyoto', 'Hokkaido', 'Tokyo', 'Tokyo', 'Osaka', 'Kyoto', 'Hakkaido', 'Tokyo'],
    'birth_year': [1990, 1989, 1992, 1997, 1982, 1991, 1988, 1990, 1995, 1981],
    'name': ['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steeve', 'Mituru', 'Aoi', 'Tarou', 'Suguru', 'Mitsuo'],
}

data1_df = DataFrame(data1)

print(data1_df)
print()

data2 = {
    'ID': [107, 109],
    'city': ['Sendai', 'Nagano'],
    'birth_year': [1994, 1988]
}

data2_df = DataFrame(data2)

print(data2_df)
print()

data1_df.append(data2_df).sort_values(by=['ID']).reset_index(drop=True)


    ID      city  birth_year     name
0  100     Tokyo        1990  Hiroshi
1  101     Osaka        1989    Akiko
2  102     Kyoto        1992     Yuki
3  103  Hokkaido        1997   Satoru
4  104     Tokyo        1982   Steeve
5  106     Tokyo        1991   Mituru
6  108     Osaka        1988      Aoi
7  110     Kyoto        1990    Tarou
8  111  Hakkaido        1995   Suguru
9  113     Tokyo        1981   Mitsuo

    ID    city  birth_year
0  107  Sendai        1994
1  109  Nagano        1988



  data1_df.append(data2_df).sort_values(by=['ID']).reset_index(drop=True)


Unnamed: 0,ID,city,birth_year,name
0,100,Tokyo,1990,Hiroshi
1,101,Osaka,1989,Akiko
2,102,Kyoto,1992,Yuki
3,103,Hokkaido,1997,Satoru
4,104,Tokyo,1982,Steeve
5,106,Tokyo,1991,Mituru
6,107,Sendai,1994,
7,108,Osaka,1988,Aoi
8,109,Nagano,1988,
9,110,Kyoto,1990,Tarou


## 14.3 欠損値

### 14.3.1 リストワイズ/ペアワイズ削除

In [194]:
import pandas as pd
import numpy as np
from numpy import nan as NA

np.random.seed(0)

sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[2, 2] = NA
sample_data_frame.iloc[5:, 3] = NA

sample_data_frame

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,,0.645894,0.437587,0.891773
2,0.963663,0.383442,,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,
6,0.118274,0.639921,0.143353,
7,0.521848,0.414662,0.264556,
8,0.45615,0.568434,0.01879,
9,0.612096,0.616934,0.943748,


In [195]:
# リストワイズ削除
# .. データ欠損のある行をまるごと削除する
sample_data_frame.dropna()

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012


In [196]:
sample_data_frame[[0, 1, 2]].dropna()

Unnamed: 0,0,1,2
0,0.548814,0.715189,0.602763
3,0.568045,0.925597,0.071036
4,0.020218,0.83262,0.778157
5,0.978618,0.799159,0.461479
6,0.118274,0.639921,0.143353
7,0.521848,0.414662,0.264556
8,0.45615,0.568434,0.01879
9,0.612096,0.616934,0.943748


In [197]:
# ペアワイズ削除
# .. データ欠損のすくない行のみをのこして、NA行を削除

import pandas as pd
import numpy as np
from numpy import nan as NA

np.random.seed(0)

sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[2, 2] = NA
sample_data_frame.iloc[5:, 3] = NA

sample_data_frame[[0, 2]].dropna()

Unnamed: 0,0,2
0,0.548814,0.602763
3,0.568045,0.071036
4,0.020218,0.778157
5,0.978618,0.461479
6,0.118274,0.143353
7,0.521848,0.264556
8,0.45615,0.01879
9,0.612096,0.943748


### 14.3.2 欠損値の補完

In [198]:
import pandas as pd
import numpy as np
from numpy import nan as NA

np.random.seed(0)

sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[2, 2] = NA
sample_data_frame.iloc[5:, 3] = NA

# NAを0で補完する
sample_data_frame.fillna(0)

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,0.0,0.645894,0.437587,0.891773
2,0.963663,0.383442,0.0,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,0.0
6,0.118274,0.639921,0.143353,0.0
7,0.521848,0.414662,0.264556,0.0
8,0.45615,0.568434,0.01879,0.0
9,0.612096,0.616934,0.943748,0.0


In [199]:
# 前の行の値で補完する
sample_data_frame.fillna(method='ffill')

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,0.548814,0.645894,0.437587,0.891773
2,0.963663,0.383442,0.437587,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,0.870012
6,0.118274,0.639921,0.143353,0.870012
7,0.521848,0.414662,0.264556,0.870012
8,0.45615,0.568434,0.01879,0.870012
9,0.612096,0.616934,0.943748,0.870012


In [200]:
import pandas as pd
import numpy as np
from numpy import nan as NA

np.random.seed(0)

sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[6:, 2] = NA

# NAを0で補完する
sample_data_frame.fillna(method='ffill')

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,0.548814,0.645894,0.437587,0.891773
2,0.963663,0.383442,0.791725,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,0.780529
6,0.118274,0.639921,0.461479,0.944669
7,0.521848,0.414662,0.461479,0.774234
8,0.45615,0.568434,0.461479,0.617635
9,0.612096,0.616934,0.461479,0.68182


### 14.3.3 欠損値の補完（平均値代入法）

In [201]:
import pandas as pd
import numpy as np
from numpy import nan as NA

np.random.seed(0)

sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[6:, 2] = NA

sample_data_frame.fillna( sample_data_frame.mean() )

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,0.53197,0.645894,0.437587,0.891773
2,0.963663,0.383442,0.791725,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,0.780529
6,0.118274,0.639921,0.523791,0.944669
7,0.521848,0.414662,0.523791,0.774234
8,0.45615,0.568434,0.523791,0.617635
9,0.612096,0.616934,0.523791,0.68182


## 14.4 データ集約

### 14.4.1 キーごとの統計量の算出

In [202]:
import pandas as pd

csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

df = pd.read_csv(csv_path, header=None)

df.columns = [
    '', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', '0D280/0D315 of diluted wines', 'Proline'
]

df.mean()

                                  1.938202
Alcohol                          13.000618
Malic acid                        2.336348
Ash                               2.366517
Alcalinity of ash                19.494944
Magnesium                        99.741573
Total phenols                     2.295112
Flavanoids                        2.029270
Nonflavanoid phenols              0.361854
Proanthocyanins                   1.590899
Color intensity                   5.058090
Hue                               0.957449
0D280/0D315 of diluted wines      2.611685
Proline                         746.893258
dtype: float64

In [203]:
df['Alcohol'].mean()

13.00061797752809

In [204]:
df['Magnesium'].mean()

99.74157303370787

### 14.4.2 重複データ

In [205]:
import pandas as pd

dupli_data = pd.DataFrame({
    'col1': [1, 1, 2, 3, 4, 4, 6, 6],
    'col2': ['a', 'b', 'b', 'b', 'c', 'c', 'b', 'b'],
})

dupli_data

Unnamed: 0,col1,col2
0,1,a
1,1,b
2,2,b
3,3,b
4,4,c
5,4,c
6,6,b
7,6,b


In [206]:
# ダブりの行がある場合、True
dupli_data.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7     True
dtype: bool

In [207]:
dupli_data['col1'].duplicated()

0    False
1     True
2    False
3    False
4    False
5     True
6    False
7     True
Name: col1, dtype: bool

In [208]:
dupli_data['col2'].duplicated()

0    False
1    False
2     True
3     True
4    False
5     True
6     True
7     True
Name: col2, dtype: bool

In [209]:
# 重複行を削除
dupli_data.drop_duplicates()

Unnamed: 0,col1,col2
0,1,a
1,1,b
2,2,b
3,3,b
4,4,c
6,6,b


### 14.4.3 マッピング

In [210]:
import pandas as pd

attri_data1 = {
    'ID': [100, 101, 102, 103, 104, 106, 108, 110, 111, 113],
    'city': ['Tokyo', 'Osaka', 'Kyoto', 'Hokkaido', 'Tokyo', 'Tokyo', 'Osaka', 'Kyoto', 'Hakkaido', 'Tokyo'],
    'birth_year': [1990, 1989, 1992, 1997, 1982, 1991, 1988, 1990, 1995, 1981],
    'name': ['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steeve', 'Mituru', 'Aoi', 'Tarou', 'Suguru', 'Mitsuo'],
}

attri_df = pd.DataFrame(attri_data1)

attri_df

Unnamed: 0,ID,city,birth_year,name
0,100,Tokyo,1990,Hiroshi
1,101,Osaka,1989,Akiko
2,102,Kyoto,1992,Yuki
3,103,Hokkaido,1997,Satoru
4,104,Tokyo,1982,Steeve
5,106,Tokyo,1991,Mituru
6,108,Osaka,1988,Aoi
7,110,Kyoto,1990,Tarou
8,111,Hakkaido,1995,Suguru
9,113,Tokyo,1981,Mitsuo


In [211]:
city_map = {
    'Tokyo': 'Kanto',
    'Hakkaido': 'Hokkaido',
    'Osaka': 'Kansai',
    'Kyoto': 'Kansai',
}

attri_df['region'] = attri_df['city'].map(city_map)

attri_df

Unnamed: 0,ID,city,birth_year,name,region
0,100,Tokyo,1990,Hiroshi,Kanto
1,101,Osaka,1989,Akiko,Kansai
2,102,Kyoto,1992,Yuki,Kansai
3,103,Hokkaido,1997,Satoru,
4,104,Tokyo,1982,Steeve,Kanto
5,106,Tokyo,1991,Mituru,Kanto
6,108,Osaka,1988,Aoi,Kansai
7,110,Kyoto,1990,Tarou,Kansai
8,111,Hakkaido,1995,Suguru,Hokkaido
9,113,Tokyo,1981,Mitsuo,Kanto


In [212]:
we_map = {
    'Tokyo': 'east',
    'Hakkaido': 'east',
    'Osaka': 'west',
    'Kyoto': 'west',
}

attri_df['we'] = attri_df['city'].map(we_map)

attri_df

Unnamed: 0,ID,city,birth_year,name,region,we
0,100,Tokyo,1990,Hiroshi,Kanto,east
1,101,Osaka,1989,Akiko,Kansai,west
2,102,Kyoto,1992,Yuki,Kansai,west
3,103,Hokkaido,1997,Satoru,,
4,104,Tokyo,1982,Steeve,Kanto,east
5,106,Tokyo,1991,Mituru,Kanto,east
6,108,Osaka,1988,Aoi,Kansai,west
7,110,Kyoto,1990,Tarou,Kansai,west
8,111,Hakkaido,1995,Suguru,Hokkaido,east
9,113,Tokyo,1981,Mitsuo,Kanto,east


### 14.4.4 ビン分割

In [213]:
import pandas as pd

attri_data1 = {
    'ID': [100, 101, 102, 103, 104, 106, 108, 110, 111, 113],
    'city': ['Tokyo', 'Osaka', 'Kyoto', 'Hokkaido', 'Tokyo', 'Tokyo', 'Osaka', 'Kyoto', 'Hakkaido', 'Tokyo'],
    'birth_year': [1990, 1989, 1992, 1997, 1982, 1991, 1988, 1990, 1995, 1981],
    'name': ['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steeve', 'Mituru', 'Aoi', 'Tarou', 'Suguru', 'Mitsuo'],
}

attri_df = pd.DataFrame(attri_data1)

attri_df

Unnamed: 0,ID,city,birth_year,name
0,100,Tokyo,1990,Hiroshi
1,101,Osaka,1989,Akiko
2,102,Kyoto,1992,Yuki
3,103,Hokkaido,1997,Satoru
4,104,Tokyo,1982,Steeve
5,106,Tokyo,1991,Mituru
6,108,Osaka,1988,Aoi
7,110,Kyoto,1990,Tarou
8,111,Hakkaido,1995,Suguru
9,113,Tokyo,1981,Mitsuo


In [214]:
birth_year_bins = [1980, 1985, 1990, 1995, 2000]

birth_year_cut_data = pd.cut(attri_df.birth_year, birth_year_bins)

birth_year_cut_data

0    (1985, 1990]
1    (1985, 1990]
2    (1990, 1995]
3    (1995, 2000]
4    (1980, 1985]
5    (1990, 1995]
6    (1985, 1990]
7    (1985, 1990]
8    (1990, 1995]
9    (1980, 1985]
Name: birth_year, dtype: category
Categories (4, interval[int64, right]): [(1980, 1985] < (1985, 1990] < (1990, 1995] < (1995, 2000]]

In [215]:
pd.value_counts(birth_year_cut_data)

(1985, 1990]    4
(1990, 1995]    3
(1980, 1985]    2
(1995, 2000]    1
Name: birth_year, dtype: int64

In [216]:
group_names = ['first_1980', 'second_1980', 'first_1990', 'second_1990']

birth_year_cut_data = pd.cut(attri_df.birth_year, birth_year_bins, labels=group_names)

pd.value_counts(birth_year_cut_data)

second_1980    4
first_1990     3
first_1980     2
second_1990    1
Name: birth_year, dtype: int64

In [217]:
pd.cut(attri_df.birth_year, 2)

0      (1989.0, 1997.0]
1    (1980.984, 1989.0]
2      (1989.0, 1997.0]
3      (1989.0, 1997.0]
4    (1980.984, 1989.0]
5      (1989.0, 1997.0]
6    (1980.984, 1989.0]
7      (1989.0, 1997.0]
8      (1989.0, 1997.0]
9    (1980.984, 1989.0]
Name: birth_year, dtype: category
Categories (2, interval[float64, right]): [(1980.984, 1989.0] < (1989.0, 1997.0]]

In [218]:
pd.cut(attri_df.ID, 2)

0    (99.987, 106.5]
1    (99.987, 106.5]
2    (99.987, 106.5]
3    (99.987, 106.5]
4    (99.987, 106.5]
5    (99.987, 106.5]
6     (106.5, 113.0]
7     (106.5, 113.0]
8     (106.5, 113.0]
9     (106.5, 113.0]
Name: ID, dtype: category
Categories (2, interval[float64, right]): [(99.987, 106.5] < (106.5, 113.0]]

## 添削問題

In [219]:
import pandas as pd
import numpy as np
from numpy import nan as NA

csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

df = pd.read_csv(csv_path, header=None)

df.columns = [
    '', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', '0D280/0D315 of diluted wines', 'Proline'
]

df_ten = df.head(10)

df_ten

Unnamed: 0,Unnamed: 1,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,0D280/0D315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
5,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,1,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,1,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,1,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


In [220]:
df_ten.iloc[1, 0] = NA
df_ten.iloc[2, 3] = NA
df_ten.iloc[4, 8] = NA
df_ten.iloc[7, 3] = NA

df_ten

Unnamed: 0,Unnamed: 1,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,0D280/0D315 of diluted wines,Proline
0,1.0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1.0,13.16,2.36,,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1.0,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1.0,13.24,2.59,2.87,21.0,118,2.8,2.69,,1.82,4.32,1.04,2.93,735
5,1.0,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,1.0,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,1.0,14.06,2.15,,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,1.0,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,1.0,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


In [221]:
df_ten.fillna(df_ten.mean())

Unnamed: 0,Unnamed: 1,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,0D280/0D315 of diluted wines,Proline
0,1.0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1.0,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1.0,13.16,2.36,2.41,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1.0,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1.0,13.24,2.59,2.87,21.0,118,2.8,2.69,0.282222,1.82,4.32,1.04,2.93,735
5,1.0,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,1.0,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,1.0,14.06,2.15,2.41,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,1.0,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,1.0,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


In [222]:
df_ten['Alcohol'].mean()

13.953999999999999

In [223]:
df_ten = pd.concat([df_ten, pd.DataFrame([df_ten.loc[3]])], ignore_index=True)
df_ten = pd.concat([df_ten, pd.DataFrame([df_ten.loc[6]])], ignore_index=True)
df_ten = pd.concat([df_ten, pd.DataFrame([df_ten.loc[9]])], ignore_index=True)

df_ten = df_ten.drop_duplicates()

df_ten

Unnamed: 0,Unnamed: 1,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,0D280/0D315 of diluted wines,Proline
0,1.0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,1.0,13.16,2.36,,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,1.0,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,1.0,13.24,2.59,2.87,21.0,118.0,2.8,2.69,,1.82,4.32,1.04,2.93,735.0
5,1.0,14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0
6,1.0,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0
7,1.0,14.06,2.15,,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0
8,1.0,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0
9,1.0,13.86,1.35,2.27,16.0,98.0,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045.0


In [224]:
alcohols_bins = [0, 5, 10, 15, 20, 25]

alcohols_cut_data = pd.cut(df_ten['Alcohol'], alcohols_bins)

alcohols_cut_data

0    (10, 15]
1    (10, 15]
2    (10, 15]
3    (10, 15]
4    (10, 15]
5    (10, 15]
6    (10, 15]
7    (10, 15]
8    (10, 15]
9    (10, 15]
Name: Alcohol, dtype: category
Categories (5, interval[int64, right]): [(0, 5] < (5, 10] < (10, 15] < (15, 20] < (20, 25]]

In [225]:
pd.value_counts(alcohols_cut_data)

(10, 15]    10
(0, 5]       0
(5, 10]      0
(15, 20]     0
(20, 25]     0
Name: Alcohol, dtype: int64