In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
snails = pd.read_excel('snail_size.xlsx', sheet_name='snail_size')

In [3]:
snails.describe()

Unnamed: 0,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4163.0,4163.0,4177.0,4177.0,4177.0,4177.0,4147.0,4177.0
mean,0.524042,0.407871,0.219368,0.828742,0.359367,0.180594,0.239078,9.933684
std,0.1201,0.099266,3.506068,0.490389,0.221963,0.109614,0.139089,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.235,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.32975,11.0
max,0.815,0.65,165.0,2.8255,1.488,0.76,1.005,29.0


In [4]:
snails.head()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,,10
4,Infant,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
snails.dtypes

gender              object
length             float64
diameter           float64
height             float64
full_weight        float64
no_shell_weight    float64
core_weight        float64
shell_weight       float64
age                  int64
dtype: object

In [6]:
snails.hist()

<IPython.core.display.Javascript object>

array([[<Axes: title={'center': 'length'}>,
        <Axes: title={'center': 'diameter'}>,
        <Axes: title={'center': 'height'}>],
       [<Axes: title={'center': 'full_weight'}>,
        <Axes: title={'center': 'no_shell_weight'}>,
        <Axes: title={'center': 'core_weight'}>],
       [<Axes: title={'center': 'shell_weight'}>,
        <Axes: title={'center': 'age'}>, <Axes: >]], dtype=object)

### Gender

Since `gender` contains categorical nominal data, we can encode this data numerically using label, or integer encoding, by simply swapping the string with a unique number:

In [7]:
snails['gender']

0            M
1            M
2            F
3            M
4       Infant
         ...  
4172         F
4173         M
4174         M
4175         F
4176         M
Name: gender, Length: 4177, dtype: object

In [8]:
snails['gender'].unique()

array(['M', 'F', 'Infant', 'Instant'], dtype=object)

In [9]:
snails['gender'] = snails['gender'].str.replace('M', '0')

In [10]:
snails['gender'] = snails['gender'].str.replace('F', '1')

In [11]:
snails['gender'] = snails['gender'].str.replace('Infant', '2')

In [12]:
snails['gender'] = snails['gender'].str.replace('Instant', '3')

In [13]:
snails['gender'].unique()

array(['0', '1', '2', '3'], dtype=object)

In [14]:
snails['gender']

0       0
1       0
2       1
3       0
4       2
       ..
4172    1
4173    0
4174    0
4175    1
4176    0
Name: gender, Length: 4177, dtype: object

In [15]:
snails['gender'] = pd.to_numeric(snails['gender'], errors='coerce')

In [16]:
snails['gender'].unique()

array([0, 1, 2, 3])

In [17]:
snails['gender'].dtypes

dtype('int64')

### Length

For `length`, the value count is 4,163 - which is less than the row count of 4,177. This means that there are likely missing values. We'll replace the NaN values with the median value for that row, to complete the series:

In [18]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4177.0,4163.0,4163.0,4177.0,4177.0,4177.0,4177.0,4147.0,4177.0
mean,0.957146,0.524042,0.407871,0.219368,0.828742,0.359367,0.180594,0.239078,9.933684
std,0.830935,0.1201,0.099266,3.506068,0.490389,0.221963,0.109614,0.139089,3.224169
min,0.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.0,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,1.0,0.545,0.425,0.14,0.7995,0.336,0.171,0.235,9.0
75%,2.0,0.615,0.48,0.165,1.153,0.502,0.253,0.32975,11.0
max,3.0,0.815,0.65,165.0,2.8255,1.488,0.76,1.005,29.0


In [19]:
snails.loc[snails['length'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
1925,1,,0.47,0.165,1.1785,0.566,0.2785,0.294,11
1964,1,,0.535,0.205,1.4415,0.5925,0.2775,0.49,10
2003,2,,0.28,0.08,0.226,0.105,0.047,0.065,6
2042,2,,0.28,0.1,0.2755,0.1305,0.061,0.0725,8
2081,0,,0.5,0.14,1.238,0.6165,0.2355,0.32,8
2120,0,,0.325,0.1,0.3295,0.1365,0.0725,0.11,7
2159,1,,0.51,0.2,1.3905,0.61,0.3315,0.41,12
2198,2,,0.19,0.08,0.081,0.0265,0.0195,0.03,6
2237,0,,0.465,0.19,1.171,0.3905,0.2355,0.4,17
3097,0,,0.435,0.145,0.9385,0.3685,0.1245,0.345,11


In [20]:
snails.loc[snails['length'].isna()] = snails['length'].median()

In [21]:
snails.loc[snails['length'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


In [22]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4177.0,4177.0,4163.0,4177.0,4177.0,4177.0,4177.0,4147.0,4177.0
mean,0.955142,0.524112,0.408373,0.220725,0.827939,0.360077,0.181863,0.240103,9.902234
std,0.829516,0.119905,0.0994,3.506115,0.489934,0.221907,0.111465,0.139971,3.264726
min,0.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,0.545
25%,0.0,0.45,0.35,0.115,0.443,0.1865,0.0935,0.13,8.0
50%,1.0,0.545,0.425,0.14,0.7965,0.3365,0.171,0.235,9.0
75%,2.0,0.615,0.48,0.165,1.151,0.5045,0.254,0.33,11.0
max,3.0,0.815,0.65,165.0,2.8255,1.488,0.76,1.005,29.0


Now that this is done, we should look for outliers. If there are any values which are +/- 3 Standard Deviations above the mean, we should remove the rows:

In [23]:
snails.loc[(snails['length'] < (snails['length'].mean() - (snails['length'].std() * 3))) | (snails['length'] > (snails['length'].mean() + (snails['length'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
236,2.0,0.075,0.055,0.01,0.002,0.001,0.0005,0.0015,1.0
237,2.0,0.13,0.1,0.03,0.013,0.0045,0.003,,3.0
238,2.0,0.11,0.09,0.03,0.008,0.0025,0.002,0.003,3.0
239,2.0,0.16,0.12,0.035,0.021,0.0075,0.0045,0.005,5.0
526,0.0,0.155,0.11,0.04,0.0155,0.0065,0.003,0.005,3.0
696,2.0,0.155,0.105,0.05,0.0175,0.005,0.0035,0.005,4.0
719,2.0,0.15,0.1,0.025,0.015,0.0045,0.004,0.005,2.0
720,2.0,0.16,0.11,0.025,0.018,0.0065,0.0055,0.005,3.0
1429,2.0,0.14,0.105,0.035,0.014,0.0055,0.0025,0.004,3.0
1986,2.0,0.135,0.13,0.04,0.029,0.0125,0.0065,0.008,4.0


In [24]:
snails.drop(snails[(snails['length'] < (snails['length'].mean() - (snails['length'].std() * 3))) | (snails['length'] > (snails['length'].mean() + (snails['length'].std() * 3)))].index, inplace=True)

In [25]:
snails.loc[(snails['length'] < (snails['length'].mean() - (snails['length'].std() * 3))) | (snails['length'] > (snails['length'].mean() + (snails['length'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
149,2.0,0.17,0.13,0.095,0.03,0.013,0.008,0.01,4.0
306,2.0,0.165,0.12,0.03,0.0215,0.007,0.005,0.005,3.0
694,2.0,0.165,0.11,0.02,0.019,0.0065,0.0025,0.005,4.0
2169,2.0,0.165,0.115,0.015,0.0145,0.0055,0.003,0.005,4.0
2343,2.0,0.17,0.125,0.055,0.0235,0.009,0.0055,0.008,6.0
3318,0.0,0.165,0.125,0.04,0.0245,0.0095,0.0045,0.008,4.0
3472,2.0,0.165,0.12,0.05,0.021,0.0075,0.0045,0.014,3.0
3837,2.0,0.17,0.105,0.035,0.034,0.012,0.0085,0.005,4.0


Because we removed such impactful outliers, the median changed - let's remove the new outliers:

In [26]:
snails.drop(snails.loc[(snails['length'] < (snails['length'].mean() - (snails['length'].std() * 3))) | (snails['length'] > (snails['length'].mean() + (snails['length'].std() * 3)))].index, inplace=True)

In [27]:
snails.loc[(snails['length'] < (snails['length'].mean() - (snails['length'].std() * 3))) | (snails['length'] > (snails['length'].mean() + (snails['length'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
148,2.0,0.175,0.13,0.055,0.0315,0.0105,0.0065,0.0125,5.0
465,2.0,0.175,0.125,0.05,0.0235,0.008,0.0035,0.008,5.0
525,0.0,0.175,0.125,0.04,0.024,0.0095,0.006,0.005,4.0
2380,0.0,0.175,0.135,0.04,0.0305,0.011,0.0075,0.01,5.0
3600,2.0,0.175,0.125,0.04,0.028,0.0095,0.008,0.009,4.0


In [28]:
snails.drop(snails.loc[(snails['length'] < (snails['length'].mean() - (snails['length'].std() * 3))) | (snails['length'] > (snails['length'].mean() + (snails['length'].std() * 3)))].index, inplace=True)

In [29]:
snails.loc[(snails['length'] < (snails['length'].mean() - (snails['length'].std() * 3))) | (snails['length'] > (snails['length'].mean() + (snails['length'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


Looks like we got them all!

### Diameter

In [30]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4149.0,4149.0,4135.0,4149.0,4149.0,4149.0,4149.0,4120.0,4149.0
mean,0.950501,0.526606,0.410374,0.221968,0.83339,0.362457,0.183059,0.241634,9.943271
std,0.82799,0.116372,0.096696,3.517895,0.487054,0.220748,0.110883,0.13914,3.236097
min,0.0,0.18,0.12,0.0,0.023,0.0085,0.0005,0.0065,0.545
25%,0.0,0.455,0.35,0.115,0.449,0.1895,0.095,0.1325,8.0
50%,1.0,0.545,0.425,0.145,0.8025,0.3395,0.1715,0.235,9.0
75%,2.0,0.615,0.48,0.165,1.154,0.506,0.255,0.33,11.0
max,3.0,0.815,0.65,165.0,2.8255,1.488,0.76,1.005,29.0


The `diameter` row count is below the total row count, so we should look for NaN values:

In [31]:
snails.loc[snails['diameter'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
1184,2.0,0.665,,0.17,1.2975,0.6035,0.291,0.3595,9.0
1223,2.0,0.34,,0.09,0.179,0.0775,0.033,0.055,6.0
1262,2.0,0.455,,0.105,0.4005,0.164,0.0755,0.126,8.0
1301,2.0,0.535,,0.135,0.6025,0.2895,0.121,0.154,9.0
1340,0.0,0.585,,0.155,0.9145,0.4555,0.1965,0.235,9.0
1379,1.0,0.62,,0.16,1.1295,0.463,0.2685,0.33,10.0
1418,0.0,0.705,,0.215,2.141,1.0465,0.383,0.528,11.0
1457,1.0,0.475,,0.115,0.566,0.281,0.117,0.1335,7.0
1496,0.0,0.62,,0.165,1.0725,0.4815,0.235,0.312,9.0
1535,2.0,0.305,,0.09,0.1465,0.063,0.034,0.0415,6.0


In [32]:
snails.loc[snails['diameter'].isna()] = snails['diameter'].median()

In [33]:
snails.loc[snails['diameter'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


If we look for outliers, we'll see that there are only three, so let's remove them:

In [34]:
snails.loc[(snails['diameter'] < (snails['diameter'].mean() - (snails['diameter'].std() * 3))) | (snails['diameter'] > (snails['diameter'].mean() + (snails['diameter'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
898,2.0,0.28,0.12,0.075,0.117,0.0455,0.029,0.0345,4.0


In [35]:
snails.drop(snails[(snails['diameter'] < (snails['diameter'].mean() - (snails['diameter'].std() * 3))) | (snails['diameter'] > (snails['diameter'].mean() + (snails['diameter'].std() * 3)))].index, inplace=True)

In [36]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4119.0,4148.0
mean,0.947825,0.526279,0.410493,0.222958,0.832162,0.362621,0.18395,0.242334,9.915762
std,0.826706,0.116285,0.096442,3.518335,0.486681,0.220268,0.11161,0.139334,3.279764
min,0.0,0.18,0.125,0.0,0.023,0.0085,0.0005,0.0065,0.425
25%,0.0,0.45,0.35,0.115,0.447,0.19,0.0955,0.13325,8.0
50%,1.0,0.545,0.425,0.145,0.7995,0.341,0.17225,0.235,9.0
75%,2.0,0.615,0.48,0.165,1.153,0.5055,0.256,0.33125,11.0
max,3.0,0.815,0.65,165.0,2.8255,1.488,0.76,1.005,29.0


### Height

Although the value count for `height` is equal to the row count, it's still worth checking for null values:

In [37]:
snails.loc[snails['height'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


This looks good. Now, let's check for outliers:

In [38]:
snails.loc[(snails['height'] < (snails['height'].mean() - (snails['height'].std() * 3))) | (snails['height'] > (snails['height'].mean() + (snails['height'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
10,1.0,0.525,0.38,14.0,0.6065,0.194,0.1475,0.21,14.0
557,1.0,0.66,0.5,165.0,1.1905,0.4585,0.298,0.37,12.0
1495,1.0,0.62,0.435,155.0,1.012,0.477,0.236,0.275,8.0


The two highest outliers might be typo's, but we should remove all three just in case:

In [39]:
snails.drop(snails[(snails['height'] < (snails['height'].mean() - (snails['height'].std() * 3))) | (snails['height'] > (snails['height'].mean() + (snails['height'].std() * 3)))].index, inplace=True)

In [40]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4145.0,4145.0,4145.0,4145.0,4145.0,4145.0,4145.0,4116.0,4145.0
mean,0.947788,0.526224,0.410473,0.14254,0.832086,0.362611,0.183918,0.242303,9.914736
std,0.827004,0.116299,0.096465,0.050003,0.486805,0.22032,0.111632,0.139369,3.280042
min,0.0,0.18,0.125,0.0,0.023,0.0085,0.0005,0.0065,0.425
25%,0.0,0.45,0.35,0.115,0.447,0.19,0.0955,0.132875,8.0
50%,1.0,0.545,0.425,0.145,0.7995,0.341,0.172,0.235,9.0
75%,2.0,0.615,0.48,0.165,1.153,0.5055,0.256,0.331125,11.0
max,3.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


### Full Weight

Although the value count for `full_weight` is equal to the row count, we will still check for null values:

In [41]:
snails.loc[snails['full_weight'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


Looks like all the values are accounted for, but after a cursory look at the standard deviation, min, and max for this column, there are likely a great deal of outliers - let's remove them as well:

In [42]:
snails.loc[(snails['full_weight'] < (snails['full_weight'].mean() - (snails['full_weight'].std() * 3))) | (snails['full_weight'] > (snails['full_weight'].mean() + (snails['full_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
165,0.0,0.725,0.57,0.19,2.55,1.0705,0.483,0.725,14.0
358,0.0,0.745,0.585,0.215,2.499,0.9265,0.472,0.7,17.0
891,0.0,0.73,0.595,0.23,2.8255,1.1465,0.419,0.897,17.0
1051,1.0,0.735,0.6,0.22,2.555,1.1335,0.44,0.6,11.0
1052,0.0,0.765,0.6,0.22,2.302,1.007,0.509,0.6205,12.0
1207,1.0,0.755,0.625,0.21,2.505,1.1965,0.513,0.6785,11.0
1209,1.0,0.78,0.63,0.215,2.657,1.488,0.4985,0.586,11.0
1427,1.0,0.75,0.61,0.235,2.5085,1.232,0.519,0.612,14.0
1762,0.0,0.77,0.62,0.195,2.5155,1.1155,0.6415,0.642,12.0
1763,0.0,0.775,0.63,0.25,2.7795,1.3485,0.76,0.578,12.0


In [43]:
snails.drop(snails.loc[(snails['full_weight'] < (snails['full_weight'].mean() - (snails['full_weight'].std() * 3))) | (snails['full_weight'] > (snails['full_weight'].mean() + (snails['full_weight'].std() * 3)))].index, inplace=True)

In [44]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4126.0,4126.0,4126.0,4126.0,4126.0,4126.0,4126.0,4097.0,4126.0
mean,0.950698,0.525191,0.409613,0.142233,0.824427,0.359067,0.182368,0.240559,9.901255
std,0.827189,0.115553,0.095837,0.049898,0.474529,0.21428,0.109372,0.13717,3.274728
min,0.0,0.18,0.125,0.0,0.023,0.0085,0.0005,0.0065,0.425
25%,0.0,0.45,0.35,0.115,0.444625,0.1895,0.095,0.1325,8.0
50%,1.0,0.545,0.425,0.145,0.79575,0.33925,0.1715,0.235,9.0
75%,2.0,0.615,0.48,0.165,1.146,0.502,0.254375,0.33,11.0
max,3.0,0.815,0.65,1.13,2.273,1.351,0.564,1.005,29.0


In [45]:
snails.loc[(snails['full_weight'] < (snails['full_weight'].mean() - (snails['full_weight'].std() * 3))) | (snails['full_weight'] > (snails['full_weight'].mean() + (snails['full_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
1193,0.0,0.7,0.575,0.19,2.273,1.095,0.418,0.638,12.0
1426,1.0,0.745,0.57,0.215,2.25,1.1565,0.446,0.558,9.0
1428,1.0,0.815,0.65,0.25,2.255,0.8905,0.42,0.7975,14.0
3008,1.0,0.75,0.615,0.205,2.2635,0.821,0.423,0.726,12.0
3188,0.0,0.72,0.575,0.23,2.2695,0.8835,0.3985,0.665,16.0


Just like with `length` above, we have new outliers - let's remove them now:

In [46]:
snails.drop(snails.loc[(snails['full_weight'] < (snails['full_weight'].mean() - (snails['full_weight'].std() * 3))) | (snails['full_weight'] > (snails['full_weight'].mean() + (snails['full_weight'].std() * 3)))].index, inplace=True)

In [47]:
snails.loc[(snails['full_weight'] < (snails['full_weight'].mean() - (snails['full_weight'].std() * 3))) | (snails['full_weight'] > (snails['full_weight'].mean() + (snails['full_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


In [48]:
snails.drop(snails.loc[(snails['full_weight'] < (snails['full_weight'].mean() - (snails['full_weight'].std() * 3))) | (snails['full_weight'] > (snails['full_weight'].mean() + (snails['full_weight'].std() * 3)))].index, inplace=True)

In [49]:
snails.loc[(snails['full_weight'] < (snails['full_weight'].mean() - (snails['full_weight'].std() * 3))) | (snails['full_weight'] > (snails['full_weight'].mean() + (snails['full_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


# No Shell Weight

In [50]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4121.0,4121.0,4121.0,4121.0,4121.0,4121.0,4121.0,4092.0,4121.0
mean,0.951124,0.524924,0.409386,0.142141,0.822683,0.358326,0.182078,0.240026,9.897981
std,0.827425,0.115359,0.095667,0.049853,0.472164,0.213303,0.10912,0.136373,3.274356
min,0.0,0.18,0.125,0.0,0.023,0.0085,0.0005,0.0065,0.425
25%,0.0,0.45,0.35,0.115,0.4445,0.1895,0.095,0.132375,8.0
50%,1.0,0.545,0.425,0.14,0.7955,0.339,0.1715,0.235,9.0
75%,2.0,0.615,0.48,0.165,1.145,0.5015,0.254,0.33,11.0
max,3.0,0.775,0.605,1.13,2.2385,1.351,0.564,1.005,29.0


In [51]:
snails.loc[snails['no_shell_weight'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


Looks like all values are accounted for, let's check for outliers:

In [52]:
snails.loc[(snails['no_shell_weight'] < (snails['no_shell_weight'].mean() - (snails['no_shell_weight'].std() * 3))) | (snails['no_shell_weight'] > (snails['no_shell_weight'].mean() + (snails['no_shell_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
1048,0.0,0.71,0.565,0.205,2.198,1.012,0.5225,0.5475,11.0
1197,0.0,0.715,0.55,0.19,2.0045,1.0465,0.407,0.5075,12.0
1199,1.0,0.72,0.58,0.195,2.103,1.0265,0.48,0.5375,10.0
1202,0.0,0.725,0.505,0.185,1.978,1.026,0.4255,0.4505,12.0
1206,1.0,0.75,0.57,0.21,2.236,1.109,0.5195,0.545,11.0
1417,0.0,0.705,0.565,0.515,2.21,1.1075,0.4865,0.512,10.0
1527,0.0,0.72,0.565,0.2,2.1055,1.017,0.363,0.494,12.0
1528,0.0,0.725,0.575,0.24,2.21,1.351,0.413,0.5015,13.0
1754,0.0,0.72,0.55,0.205,2.125,1.1455,0.4425,0.511,13.0
1756,1.0,0.725,0.565,0.21,2.1425,1.03,0.487,0.503,14.0


In [53]:
snails.drop(snails.loc[(snails['no_shell_weight'] < (snails['no_shell_weight'].mean() - (snails['no_shell_weight'].std() * 3))) | (snails['no_shell_weight'] > (snails['no_shell_weight'].mean() + (snails['no_shell_weight'].std() * 3)))].index, inplace=True)

In [54]:
snails.loc[(snails['no_shell_weight'] < (snails['no_shell_weight'].mean() - (snails['no_shell_weight'].std() * 3))) | (snails['no_shell_weight'] > (snails['no_shell_weight'].mean() + (snails['no_shell_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
1749,0.0,0.71,0.575,0.215,2.009,0.9895,0.4475,0.502,11.0
1750,1.0,0.71,0.57,0.195,1.9805,0.9925,0.4925,0.48,12.0
2970,0.0,0.69,0.515,0.18,1.8445,0.9815,0.4655,0.341,13.0
2972,0.0,0.72,0.58,0.19,2.0885,0.9955,0.478,0.5305,13.0
3993,0.0,0.72,0.6,0.235,2.2385,0.984,0.411,0.621,12.0


In [55]:
snails.drop(snails.loc[(snails['no_shell_weight'] < (snails['no_shell_weight'].mean() - (snails['no_shell_weight'].std() * 3))) | (snails['no_shell_weight'] > (snails['no_shell_weight'].mean() + (snails['no_shell_weight'].std() * 3)))].index, inplace=True)

In [56]:
snails.loc[(snails['no_shell_weight'] < (snails['no_shell_weight'].mean() - (snails['no_shell_weight'].std() * 3))) | (snails['no_shell_weight'] > (snails['no_shell_weight'].mean() + (snails['no_shell_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


After dropping the rows with outliers, `no_shell_weight` should be in a cleaned state:

In [57]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4092.0,4092.0,4092.0,4092.0,4092.0,4092.0,4092.0,4063.0,4092.0
mean,0.955909,0.523543,0.4083,0.141639,0.813505,0.353236,0.180177,0.238109,9.887972
std,0.827535,0.114567,0.095106,0.049418,0.460952,0.205149,0.10707,0.134856,3.281904
min,0.0,0.18,0.125,0.0,0.023,0.0085,0.0005,0.0065,0.425
25%,0.0,0.45,0.35,0.115,0.442375,0.188375,0.0945,0.131,8.0
50%,1.0,0.54,0.425,0.14,0.7905,0.336,0.171,0.235,9.0
75%,2.0,0.61,0.48,0.165,1.137125,0.497625,0.2515,0.32525,11.0
max,3.0,0.775,0.605,1.13,2.226,0.9655,0.564,1.005,29.0


### Core Weight

In [58]:
snails.loc[snails['core_weight'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


The column `core_weight` looks to have all of its values present and accounted for. Let's look for outliers:

In [59]:
snails.loc[(snails['core_weight'] < (snails['core_weight'].mean() - (snails['core_weight'].std() * 3))) | (snails['core_weight'] > (snails['core_weight'].mean() + (snails['core_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
170,0.0,0.695,0.55,0.215,1.9565,0.7125,0.541,0.59,14.0
1204,0.0,0.735,0.585,0.185,2.124,0.952,0.55,0.5,11.0
1422,0.0,0.72,0.575,0.215,2.173,0.9515,0.564,0.5365,12.0
1757,1.0,0.73,0.56,0.19,1.9425,0.799,0.5195,0.5655,11.0
1759,1.0,0.74,0.565,0.205,2.119,0.9655,0.5185,0.482,12.0
1925,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545
1964,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545
2003,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545
2042,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545
2081,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545,0.545


In [60]:
snails.drop(snails.loc[(snails['core_weight'] < (snails['core_weight'].mean() - (snails['core_weight'].std() * 3))) | (snails['core_weight'] > (snails['core_weight'].mean() + (snails['core_weight'].std() * 3)))].index, inplace=True)

In [61]:
snails.loc[(snails['core_weight'] < (snails['core_weight'].mean() - (snails['core_weight'].std() * 3))) | (snails['core_weight'] > (snails['core_weight'].mean() + (snails['core_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
1985,0.0,0.76,0.605,0.215,2.173,0.801,0.4915,0.646,13.0
2162,1.0,0.71,0.565,0.195,1.817,0.785,0.492,0.49,11.0


In [62]:
snails.drop(snails.loc[(snails['core_weight'] < (snails['core_weight'].mean() - (snails['core_weight'].std() * 3))) | (snails['core_weight'] > (snails['core_weight'].mean() + (snails['core_weight'].std() * 3)))].index, inplace=True)

In [63]:
snails.loc[(snails['core_weight'] < (snails['core_weight'].mean() - (snails['core_weight'].std() * 3))) | (snails['core_weight'] > (snails['core_weight'].mean() + (snails['core_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
355,0.0,0.7,0.58,0.205,2.13,0.7415,0.49,0.58,20.0


All outliers for `core_weight` have been removed.

### Shell Weight

In [64]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,4068.0,4068.0,4068.0,4068.0,4068.0,4068.0,4068.0,4039.0,4068.0
mean,0.958444,0.522954,0.40742,0.140097,0.811412,0.351348,0.178076,0.236346,9.914934
std,0.82894,0.114418,0.09468,0.043404,0.457983,0.203898,0.103811,0.133241,3.243427
min,0.0,0.18,0.125,0.0,0.023,0.0085,0.0005,0.0065,0.425
25%,0.0,0.45,0.35,0.115,0.441,0.187375,0.094,0.13025,8.0
50%,1.0,0.54,0.425,0.14,0.791,0.3345,0.17,0.2315,9.0
75%,2.0,0.61,0.48,0.165,1.136125,0.495,0.249125,0.325,11.0
max,3.0,0.775,0.6,1.13,2.226,0.96,0.49,1.005,29.0


It looks like `shell_weight` is missing 29 values:

In [65]:
snails.loc[snails['shell_weight'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
3,0.0,0.44,0.365,0.125,0.516,0.2155,0.114,,10.0
42,2.0,0.24,0.175,0.045,0.07,0.0315,0.0235,,5.0
81,0.0,0.62,0.51,0.175,1.615,0.5105,0.192,,12.0
120,1.0,0.47,0.365,0.105,0.4205,0.163,0.1035,,9.0
159,1.0,0.575,0.46,0.165,1.124,0.2985,0.1785,,13.0
198,0.0,0.56,0.45,0.16,0.922,0.432,0.178,,15.0
276,1.0,0.66,0.53,0.185,1.3485,0.493,0.245,,12.0
315,2.0,0.45,0.355,0.11,0.4585,0.194,0.067,,8.0
354,0.0,0.635,0.515,0.17,1.275,0.509,0.286,,16.0
393,2.0,0.365,0.295,0.095,0.25,0.1075,0.0545,,9.0


In [66]:
snails.loc[snails['shell_weight'].isna()] = snails['shell_weight'].median()

In [67]:
snails.loc[snails['shell_weight'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


Now that we have replaced all of the NaN values with the median value for the series, let's check for outliers within `shell_weight`:

In [68]:
snails.loc[(snails['shell_weight'] < (snails['shell_weight'].mean() - (snails['shell_weight'].std() * 3))) | (snails['shell_weight'] > (snails['shell_weight'].mean() + (snails['shell_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
129,0.0,0.71,0.54,0.165,1.959,0.7665,0.261,0.78,18.0
157,0.0,0.645,0.485,0.215,1.514,0.546,0.2615,0.635,16.0
163,1.0,0.725,0.56,0.21,2.141,0.65,0.398,1.005,18.0
164,1.0,0.65,0.545,0.23,1.752,0.5605,0.2895,0.815,16.0
166,1.0,0.725,0.575,0.175,2.124,0.765,0.4515,0.85,20.0
167,1.0,0.68,0.57,0.205,1.842,0.625,0.408,0.65,20.0
168,0.0,0.705,0.56,0.22,1.981,0.8175,0.3085,0.76,14.0
277,0.0,0.61,0.5,0.24,1.642,0.532,0.3345,0.69,18.0
334,1.0,0.74,0.6,0.195,1.974,0.598,0.4085,0.71,16.0
1823,0.0,0.73,0.575,0.21,2.069,0.9285,0.409,0.643,11.0


In [69]:
snails.drop(snails.loc[(snails['shell_weight'] < (snails['shell_weight'].mean() - (snails['shell_weight'].std() * 3))) | (snails['shell_weight'] > (snails['shell_weight'].mean() + (snails['shell_weight'].std() * 3)))].index, inplace=True)

In [70]:
snails.loc[(snails['shell_weight'] < (snails['shell_weight'].mean() - (snails['shell_weight'].std() * 3))) | (snails['shell_weight'] > (snails['shell_weight'].mean() + (snails['shell_weight'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
169,1.0,0.68,0.515,0.175,1.6185,0.5125,0.409,0.62,12.0
468,1.0,0.68,0.55,0.21,1.7445,0.5975,0.305,0.625,17.0
2265,1.0,0.72,0.575,0.215,2.226,0.8955,0.405,0.62,13.0
2973,0.0,0.735,0.59,0.205,2.087,0.909,0.474,0.625,12.0


Almost done! Now we just have to check the age column!

### Age

In [71]:
snails.loc[snails['age'].isna()]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age


The column `age` has all of its values, let's check for outliers next:

In [72]:
snails.loc[(snails['age'] < (snails['age'].mean() - (snails['age'].std() * 3))) | (snails['age'] > (snails['age'].mean() + (snails['age'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
6,1.0,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20.0
72,1.0,0.595,0.475,0.17,1.247,0.48,0.225,0.425,20.0
83,0.0,0.595,0.475,0.16,1.3175,0.408,0.234,0.58,21.0
181,0.0,0.64,0.51,0.175,1.368,0.515,0.266,0.57,21.0
232,0.0,0.625,0.505,0.215,1.4455,0.496,0.287,0.435,22.0
270,1.0,0.64,0.525,0.215,1.779,0.4535,0.2855,0.55,22.0
275,0.0,0.655,0.54,0.215,1.844,0.7425,0.327,0.585,22.0
278,0.0,0.635,0.525,0.205,1.484,0.55,0.3115,0.43,20.0
294,0.0,0.6,0.495,0.195,1.0575,0.384,0.19,0.375,26.0
310,1.0,0.63,0.485,0.19,1.2435,0.4635,0.3055,0.39,21.0


Wow - `age` has a high number of outliers +/- 3 Standard Deviations from the Mean. Let's remove them:

In [73]:
snails.drop(snails.loc[(snails['age'] < (snails['age'].mean() - (snails['age'].std() * 3))) | (snails['age'] > (snails['age'].mean() + (snails['age'].std() * 3)))].index, inplace=True)

In [74]:
snails.loc[(snails['age'] < (snails['age'].mean() - (snails['age'].std() * 3))) | (snails['age'] > (snails['age'].mean() + (snails['age'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
3,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315
9,1.0000,0.5500,0.4400,0.1500,0.8945,0.3145,0.1510,0.3200,19.0000
33,1.0000,0.6800,0.5500,0.1750,1.7980,0.8150,0.3925,0.4550,19.0000
42,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315
81,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315,0.2315
...,...,...,...,...,...,...,...,...,...
3373,0.0000,0.5400,0.4050,0.1550,0.9715,0.3225,0.1940,0.2900,19.0000
3381,0.0000,0.5450,0.4350,0.1650,0.9955,0.3245,0.2665,0.3250,19.0000
3392,0.0000,0.6450,0.5150,0.1850,1.4605,0.5835,0.3155,0.4100,19.0000
3865,0.0000,0.5250,0.3950,0.1650,0.7820,0.2850,0.1405,0.2850,19.0000


In [75]:
snails.drop(snails.loc[(snails['age'] < (snails['age'].mean() - (snails['age'].std() * 3))) | (snails['age'] > (snails['age'].mean() + (snails['age'].std() * 3)))].index, inplace=True)

In [76]:
snails.loc[(snails['age'] < (snails['age'].mean() - (snails['age'].std() * 3))) | (snails['age'] > (snails['age'].mean() + (snails['age'].std() * 3)))]

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
32,0.0,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,18.0
318,1.0,0.58,0.455,0.155,0.8365,0.315,0.1385,0.32,18.0
357,1.0,0.645,0.525,0.19,1.8085,0.7035,0.3885,0.395,18.0
360,0.0,0.605,0.49,0.18,1.227,0.48,0.287,0.35,18.0
426,1.0,0.61,0.485,0.165,1.0915,0.3935,0.2435,0.33,18.0
429,1.0,0.575,0.45,0.17,1.0475,0.3775,0.1705,0.385,18.0
430,1.0,0.57,0.45,0.175,0.9555,0.38,0.1665,0.295,18.0
449,1.0,0.565,0.455,0.15,0.8205,0.365,0.159,0.26,18.0
592,2.0,0.525,0.41,0.175,0.874,0.3585,0.207,0.205,18.0
613,2.0,0.495,0.37,0.125,0.4775,0.185,0.0705,0.169,18.0


In [77]:
snails.describe()

Unnamed: 0,gender,length,diameter,height,full_weight,no_shell_weight,core_weight,shell_weight,age
count,3916.0,3916.0,3916.0,3916.0,3916.0,3916.0,3916.0,3916.0,3916.0
mean,0.969867,0.52096,0.40528,0.138025,0.79926,0.348062,0.175059,0.22945,9.672114
std,0.83298,0.11428,0.09455,0.039814,0.45017,0.203851,0.102075,0.126019,2.73565
min,0.0,0.18,0.125,0.0,0.023,0.0085,0.0005,0.0065,3.0
25%,0.0,0.45,0.345,0.115,0.438,0.1825,0.091875,0.1275,8.0
50%,1.0,0.54,0.42,0.14,0.78,0.329,0.16675,0.225,9.0
75%,2.0,0.61,0.475,0.165,1.129,0.4935,0.246,0.317125,11.0
max,3.0,0.76,0.6,1.13,2.226,0.96,0.4875,0.625,18.0


In [78]:
snails.hist()

<IPython.core.display.Javascript object>

array([[<Axes: title={'center': 'gender'}>,
        <Axes: title={'center': 'length'}>,
        <Axes: title={'center': 'diameter'}>],
       [<Axes: title={'center': 'height'}>,
        <Axes: title={'center': 'full_weight'}>,
        <Axes: title={'center': 'no_shell_weight'}>],
       [<Axes: title={'center': 'core_weight'}>,
        <Axes: title={'center': 'shell_weight'}>,
        <Axes: title={'center': 'age'}>]], dtype=object)

We can see now that the distributions in the histograms for all nine columns looked more normalized, including `height`, `shell_weight`, and `age`. This data set should be ready for our model.

In [79]:
snails.to_csv('jose_vila_cleaned_assignment_6_data.csv')