# Data Acquisition

### 1. Use a python module containing datasets as a source for the iris data. Create a pandas dataframe, df_iris, from this data.

In [1]:
from pydataset import data
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
iris = data('iris')

- #### print the first 3 rows

In [3]:
iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


- #### print the number of rows and columns (shape)

In [4]:
iris.shape

(150, 5)

- #### print the column names

In [5]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
Sepal.Length    150 non-null float64
Sepal.Width     150 non-null float64
Petal.Length    150 non-null float64
Petal.Width     150 non-null float64
Species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


- #### print the data type of each column

In [6]:
iris.dtypes

Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object

- #### print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

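I would NOT recommend rescaling the data based on these statistics: all four measurements fall within a similar range (minimum 0.1, maximum 7.9) with standard deviations between roughly 0.4 and 1.8, so no single feature dominates the others purely because of its scale.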

In [7]:
iris.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sepal.Length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
Sepal.Width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
Petal.Length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
Petal.Width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [8]:
iris.isnull().sum()

Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
Species         0
dtype: int64

### 2. Read the Table1_CustDetails table from the excel module dataset, Excel_Exercises.xlsx, into a dataframe, df_excel

In [9]:
customer_df = pd.read_excel('Excel_Exercises.xlsx', sheet_name=0)

- #### assign the first 100 rows to a new dataframe, df_excel_sample

In [10]:
df_excel_sample = customer_df.head(100)

- #### print the number of rows of your original dataframe

In [11]:
customer_df.shape[0]

7049

- #### print the first 5 column names

In [12]:
df_excel_sample.columns[:5]

Index(['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents'], dtype='object')

- #### print the column names that have a data type of object

In [13]:
df_excel_sample.dtypes

customer_id           object
gender                object
is_senior_citizen      int64
partner               object
dependents            object
phone_service          int64
internet_service       int64
contract_type          int64
payment_type          object
monthly_charges      float64
total_charges        float64
churn                 object
dtype: object

In [14]:
df_excel_sample.dtypes[df_excel_sample.dtypes == 'object']

customer_id     object
gender          object
partner         object
dependents      object
payment_type    object
churn           object
dtype: object

In [15]:
df_excel_sample.dtypes[df_excel_sample.dtypes == 'object']

customer_id     object
gender          object
partner         object
dependents      object
payment_type    object
churn           object
dtype: object

- #### compute the range for each of the numeric variables.

In [16]:
# Range (max - min) of each numeric column. The numeric subset is selected
# once and reused for both the maximum and the minimum.
numeric_sample = df_excel_sample.select_dtypes(['int64', 'float64'])
numeric_sample.max() - numeric_sample.min()

is_senior_citizen       1.00
phone_service           2.00
internet_service        2.00
contract_type           2.00
monthly_charges        97.40
total_charges        8476.85
dtype: float64

### 3. Read the data from this google sheet into a dataframe, df_google

In [17]:
# Browser ("edit") address of the sheet, as copied from the address bar.
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'
# Swap the edit fragment for the CSV export endpoint, keeping the same gid,
# so pandas can read the sheet directly over HTTP.
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')
df_google = pd.read_csv(csv_export_url)

- #### print the first 3 rows

In [18]:
df_google.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


- #### print the number of rows and columns


In [19]:
df_google.shape

(891, 12)

- #### print the column names

In [20]:
df_google.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

- #### print the data type of each column

In [21]:
df_google.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

- #### print the summary statistics for each of the numeric variables

In [22]:
df_google.PassengerId.describe()

count    891.000000
mean     446.000000
std      257.353842
min        1.000000
25%      223.500000
50%      446.000000
75%      668.500000
max      891.000000
Name: PassengerId, dtype: float64

In [23]:
df_google.Survived.describe()

count    891.000000
mean       0.383838
std        0.486592
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

In [24]:
df_google.Pclass.describe()

count    891.000000
mean       2.308642
std        0.836071
min        1.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        3.000000
Name: Pclass, dtype: float64

In [25]:
df_google.Age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [26]:
df_google.SibSp.describe()

count    891.000000
mean       0.523008
std        1.102743
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64

In [27]:
df_google.Parch.describe()

count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

In [28]:
df_google.Fare.describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

- #### print the unique values for each of your categorical variables

In [29]:
df_google.Name.describe()

count                      891
unique                     891
top       Connors, Mr. Patrick
freq                         1
Name: Name, dtype: object

In [30]:
df_google.Sex.describe()

count      891
unique       2
top       male
freq       577
Name: Sex, dtype: object

In [31]:
df_google.Cabin.describe()

count     204
unique    147
top        G6
freq        4
Name: Cabin, dtype: object

In [32]:
df_google.Embarked.describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

# Data Preparation

The end product of this exercise should be the specified functions in a python script named prepare.py. Do these in your classification_exercises.ipynb first, then transfer to the prepare.py file.

1. Iris Data

    - Use the function defined in acquire.py to load the iris data.
    - Drop the species_id and measurement_id columns.
    - Rename the species_name column to just species.
    - Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?
    - Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [33]:
import pandas as pd
import seaborn as sns
import numpy as np

import pandas_profiling

import env
import acquire

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [34]:
iris = acquire.get_iris_data()
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [35]:
iris.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [36]:
train1, test1 = train_test_split(iris, train_size=.8, random_state=123)

In [37]:
# Learn the species -> integer mapping from the training rows only, then
# overwrite the column with its integer codes.
int_encoder = LabelEncoder().fit(train1['species'])
train1['species'] = int_encoder.transform(train1['species'])

In [38]:
train1.species.value_counts()

1    44
2    39
0    37
Name: species, dtype: int64

In [39]:
train1.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
130,7.4,2.8,6.1,1.9,2
119,6.0,2.2,5.0,1.5,2
29,4.7,3.2,1.6,0.2,0
0,5.1,3.5,1.4,0.2,0
62,6.0,2.2,4.0,1.0,1


In [40]:
def prep_iris(df):
    """Label-encode the species of the training split of an iris frame.

    Splits ``df`` 80/20 with a fixed seed (``random_state=123``), fits a
    ``LabelEncoder`` on the training split's ``species`` column and returns
    the integer-encoded training species.

    Parameters
    ----------
    df : pandas.DataFrame
        Iris data containing a ``species`` column.

    Returns
    -------
    pandas.Series
        Integer-encoded ``species`` values for the training rows, indexed
        like the training split.

    NOTE(review): the exercise text asks for the whole transformed frame;
    only the encoded training column is returned, as before, so existing
    callers are unaffected -- confirm what callers need before widening it.
    """
    # Split the frame that was passed in, not the notebook-global `iris`.
    train, _ = train_test_split(df, train_size=.8, random_state=123)
    # Explicit copy: the assignment below then never writes into a slice of
    # the caller's frame.
    train = train.copy()
    int_encoder = LabelEncoder()
    train['species'] = int_encoder.fit_transform(train['species'])
    return train.species

2. Titanic Data

    - Use the function you defined in acquire.py to load the titanic data set.
    - Handle the missing values in the embark_town and embarked columns.
    - Remove the deck column.
    - Use a label encoder to transform the embarked column.
    - Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?
    - Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [41]:
titanic = acquire.get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [42]:
dft = titanic

In [43]:
# report_titanic = titanic.profile_report()

In [44]:
dft.dtypes

passenger_id      int64
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class            object
deck             object
embark_town      object
alone             int64
dtype: object

In [45]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
deck            203 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [46]:
dft.shape

(891, 13)

In [47]:
dft.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.602694
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.489615
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [48]:
dft.age.value_counts(bins=10, sort=True)

(16.336, 24.294]    177
(24.294, 32.252]    169
(32.252, 40.21]     118
(40.21, 48.168]      70
(0.339, 8.378]       54
(8.378, 16.336]      46
(48.168, 56.126]     45
(56.126, 64.084]     24
(64.084, 72.042]      9
(72.042, 80.0]        2
Name: age, dtype: int64

In [49]:
dft.embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: embarked, dtype: int64

In [50]:
# Rebind instead of dropping in place: `dft` is an alias of `titanic`, so an
# in-place drop would also change the frame displayed earlier. With
# errors='ignore' the cell can be re-run after `deck` is already gone.
dft = dft.drop(columns=['deck'], errors='ignore')

In [51]:
# Rewrite every value pandas treats as missing as NaN. Presumably this turns
# any None placeholders into the np.nan marker the SimpleImputer below is
# configured to match -- TODO confirm against what acquire.get_titanic_data()
# actually returns.
dft.fillna(np.nan, inplace=True)

In [52]:
train, test = train_test_split(dft, train_size=.8, random_state=123)

In [53]:
train.embarked.value_counts(dropna=False)

S      515
C      128
Q       67
NaN      2
Name: embarked, dtype: int64

In [54]:
# Learn the most frequent embarked value from the training rows only, then
# use it to fill the missing entries in both splits.
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imp_mode = imp_mode.fit(train[['embarked']])
train['embarked'] = imp_mode.transform(train[['embarked']])
test['embarked'] = imp_mode.transform(test[['embarked']])

In [55]:
train.embarked.value_counts()

S    517
C    128
Q     67
Name: embarked, dtype: int64

In [56]:
# imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# train['age'] = imp_median.fit_transform(train['age'])
# train.age.isnull().sum()

Encoding:
 - integer
 - one hot encoding

In [57]:
# Fit the label -> integer mapping on the training column, then replace the
# labels with their integer codes.
int_encoder = LabelEncoder().fit(train['embarked'])
train['embarked'] = int_encoder.transform(train['embarked'])

In [58]:
train.embarked.value_counts()

2    517
0    128
1     67
Name: embarked, dtype: int64

In [59]:
embarked_array = np.array(train.embarked)
embarked_array[0:5]

array([0, 1, 0, 1, 0])

In [60]:
embarked_array = embarked_array.reshape(len(embarked_array), 1)

In [61]:
# sparse=False asks for a dense ndarray rather than a scipy sparse matrix.
# NOTE(review): newer scikit-learn releases rename this argument to
# `sparse_output` -- confirm against the installed version before upgrading.
ohe = OneHotEncoder(sparse=False, categories='auto')

In [62]:
embarked_ohe = ohe.fit_transform(embarked_array)
embarked_ohe

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [63]:
test.embarked = int_encoder.transform(test.embarked)

In [64]:
embarked_array = np.array(test.embarked).reshape(len(test.embarked), 1)

In [65]:
embarked_test_ohe = ohe.transform(embarked_array)

In [66]:
embarked_test_ohe[0:5]

array([[0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

Scaling:
- age
- fare

In [67]:
import split_scale

In [68]:
train_age_fare = train[['age', 'fare']]

In [69]:
test_age_fare = test[['age', 'fare']]

In [70]:
train_age_fare.head()

Unnamed: 0,age,fare
329,16.0,57.9792
749,31.0,7.75
203,45.5,7.225
421,21.0,7.7333
97,23.0,63.3583


In [71]:
scaler, train_age_fare_scaled, test_age_fare_scaled = split_scale.my_minmax_scaler(train_age_fare, test_age_fare)

In [72]:
train_age_fare_scaled.head()

Unnamed: 0,age,fare
329,0.195778,0.113168
749,0.384267,0.015127
203,0.566474,0.014102
421,0.258608,0.015094
97,0.28374,0.123667


In [73]:
test_age_fare_scaled.head()

Unnamed: 0,age,fare
172,0.007288,0.021731
524,,0.01411
452,0.371701,0.054164
170,0.761247,0.065388
620,0.334004,0.028213


MinMaxScaler preserves the shape of the original distribution. It doesn’t meaningfully change the information embedded in the original data.

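MinMaxScaler doesn’t reduce the importance of outliers. Because the minimum and maximum define the scale, a single extreme value (for example the maximum fare of 512.33, against a 75th percentile of 31.00) squeezes most observations toward 0, so you might not want to use it on a column with large outliers.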

Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied

In [74]:
def prep_titanic(df):
    """Prepare a titanic frame and return the one-hot encoded test ``embarked``.

    Steps, applied to copies so the frame passed in is left untouched:

    1. Drop the ``deck`` column (skipped when it is already absent).
    2. Rewrite missing values as ``np.nan`` and split 80/20 with
       ``random_state=123``.
    3. Fill missing ``embarked`` values with the most frequent value of the
       training split.
    4. Label-encode ``embarked`` with an encoder fitted on the training
       split, then one-hot encode those integer codes.
    5. Pass ``age`` and ``fare`` of both splits to
       ``split_scale.my_minmax_scaler`` (presumably a min-max scaler fitted
       on the training split -- verify against that module).

    Parameters
    ----------
    df : pandas.DataFrame
        Untransformed titanic data containing at least the ``embarked``,
        ``age`` and ``fare`` columns.

    Returns
    -------
    numpy.ndarray
        One-hot encoding of the test split's ``embarked`` column: one row per
        test passenger, one column per category seen in the training split.

    NOTE(review): the exercise text asks for the transformed data itself. The
    return value is kept as it was so existing callers are unaffected, and
    the scaled age/fare frames are computed but not returned -- confirm what
    callers need before widening the return value.
    """
    # Use the frame that was passed in rather than the notebook-global `dft`.
    # drop() returns a new frame, so the caller's data is not mutated, and
    # errors='ignore' avoids a KeyError when `deck` has already been removed.
    prepped = df.drop(columns=['deck'], errors='ignore')
    # Normalise every missing value to np.nan, the marker the imputer matches.
    prepped = prepped.fillna(np.nan)

    train, test = train_test_split(prepped, train_size=.8, random_state=123)
    # Explicit copies: the column assignments below then never write into a
    # slice of `prepped`.
    train, test = train.copy(), test.copy()

    # Impute embarked with the mode learned from the training split only.
    imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp_mode.fit(train[['embarked']])
    # transform() returns an (n, 1) array; ravel() flattens it to one column.
    train['embarked'] = imp_mode.transform(train[['embarked']]).ravel()
    test['embarked'] = imp_mode.transform(test[['embarked']]).ravel()

    # Integer-encode embarked; the mapping comes from the training split.
    int_encoder = LabelEncoder()
    train['embarked'] = int_encoder.fit_transform(train['embarked'])
    test['embarked'] = int_encoder.transform(test['embarked'])

    # One-hot encode the integer codes; the encoder expects a 2-D column.
    ohe = OneHotEncoder(sparse=False, categories='auto')
    ohe.fit(np.array(train['embarked']).reshape(-1, 1))
    embarked_test_ohe = ohe.transform(np.array(test['embarked']).reshape(-1, 1))

    # Scale age and fare. The helper lives in another module, so the call is
    # kept as is; its results are not part of the return value (see NOTE).
    scaler, train_age_fare_scaled, test_age_fare_scaled = split_scale.my_minmax_scaler(
        train[['age', 'fare']], test[['age', 'fare']]
    )

    return embarked_test_ohe