In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from pydataset import data

# Classification
## Exercise 1
In a Jupyter notebook, classification_exercises.ipynb, use a Python module that ships with datasets (pydataset or seaborn) as the source of the iris data. Create a pandas DataFrame, df_iris, from this data.

- print the first 3 rows
- print the number of rows and columns (shape)
- print the column names
- print the data type of each column
- print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [4]:
# load the iris dataset through pydataset's data() helper and keep it as df_iris
df_iris = data('iris')

In [5]:
# preview the dataset: first 3 rows by position
df_iris.iloc[:3]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


In [6]:
# shape is a (rows, columns) tuple
df_iris.shape

(150, 5)

In [7]:
# column names, returned as a pandas Index
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [8]:
# data type of each column
# info() also reports the non-null count per column and the memory usage
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [9]:
# summary statistics for the numeric columns,
# transposed so each variable is a row and each statistic a column
df_iris.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sepal.Length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
Sepal.Width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
Petal.Length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
Petal.Width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [8]:
# show_doc = True displays the dataset's documentation (used here to check the units of measurement)
data('iris', show_doc = True)

iris

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Edgar Anderson's Iris Data

### Description

This famous (Fisher's or Anderson's) iris data set gives the measurements in
centimeters of the variables sepal length and width and petal length and
width, respectively, for 50 flowers from each of 3 species of iris. The
species are _Iris setosa_, _versicolor_, and _virginica_.

### Usage

    iris
    iris3

### Format

`iris` is a data frame with 150 cases (rows) and 5 variables (columns) named
`Sepal.Length`, `Sepal.Width`, `Petal.Length`, `Petal.Width`, and `Species`.

`iris3` gives the same data arranged as a 3-dimensional array of size 50 by 4
by 3, as represented by S-PLUS. The first dimension gives the case number
within the species subsample, the second the measurements with names `Sepal
L.`, `Sepal W.`, `Petal L.`, and `Petal W.`, and the third the species.

### Source

Fisher, R. A. (1936) The use of multiple measurements in taxonomi

##### Rescale or Not rescale? 
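Based on the summary statistics, I would not rescale this data: all four measurements are in centimeters and fall within a similar range (roughly 0.1 to 7.9 cm), so no single feature dominates by scale.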


<hr style="border-top: 10px groove lime; margin-top: 1px; margin-bottom: 1px"></hr>


## Exercise 2
Read the Table1_CustDetails table from the Excel_Exercises.xlsx file into a dataframe named df_excel.

- assign the first 100 rows to a new dataframe, df_excel_sample
- print the number of rows of your original dataframe
- print the first 5 column names
- print the column names that have a data type of object
- compute the range for each of the numeric variables.

In [11]:
# read the Table1_CustDetails sheet of the workbook into df_excel
# NOTE(review): the exercise text names Excel_Exercises.xlsx but this reads telco_churn_data.xlsx — confirm this is the intended workbook
df_excel = pd.read_excel('telco_churn_data.xlsx', sheet_name = "Table1_CustDetails")

In [12]:
# preview the first 5 rows of the customer details table
df_excel.iloc[:5]

Unnamed: 0,customer_id,clean_cust_id,gender,is_senior_citizen,partner,dependents,phone_service,internet_service,contract_type,payment_type,...,has_phone_has_internet,partner_dependents,monthly_tenure_not_rounded,start_date,contract_description,phone_service_description,internet_service_description,only_phone,only_internet,service_type
0,1371-DWPAZ,1371-DWPAZ,Female,0,Yes,Yes,0,1,2,Credit card (automatic),...,False,3,#REF!,#REF!,2 Year,No Phone Service,DSL,False,True,Only Internet
1,4472-LVYGI,4472-LVYGI,Female,0,Yes,Yes,0,1,2,Bank transfer (automatic),...,False,3,0,2021-04-29 00:00:00,2 Year,No Phone Service,DSL,False,True,Only Internet
2,2520-SGTTA,2520-SGTTA,Female,0,Yes,Yes,1,0,2,Mailed check,...,False,3,0,2021-04-29 00:00:00,2 Year,One Line,No Internet Service,True,False,Only Phone
3,2923-ARZLG,2923-ARZLG,Male,0,Yes,Yes,1,0,1,Mailed check,...,False,3,0,2021-04-29 00:00:00,1 Year,One Line,No Internet Service,True,False,Only Phone
4,3115-CZMZD,3115-CZMZD,Male,0,No,Yes,1,0,2,Mailed check,...,False,2,0,2021-04-29 00:00:00,2 Year,One Line,No Internet Service,True,False,Only Phone


In [13]:
# assign the first 100 rows (by position) to a new dataframe
df_excel_sample = df_excel.iloc[:100]

In [14]:
# confirm the sample has 100 rows; shape is (rows, columns)
df_excel_sample.shape

(100, 28)

In [15]:
# shape[0] is the row count of the original (unsampled) dataframe
print(f"Number of rows in my original data frame is {df_excel.shape[0]}.")

Number of rows in my original data frame is 7043.


In [16]:
# print the first 5 column names in df_excel (one per line)
# slicing the Index, rather than indexing positions 0..4, also works
# when the frame has fewer than 5 columns instead of raising IndexError
for column_name in df_excel.columns[:5]:
    print(column_name)

customer_id
clean_cust_id
gender
is_senior_citizen
partner


In [17]:
# goal: print the columns that have data type object
# step 1: look at the dtype of every column
df_excel.dtypes

customer_id                      object
clean_cust_id                    object
gender                           object
is_senior_citizen                 int64
partner                          object
dependents                       object
phone_service                     int64
internet_service                  int64
contract_type                     int64
payment_type                     object
monthly_charges                 float64
total_charges                   float64
churn                            object
monthly_tenure                    int64
is_female                          bool
has_churned                        bool
has_phone                          bool
has_internet                       bool
has_phone_has_internet             bool
partner_dependents                int64
monthly_tenure_not_rounded       object
start_date                       object
contract_description             object
phone_service_description        object
internet_service_description     object


In [18]:
# check the type of the previous output
# it is a pandas Series, so it can be filtered like any other Series
type(df_excel.dtypes)

pandas.core.series.Series

In [19]:
# keep the dtypes Series (index = column name, value = dtype) in a variable for filtering
coltypes = df_excel.dtypes

In [20]:
# build a boolean mask that is True where a column's dtype is object,
# then use it to keep only those entries of the dtypes Series
is_object_dtype = coltypes.values == 'object'
coltypes[is_object_dtype]

customer_id                     object
clean_cust_id                   object
gender                          object
partner                         object
dependents                      object
payment_type                    object
churn                           object
monthly_tenure_not_rounded      object
start_date                      object
contract_description            object
phone_service_description       object
internet_service_description    object
service_type                    object
dtype: object

In [21]:
# goal: compute the range of the numeric values
# first see what we're working with: every column whose dtype is NOT object
coltypes[~(coltypes.values == 'object')]

is_senior_citizen           int64
phone_service               int64
internet_service            int64
contract_type               int64
monthly_charges           float64
total_charges             float64
monthly_tenure              int64
is_female                    bool
has_churned                  bool
has_phone                    bool
has_internet                 bool
has_phone_has_internet       bool
partner_dependents          int64
only_phone                   bool
only_internet                bool
dtype: object

In [22]:
# monthly_charges, total_charges, monthly_tenure are all continuous
# all others are categorical
#
# trial run: .min() on a list of columns returns the minimum of each one,
# so the next step is to build a list of all the columns with numerical values
# TODO: look at Faith's solution in the Codeup classroom —
# she added a column called range to the .describe() table
df_excel[['monthly_charges', 'contract_type']].min()

monthly_charges    18.25
contract_type       0.00
dtype: float64

In [23]:
# use select_dtypes to keep only the int64 and float64 columns
# (bool and object columns are not in the include list, so they are left out)
df_excel.select_dtypes(include = ['int64', 'float64'])

Unnamed: 0,is_senior_citizen,phone_service,internet_service,contract_type,monthly_charges,total_charges,monthly_tenure,partner_dependents
0,0,0,1,2,56.05,0.00,0,3
1,0,0,1,2,52.55,0.00,0,3
2,0,1,0,2,20.00,0.00,0,3
3,0,1,0,1,19.70,0.00,0,3
4,0,1,0,2,20.25,0.00,0,2
...,...,...,...,...,...,...,...,...
7038,0,2,2,2,106.00,7723.70,73,3
7039,0,2,1,2,65.30,4759.75,73,1
7040,1,2,2,2,104.15,7689.95,74,1
7041,0,2,2,1,117.80,8684.80,74,1


In [115]:
# keep the int64/float64 columns in their own variable
numeric = df_excel.select_dtypes(include = ['int64', 'float64'])
# confirm the selection is still a DataFrame
type(numeric)

pandas.core.frame.DataFrame

In [119]:
# here is my list of numeric column names (a pandas Index taken from the numeric frame)
numeric_column_names = numeric.columns
numeric_column_names

Index(['is_senior_citizen', 'phone_service', 'internet_service',
       'contract_type', 'monthly_charges', 'total_charges', 'monthly_tenure',
       'partner_dependents'],
      dtype='object')

In [122]:
# select every row of the numeric columns and take the per-column minimum
mins = df_excel.loc[:, numeric_column_names].min()
mins

is_senior_citizen      0.00
phone_service          0.00
internet_service       0.00
contract_type          0.00
monthly_charges       18.25
total_charges          0.00
monthly_tenure         0.00
partner_dependents     0.00
dtype: float64

In [123]:
# same selection, this time taking the per-column maximum
maxes = df_excel.loc[:, numeric_column_names].max()
maxes

is_senior_citizen        1.00
phone_service            2.00
internet_service         2.00
contract_type            2.00
monthly_charges        118.75
total_charges         8684.80
monthly_tenure          79.00
partner_dependents       3.00
dtype: float64

In [124]:
# range of each numeric column = max - min (aligned on column name)
maxes.sub(mins)

is_senior_citizen        1.0
phone_service            2.0
internet_service         2.0
contract_type            2.0
monthly_charges        100.5
total_charges         8684.8
monthly_tenure          79.0
partner_dependents       3.0
dtype: float64


<hr style="border-top: 10px groove lime; margin-top: 1px; margin-bottom: 1px"></hr>


## Exercise 3
Read the data from this google sheet into a dataframe, df_google

- print the first 3 rows
- print the number of rows and columns
- print the column names
- print the data type of each column
- print the summary statistics for each of the numeric variables
- print the unique values for each of your categorical variables

In [24]:
# URL of the Google Sheet, as copied from the browser
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'
# swap the '/edit#gid=' part for the CSV export form of the URL, keeping the same gid
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')
# read the exported CSV straight from the URL into a dataframe
df_google = pd.read_csv(csv_export_url)


In [25]:
# first 3 rows, selected by position
df_google.iloc[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [26]:
# shape is (rows, columns): index 0 is the row count, index 1 the column count
print(f'There are {df_google.shape[0]} rows, and {df_google.shape[1]} columns.')

There are 891 rows, and 12 columns.


In [27]:
# column names, returned as a pandas Index
df_google.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [28]:
# use .dtypes to get every column name with its data type
# (the result is a Series indexed by column name)
df_google.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [29]:
# summary statistics with .describe(); only the numeric columns appear in the result
df_google.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [30]:
# unique values for categorical variables
# I'm choosing not to include identifiers (PassengerId, Name, Ticket)
# Cabin is borderline, but I'll include it

# Survived takes two values: 0 and 1
df_google['Survived'].unique()

array([0, 1])

In [31]:
# how many passengers fall under each Survived value
df_google['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [148]:
# Pclass (passenger class) has 3 values: 1st, 2nd and 3rd class
df_google['Pclass'].unique()

array([3, 1, 2])

In [149]:
# number of passengers in each class
df_google['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [150]:
# Sex has 2 values: 'male' and 'female'
df_google['Sex'].unique()

array(['male', 'female'], dtype=object)

In [151]:
# number of passengers of each sex
df_google['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [154]:
# SibSp takes the values 1, 0, 3, 4, 2, 5, 8 (in order of first appearance)
df_google['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [155]:
# number of passengers for each SibSp value
df_google['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [156]:
# Parch takes the values 0, 1, 2, 5, 3, 4, 6 (in order of first appearance)
df_google['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6])

In [157]:
# number of passengers for each Parch value
df_google['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [32]:
# Embarked (where they got on) has S, C and Q
# dropna=False keeps the missing values as their own NaN row in the counts
df_google['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [162]:
# Cabin has many distinct values (148 by the author's count)
df_google['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [163]:
# number of passengers recorded against each Cabin value
df_google['Cabin'].value_counts()

G6                 4
B96 B98            4
C23 C25 C27        4
F33                3
F2                 3
D                  3
E101               3
C22 C26            3
E24                2
B28                2
E33                2
D36                2
D20                2
C124               2
C125               2
E44                2
B58 B60            2
F G73              2
C83                2
D17                2
D35                2
E8                 2
C2                 2
C126               2
B22                2
B49                2
B57 B59 B63 B66    2
D33                2
B35                2
C68                2
C123               2
B18                2
C92                2
E67                2
B20                2
C78                2
B51 B53 B55        2
E25                2
F4                 2
C52                2
B5                 2
D26                2
C65                2
E121               2
C93                2
B77                2
B37                1
E63          