In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import numpy as np
from pydataset import data

# 1. In a jupyter notebook, classification_exercises.ipynb, use a python module (pydata or seaborn datasets) containing datasets as a source for the iris data. Create a pandas dataframe, df_iris, from this data.

In [2]:
# Print the dataset's documentation. When show_doc is set, pydataset prints
# the docs rather than handing back the data, so the call is not assigned.
data('iris', show_doc=True)

# Load the iris measurements into a dataframe.
df_iris = data('iris')

iris

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Edgar Anderson's Iris Data

### Description

This famous (Fisher's or Anderson's) iris data set gives the measurements in
centimeters of the variables sepal length and width and petal length and
width, respectively, for 50 flowers from each of 3 species of iris. The
species are _Iris setosa_, _versicolor_, and _virginica_.

### Usage

    iris
    iris3

### Format

`iris` is a data frame with 150 cases (rows) and 5 variables (columns) named
`Sepal.Length`, `Sepal.Width`, `Petal.Length`, `Petal.Width`, and `Species`.

`iris3` gives the same data arranged as a 3-dimensional array of size 50 by 4
by 3, as represented by S-PLUS. The first dimension gives the case number
within the species subsample, the second the measurements with names `Sepal
L.`, `Sepal W.`, `Petal L.`, and `Petal W.`, and the third the species.

### Source

Fisher, R. A. (1936) The use of multiple measurements in taxonomi

### a. Print the first 3 rows

In [3]:
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


### b. Print the number of rows and columns (shape)

In [4]:
df_iris.shape

(150, 5)

### c. Print the column names

In [5]:
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

### d. Print the data type of each column

In [6]:
df_iris.dtypes

Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object

### e. Print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [7]:
df_iris.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


I believe rescaling the units to millimeters would better suit this data, since the standard deviations for some of the columns are less than a centimeter.

# 2. Read the Table1_CustDetails table from the Excel_Exercises.xlsx file into a dataframe named df_excel.

### a. Assign the first 100 rows to a new dataframe, df_excel_sample

In [8]:
# The customer table is hosted as a Google Sheet; swapping the "edit" suffix
# for the CSV export endpoint gives a URL that pandas can read directly.
sheet_url = 'https://docs.google.com/spreadsheets/d/1kI_YtjH6Pg-4tsP66Liqiu7WJ76s6ejqpo-6cdQv8hI/edit#gid=1023018493'

csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

df_excel = pd.read_csv(csv_export_url)

# Take an independent copy of the first 100 rows so that later changes to the
# sample never touch (or raise chained-assignment warnings about) df_excel.
df_excel_sample = df_excel.head(100).copy()

### b. Print the number of rows of your original dataframe

In [9]:
df_excel.shape[0]

7049

### c. Print the first 5 column names

In [10]:
df_excel.columns[0:5]

Index(['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents'], dtype='object')

### d. Print the column names that have a data type of object

In [11]:
df_excel.dtypes

customer_id           object
gender                object
is_senior_citizen      int64
partner               object
dependents            object
phone_service          int64
internet_service       int64
contract_type          int64
payment_type          object
monthly_charges      float64
total_charges        float64
churn                 object
dtype: object

In [12]:
df_excel.columns[(df_excel.dtypes == 'object')]

Index(['customer_id', 'gender', 'partner', 'dependents', 'payment_type',
       'churn'],
      dtype='object')

### e. Compute the range for each of the numeric variables.

In [13]:
# Pick out the numeric columns by dtype. Asking for 'number' directly is safer
# than excluding 'object', which would also let bool, datetime or category
# columns through as if they were numeric.
numerical_columns = df_excel.select_dtypes(include='number').columns.tolist()
numerical_columns

['is_senior_citizen',
 'phone_service',
 'internet_service',
 'contract_type',
 'monthly_charges',
 'total_charges']

In [14]:
# Show the (min, max) range of every numeric column. A column whose maximum is
# 2 or less is labelled categorical; anything larger is labelled numeric.
for column in numerical_columns:
    values = df_excel[column]
    low, high = values.min(), values.max()
    if high <= 2:
        print(f'({low},{high}): column is categorical.')
    else:
        print(f'({low},{high}): column is numeric')

(0,1): column is categorical.
(0,2): column is categorical.
(0,2): column is categorical.
(0,2): column is categorical.
(18.25,118.75): column is numeric
(18.8,8684.8): column is numeric


# 3. Read the data from this google sheet into a dataframe, df_google

### a. Print the first 3 rows

In [15]:
# Address of the Google Sheet as it appears in the browser ("edit" view).
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

# Rewrite the edit URL into the sheet's CSV export URL so pandas can read it.
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

# Download the sheet and parse it as CSV.
df_google = pd.read_csv(csv_export_url)

In [16]:
df_google.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### b. Print the number of rows and columns

In [17]:
df_google.shape

(891, 12)

### c. Print the column names

In [18]:
df_google.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

### d. Print the data type of each column

In [19]:
df_google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### e. Print the summary statistics for each of the numeric variables

In [20]:
# Pick out the numeric columns by dtype. Asking for 'number' directly is safer
# than excluding 'object', which would also let bool, datetime or category
# columns through as if they were numeric.
numerical_columns = df_google.select_dtypes(include='number').columns.tolist()
numerical_columns

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [21]:
# Numeric columns with 7 or fewer distinct values are treated as category
# codes and recast to 'object'; the remaining columns are reported as numeric.
# This modifies df_google in place.
# NOTE(review): this cell only classifies the columns; it does not itself
# print summary statistics (e.g. via describe()).
for column in numerical_columns:
    distinct_count = df_google[column].nunique()
    if distinct_count > 7:
        print(f'{column} has {distinct_count} unique values: column is numeric')
        continue
    # Low-cardinality column: store it as an object column from here on
    df_google[column] = df_google[column].astype('object')

PassengerId has 891 unique values: column is numeric
Age has 88 unique values: column is numeric
Fare has 248 unique values: column is numeric


### f. Print the unique values for each of your categorical variables

In [22]:
# Every column currently stored with the 'object' dtype is treated as categorical
categorical_columns = df_google.columns[df_google.dtypes == 'object'].tolist()
categorical_columns

['Survived',
 'Pclass',
 'Name',
 'Sex',
 'SibSp',
 'Parch',
 'Ticket',
 'Cabin',
 'Embarked']

In [23]:
# Report how many distinct values each categorical column holds.
# NOTE(review): this prints the count of unique values, not the values themselves.
for column, count in df_google[categorical_columns].nunique().items():
    print(f'Categorical column {column} has {count} unique values')

Categorical column Survived has 2 unique values
Categorical column Pclass has 3 unique values
Categorical column Name has 891 unique values
Categorical column Sex has 2 unique values
Categorical column SibSp has 7 unique values
Categorical column Parch has 7 unique values
Categorical column Ticket has 681 unique values
Categorical column Cabin has 147 unique values
Categorical column Embarked has 3 unique values
