In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (4, 2)

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings("ignore")

import acquire
from acquire import get_titanic_data, get_iris_data

import os
import env

import explore

### In a Jupyter notebook, classification_exercises.ipynb, use a Python module (pydataset or seaborn datasets) containing datasets as a source for the iris data. Create a pandas DataFrame, df_iris, from this data.

In [None]:
df_iris = sns.load_dataset('iris')

In [None]:
type(df_iris)

#### print the first 3 rows

In [None]:
df_iris.head(3)

#### print the number of rows and columns

In [None]:
df_iris.shape

#### print the column names

In [None]:
df_iris.columns

#### print the data type of each column

In [None]:
df_iris.info()

#### print the summary statistics for each of the numeric variables

In [None]:
# .T to transpose
df_iris.describe().T

### Read the Table1_CustDetails table from the Excel_Exercises.xlsx file into a dataframe named df_excel.

In [None]:
# Load the customer spreadsheet and preview the first rows transposed
# (one line per column) so wide tables stay readable.
# NOTE(review): no sheet_name is passed, so pandas reads the first sheet of the
# workbook. The heading above asks for the Table1_CustDetails table and a frame
# named df_excel — confirm the first sheet is the intended one.
# redo this with Spreadsheets_Exercises.xlsx
telco = pd.read_excel("Spreadsheets_Exercises.xlsx")
telco.head().T

In [None]:
# Turn the "Yes"/"No" text flags (and the 0/1 senior-citizen flag) into booleans.
# Anything that is not exactly "Yes" (or exactly 1) becomes False.
# Gotcha: this overwrites the columns in place, so running the cell a second
# time compares booleans against "Yes" and turns every value False.
telco["partner"] = telco["partner"].eq("Yes")
telco["dependents"] = telco["dependents"].eq("Yes")
telco["churn"] = telco["churn"].eq("Yes")
telco["is_senior_citizen"] = telco["is_senior_citizen"].eq(1)

In [None]:
# Lookup tables translating the integer codes 0, 1, 2 used in the
# contract_type, phone_service and internet_service columns into labels.
# The position of each label in its list is the code it stands for.
contract_type = dict(enumerate(["Month-to-Month", "1 Year", "2 Year"]))

phone_service = dict(enumerate(["No Phone Service", "One line", "Two or more lines"]))

internet_service = dict(enumerate(["No Internet Service", "DSL", "Fiber Optic"]))

# Replace the integer codes with their labels using the lookup dictionaries
# defined above. Values that are not keys of a dictionary become NaN, so
# re-running this cell on already-mapped columns blanks them out.
telco["contract_type"] = telco["contract_type"].map(contract_type)
telco["phone_service"] = telco["phone_service"].map(phone_service)
telco["internet_service"] = telco["internet_service"].map(internet_service)

In [None]:
telco.head().T

In [None]:
df_excel = pd.read_csv('Cust_Churn_Telco.csv')

#### assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
# Take the first 100 rows by position.
# `.loc[0:100]` slices by label and includes the end label, so on the default
# integer index it returned 101 rows; `.iloc[:100]` is positional and end-exclusive.
# `.copy()` keeps the sample independent of df_excel.
df_excel_sample = df_excel.iloc[:100].copy()
df_excel_sample

#### print the number of rows of your original dataframe

In [None]:
len(df_excel)

#### print the first 5 column names


In [None]:
df_excel.columns[:5]

#### print the column names that have a data type of object

In [None]:
df_excel.select_dtypes(include=['object']).columns

#### compute the range for each of the numeric variables.

In [None]:
# Range (max - min) of every numeric column.
# include='number' selects all numeric dtypes, not only float64/int64, so
# columns stored as e.g. int32 or float32 are no longer silently left out.
numerics = df_excel.select_dtypes(include='number')
numerics.max() - numerics.min()

In [None]:
# Print max - min for three columns, one labelled line each.
# NOTE(review): the subtraction assumes each column is numeric — confirm
# TotalCharges was not read in as text.
range_labels = {
    'Tenure range = ': 'tenure',
    'Monthly Charges range = ': 'MonthlyCharges',
    'Total Charges range = ': 'TotalCharges',
}
for label, col in range_labels.items():
    print(label, (df_excel[col].max() - df_excel[col].min()))

### Read the data from this Google Sheet into a DataFrame, df_google

In [None]:
# Share link of the sheet as copied from the browser.
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'
# Swap the '/edit#gid=' segment for the CSV-export form so the URL can be
# handed straight to pd.read_csv.
csv_export_url = '/export?format=csv&gid='.join(sheet_url.split('/edit#gid='))
df_google = pd.read_csv(csv_export_url)

#### print the first 3 rows

In [None]:
df_google.head(3)

#### print the number of rows and columns

In [None]:
df_google.shape

#### print the column names

In [None]:
df_google.columns.tolist()

#### print the data type of each column

In [None]:
df_google.dtypes

#### print the summary statistics for each of the numeric variables

In [None]:
df_google.describe().T

#### print the unique values for each of your categorical variables


In [None]:
# For every object-dtype column, print a header line, the count of each
# distinct value, a separator, and a blank line.
object_columns = df_google.select_dtypes(include='object').columns
for column in object_columns:
    print(
        f"Value in the {column} column:",
        df_google[column].value_counts(),
        "-------",
        "",
        sep="\n",
    )

In [None]:
# Print value counts for a hand-picked list of columns, in this order.
discrete_columns = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Cabin", "Embarked"]
for discrete_col in discrete_columns:
    print(df_google[discrete_col].value_counts())

### Data Prep Exercises

#### Use the function defined in acquire.py to load the iris data.

In [2]:
df = acquire.get_iris_data()
df.head()

Unnamed: 0,species_id,species_name,measurement_id,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,1,5.1,3.5,1.4,0.2
1,1,setosa,2,4.9,3.0,1.4,0.2
2,1,setosa,3,4.7,3.2,1.3,0.2
3,1,setosa,4,4.6,3.1,1.5,0.2
4,1,setosa,5,5.0,3.6,1.4,0.2


#### Drop the species_id and measurement_id columns.

In [None]:
df.columns

In [None]:
df = df.drop(columns = ['species_id','measurement_id'])
df.head()

#### Rename the species_name column to just species.

In [3]:
df = df.rename(columns = {"species_name": "species"})
df.head()

Unnamed: 0,species_id,species,measurement_id,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,1,5.1,3.5,1.4,0.2
1,1,setosa,2,4.9,3.0,1.4,0.2
2,1,setosa,3,4.7,3.2,1.3,0.2
3,1,setosa,4,4.6,3.1,1.5,0.2
4,1,setosa,5,5.0,3.6,1.4,0.2


#### Create dummy variables of the species name.

In [None]:
df.species.value_counts()

In [None]:
df_dummy = pd.get_dummies(df[['species']], drop_first = False)
df_dummy.head()

In [None]:
# Append the species dummy columns to the right of the existing columns.
# Gotcha: df is overwritten with itself plus df_dummy, so re-running this cell
# appends the dummy columns a second time.
df = pd.concat([df,df_dummy], axis=1)
df.head()

#### Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [None]:
# All together: the drop / rename / dummy-encode steps from the cells above,
# wrapped in the function the exercise asks for.
def prep_iris(df):
    """Return a prepared copy of the untransformed iris data.

    Steps, mirroring the cells above:
      1. drop the 'species_id' and 'measurement_id' columns,
      2. rename 'species_name' to 'species',
      3. append one dummy column per species (drop_first=False, as in the
         get_dummies cell above).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'species_id', 'measurement_id' and 'species_name'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If 'species_id' or 'measurement_id' is missing from ``df``.
    """
    # drop/rename return new frames, so the caller's df is left untouched
    # and calling prep_iris again on the same raw frame gives the same result.
    prepped = (
        df.drop(columns=['species_id', 'measurement_id'])
          .rename(columns={'species_name': 'species'})
    )
    species_dummies = pd.get_dummies(prepped[['species']], drop_first=False)
    return pd.concat([prepped, species_dummies], axis=1)

### Exploratory Analysis Exercise

In [4]:
df.head()

Unnamed: 0,species_id,species,measurement_id,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,1,5.1,3.5,1.4,0.2
1,1,setosa,2,4.9,3.0,1.4,0.2
2,1,setosa,3,4.7,3.2,1.3,0.2
3,1,setosa,4,4.6,3.1,1.5,0.2
4,1,setosa,5,5.0,3.6,1.4,0.2


#### Acquire, prepare & split your data.



In [5]:
# Split df into train / validate / test with the project's explore helper,
# passing 'species' and a fixed seed, then print each subset's (rows, columns).
train, validate, test = explore.train_validate_test_split(df, 'species', seed=123)

for subset in (train, validate, test):
    print(subset.shape)

(84, 7)
(36, 7)
(30, 7)


In [6]:
df.head(2)

Unnamed: 0,species_id,species,measurement_id,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,1,5.1,3.5,1.4,0.2
1,1,setosa,2,4.9,3.0,1.4,0.2


#### Univariate Stats

In [None]:
# Column groups handed to the explore.* helpers below.
# NOTE(review): 'species_id' is removed by the drop cell earlier in this
# notebook, and the 'species_*' dummy columns only exist if the
# get_dummies/concat cells ran. The saved outputs (7 columns, species_id still
# present) suggest those cells were skipped — confirm this list matches the
# columns of `train` after Restart & Run All.
cat_vars = ['species_id','species','species_setosa','species_versicolor','species_virginica']
quant_vars = ['sepal_length','sepal_width','petal_length','petal_width']

In [None]:
explore.explore_univariate(train, cat_vars, quant_vars)

#### Bivariate Stats



In [None]:
# Bivariate exploration of the training split, with 'species_id' passed as the second argument.
# NOTE(review): 'species_id' is dropped by an earlier cell in this notebook —
# confirm the column still exists in `train` on a fresh top-to-bottom run.
explore.explore_bivariate(train, 'species_id', cat_vars, quant_vars)

#### Multivariate Stats

In [None]:
# Multivariate exploration of the training split, with 'species_id' passed as the second argument.
# NOTE(review): 'species_id' is dropped by an earlier cell in this notebook —
# confirm the column still exists in `train` on a fresh top-to-bottom run.
explore.explore_multivariate(train, 'species_id', cat_vars, quant_vars)