In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import numpy as np
from pydataset import data

# Data Acquisition
## 1. In a jupyter notebook, classification_exercises.ipynb, use a python module (pydataset or seaborn datasets) containing datasets as a source for the iris data. Create a pandas dataframe, df_iris, from this data.

In [None]:
# Show the dataset's bundled documentation first. `show_doc` is a flag, so pass
# the boolean True rather than the string 'True', and do not bind the result:
# the DataFrame itself comes from the plain call below.
data('iris', show_doc=True)
df_iris = data('iris')

### a. Print the first 3 rows

In [None]:
# Preview only the first three rows instead of rendering the whole frame.
df_iris.head(3)

### b. Print the number of rows and columns (shape)

In [None]:
# (number of rows, number of columns)
df_iris.shape

### c. Print the column names

In [None]:
# Column labels of the iris frame.
df_iris.columns

### d. Print the data type of each column

In [None]:
# Storage dtype of each column.
df_iris.dtypes

### e. Print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [None]:
# Summary statistics; the result is used below to judge whether rescaling is needed.
df_iris.describe()

I believe rescaling the units to millimeters would better suit this data, since the standard deviations of some of the columns are less than one centimeter.

## 2. Read the Table1_CustDetails table from the Excel_Exercises.xlsx file into a dataframe named df_excel.

### a. Assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
# NOTE(review): the exercise text names Excel_Exercises.xlsx / Table1_CustDetails,
# but this reads Spreadsheets_Exercises.xlsx with read_excel's default sheet.
# Confirm the file name and pass sheet_name explicitly if the table is not the first sheet.
df_excel = pd.read_excel("Spreadsheets_Exercises.xlsx")

# Positional slice: exactly the first 100 rows. A label slice such as
# `.loc[0:100]` includes both endpoints and would return 101 rows.
# .copy() keeps the sample independent of the original frame.
df_excel_sample = df_excel.iloc[:100].copy()
df_excel_sample

### b. Print the number of rows of your original dataframe

In [None]:
# Row count of the full (unsampled) frame.
len(df_excel)

### c. Print the first 5 column names

In [None]:
# Labels of the five leftmost columns.
df_excel.columns[:5]

### d. Print the column names that have a data type of object

In [None]:
# Inspect every column's dtype before filtering for object columns in the next cell.
df_excel.dtypes

In [None]:
# Keep only the columns whose dtype is object, then read off their labels.
df_excel.loc[:, df_excel.dtypes == 'object'].columns

### e. Compute the range for each of the numeric variables.

In [None]:
# Select columns by numeric dtype. Filtering on "not object" would also admit
# datetime, boolean, and categorical columns, which are not numeric and would
# break the min/max threshold comparison in the next cell.
numerical_columns = df_excel.select_dtypes(include='number').columns.tolist()
numerical_columns

In [None]:
# Report each numeric column's extent. Every line is labelled with the column
# name so the output can be read without cross-referencing numerical_columns.
for column in numerical_columns:
    low = df_excel[column].min()
    high = df_excel[column].max()
    if high <= 2:
        # A maximum of 2 or less is this notebook's cut-off for an encoded
        # flag/category, for which a numeric range is not meaningful.
        print(f'{column}: ({low},{high}) column is categorical.')
    else:
        # Range = max - min, as asked for by the exercise.
        print(f'{column}: ({low},{high}) column is numeric, range = {high - low}')

## 3. Read the data from this google sheet into a dataframe, df_google

### a. Print the first 3 rows

In [None]:
# Browser ("edit") URL of the shared Google Sheet; the gid query value presumably selects the worksheet tab.
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'    

# Rewrite the edit path into the CSV export form so read_csv can fetch the sheet directly.
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

# Network read: requires access to the sheet at run time.
df_google = pd.read_csv(csv_export_url)

In [None]:
# Preview only the first three rows.
df_google.head(3)

### b. Print the number of rows and columns

In [None]:
# (number of rows, number of columns)
df_google.shape

### c. Print the column names

In [None]:
# Column labels of the sheet.
df_google.columns

### d. Print the data type of each column

In [None]:
# info() prints each column's dtype together with its non-null count.
df_google.info()

### e. Print the summary statistics for each of the numeric variables

In [None]:
# Select columns by numeric dtype. Filtering on "not object" would also admit
# datetime, boolean, and categorical columns, which are not numeric.
numerical_columns = df_google.select_dtypes(include='number').columns.tolist()
numerical_columns

In [None]:
# Re-type numeric columns that really hold categories, then summarize the rest.
# A column with at most 7 distinct values is treated as categorical here (the
# notebook's own cut-off) and is converted to object so it drops out of the
# numeric summary below.
for column in numerical_columns:
    n_unique = df_google[column].nunique()
    if n_unique <= 7:
        df_google[column] = df_google[column].astype('object')
    else:
        print(f'{column} has {n_unique} unique values: column is numeric')

# Summary statistics for the columns that are still numeric after the re-typing above.
df_google.describe()

### f. Print the unique values for each of your categorical variables

In [None]:
# Names of every object-dtype column, in frame order.
categorical_columns = [name for name, dtype in df_google.dtypes.items() if dtype == 'object']
categorical_columns

In [None]:
# Show the distinct values of each categorical column, not just how many there are.
for column in categorical_columns:
    n_unique = df_google[column].nunique()
    print(f'Categorical column {column} has {n_unique} unique values')
    # value_counts lists every distinct value with its frequency; dropna=False
    # also surfaces missing values, which nunique() above does not count.
    print(df_google[column].value_counts(dropna=False), end='\n\n')

# Prepare Data

The end product of this exercise should be the specified functions in a python script named `prepare.py`.
Do these in your `classification_exercises.ipynb` first, then transfer to the prepare.py file. 

This work should all be saved in your local `classification-exercises` repo. Then add, commit, and push your changes.

Using the Iris Data:  

1. Use the function defined in `acquire.py` to load the iris data.  

1. Drop the `species_id` and `measurement_id` columns.  

1. Rename the `species_name` column to just `species`.  

1. Create dummy variables of the species name. 

1. Create a function named `prep_iris` that accepts the untransformed iris data, and returns the data with the transformations above applied.  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import warnings
# Installs a blanket "ignore" filter: warnings of every category are hidden from here on.
# NOTE(review): this also hides deprecation warnings from the libraries in use; consider a narrower filter.
warnings.filterwarnings("ignore")

import acquire

In [None]:
# Load the iris data through the project's acquire module.
# NOTE(review): the returned schema is not visible here; the cells below assume
# it has 'species_id' and 'species_name' columns.
df = acquire.get_iris_data()
df

In [None]:
# measurement_id is not dropped here: it was already removed before the data was imported.
# Re-bind the result instead of using inplace=True so the drop reads as an
# explicit transformation step rather than a hidden mutation.
df = df.drop(columns='species_id')

In [None]:
# Re-bind the renamed frame instead of mutating with inplace=True.
df = df.rename(columns={"species_name": "species"})

In [None]:
# One indicator column per distinct species value, placed ahead of the original columns.
df_dummy = pd.get_dummies(df.loc[:, 'species'])
df = pd.concat(objs=[df_dummy, df], axis='columns')
df.head(5)

In [None]:
def prep_iris(df):
    """
    Apply the iris preparation steps and return the result as a new DataFrame.

    The caller's frame is left unmodified. Steps:
      1. Drop the 'species_id' and 'measurement_id' columns, when present.
      2. Rename 'species_name' to 'species'.
      3. Append one indicator (dummy) column per distinct species value.

    Parameters
    ----------
    df : pandas.DataFrame
        Iris data with a 'species_name' column (or an already-renamed
        'species' column).

    Returns
    -------
    pandas.DataFrame
        New frame with the id columns removed, the species column renamed,
        and the species indicator columns appended on the right.

    Raises
    ------
    KeyError
        If the frame has neither a 'species_name' nor a 'species' column.
    """
    # errors="ignore": either id column may already have been removed upstream,
    # and a missing one should not abort the preparation.
    prepped = df.drop(columns=["species_id", "measurement_id"], errors="ignore")
    prepped = prepped.rename(columns={"species_name": "species"})

    species_dummies = pd.get_dummies(prepped["species"])
    # Skip indicator columns the frame already carries, so that preparing an
    # already-prepared frame does not produce duplicate column labels.
    already_present = species_dummies.columns.isin(prepped.columns)
    species_dummies = species_dummies.loc[:, ~already_present]

    return pd.concat([prepped, species_dummies], axis=1)

In [None]:
# prep_iris expects the untransformed data, so pass a fresh load rather than
# `df`, which the cells above have already dropped, renamed, and encoded.
prep_iris(acquire.get_iris_data())