In [28]:
# ******ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy import stats
import os

# ******files/data
from pydataset import data
import env
import acquire
import prepare

# ******visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ******sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# <span style="color: #c48f7f"> I. Data Acquisition </span>

## Step 1: Import Data
1. SQL
    - use imported <b><span style="color: #c48f7f">get_connection( )</span></b> function to connect to Database
    - use imported <b><span style="color: #c48f7f">get_database_data( )</span></b> function to read database into a DataFrame
    
2. csv file
    - |df = pd.read_csv( 'filename.csv' )
    
    
3. google sheet
    - replace <b>/edit</b> with <b>/export</b> and add <b>format=csv</b> to the beginning of the query string
        - <b><span style="color: #c48f7f">csv_export_url = sheet_url.replace( '/edit#gid=', '/export?format=csv&gid=' )</span></b>
            - https://docs.google.com/spreadsheets/d/BLAHBLAHBLAH<b><span style="color: #c48f7f">/edit#</span></b>gid=NUMBER
            - https://docs.google.com/spreadsheets/d/BLAHBLAHBLAH<b><span style="color: #c48f7f">/export</span></b><i>?format=csv&</i>gid=NUMBER
    - |df = pd.read_csv( <b><span style="color: #c48f7f">csv_export_url</span></b> )


4. pydataset import
    - |df = data( 'db_name' ) 

### <span style="background-color: #c48f7f"><span style="color: #ffffff">|  SQL Database Connect Function  |</span></span>

In [3]:
# function to connect to Codeup Database

def get_connection(db, user = env.user, host = env.host, password = env.password):
    '''
    This function builds the connection url used to reach a Codeup database.
    It takes in the string name of a database as an argument; the user, host, and
    password default to the values stored in my env file.
    It returns the url as a string in the form:
        mysql+pymysql://user:password@host/db
    '''
#     credentials go before the '@', the server location and database after it
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'

    return f'mysql+pymysql://{credentials}@{location}'

### (feeder) <span style="background-color: #c48f7f"><span style="color: #ffffff">|  Read Database into DataFrame Function  |</span></span>

In [5]:
def new_database_name_data():
    '''
    This function queries the [database_name] database on the Codeup server and
    returns the result of the query as a Pandas DataFrame.
    (Template: swap in the real table name and database name before use.)
    '''
#     connection url for the requested database (built by get_connection)
    connection_url = get_connection('database_name')

#     run the sequel query and hand back the resulting DataFrame
    return pd.read_sql('Select * from table_name', connection_url)

### <span style="background-color: #c48f7f"><span style="color: #ffffff">|  Read Database and Cache Function  |</span></span>

In [6]:
def get_database_name_data():
    '''
    This function returns the [database_name] data as a Pandas DataFrame, using a local
    .csv file as a cache:
        - if 'df_name.csv' already exists, the data is read from that file
        - otherwise the data is read fresh from the Codeup Database and written
          to 'df_name.csv' so the next call can skip the database
    '''
    cache_file = 'df_name.csv'

#     cached copy found ---> read it in (first column holds the saved index) and stop here
    if os.path.isfile(cache_file):
        return pd.read_csv(cache_file, index_col = 0)

#     no cached copy ---> read fresh data from the Database (referencing function from above cell)
    fresh_df = new_database_name_data()

#     ... and cache the DataFrame to the .csv file
    fresh_df.to_csv(cache_file)

    return fresh_df

# <span style="color: #c48f7f">II. Data Preparation </span>

## Step 2: Summarize the Data

Acquire and General Info

In [12]:
# acquire the data using acquire.py file function
#     df_name = acquire.get_database_name_data()

# sample of the DataFrame
#     df_name.head()

# numbers of rows and columns
#     df_name.shape

# information about the DataFrame:
#     -- column names
#     -- datatypes
#     -- missing values
#     df_name.info()

# summary statistics for numeric columns
#     df_name.describe()


For loop to visualize numeric columns

In [13]:
# for loop to visualize the distributions for the numeric columns
#     df_name_num_cols = df_name.columns[[df_name[col].dtype == 'int64' for col in df_name.columns]]

#     for col in df_name_num_cols:
#         plt.hist(df_name[col])
#         plt.title(col)
#         plt.show()

For loop to get breakdowns for object columns

In [15]:
# for loop to get the breakdowns of the object columns
#     df_name_obj_cols = df_name.columns[[df_name[col].dtype == 'O' for col in df_name.columns]]

#     for col in df_name_obj_cols:
#         print(df_name[col].value_counts())
#         print(df_name[col].value_counts(normalize = True, dropna = False))
#         print('-----------------------')

## Step 3: Clean the Data

Set index

In [4]:
# set index

# df = df.set_index('index_column')

To bin columns with continuous numeric variables

In [16]:
# to bin continuous numeric values
#     df_name.column_name.value_counts(bins = x, sort = False)

Drop duplicates

In [18]:
# Drop duplicates
#     df_name = df_name.drop_duplicates()

# Verify shape of data
#     df_name.shape

To find missing values

In [5]:
# to find missing values
#     missing = df_name.isnull().sum()
#     missing[missing > 0]

# or
#     df.isna().sum()

Drop columns with too many missing values

In [21]:
# Drop columns with too many missing values
#     df_name_cols_to_drop = ['col_1', 'col_2' ...]
#     df_name = df_name.drop(columns = df_name_cols_to_drop)

# Verify shape of data
#     df_name.shape

# Preview DataFrame and verify columns were dropped
#     df_name.head()

Fill missing values

In [6]:
# Fill in missing values with most common value
#     df_name['column_name'] = df_name.column_name.fillna(value = 'fill_value')
#         or
#     df_name['column_name'] = df_name.column_name.fillna(value = df_name.column_name.mode()[0])

# Validate that missing values have been filled (this line of code should return 0)
#     df_name.column_name.isna().sum()

Create dummies to encode variables that are strings

In [25]:
# Create dummy DataFrame
#     df_name_dummies = pd.get_dummies(df_name[['col_1', 'col_2' ...]], dummy_na = False,
#                                                                         drop_first = True)
#     df_name_dummies.head()

# Concatenate the dummy DataFrame with original DataFrame
#     df_name = pd.concat([df_name, df_name_dummies], axis = 1)
#     df_name.head()

### <span style="background-color: #c48f7f"><span style="color: #ffffff">|  Clean Data Function  |</span></span>

In [27]:
# def clean_data(df):
#     '''
#     This function cleans the data and does the following:
#         - drops duplicate observations
#         - drops columns with too many missing values ['col_1', 'col_2', ...]
#         - fill missing values with most common, 'common_value'
#         - creates dummy variables from col_1, col_2, ...
#     '''
#     df = df.drop_duplicates()
#     df = df.drop(columns = ['col_drop_1', 'col_drop_2' ...])
    
#     df['fill_col'] = df.fill_col.fillna(value = 'fill_value')
    
#     dummy_df = pd.get_dummies(df[['dum_col_1', 'dum_col_2' ...]], drop_first = True)
#     df = pd.concat([df, dummy_df], axis = 1)
    
#     return df

## Split Data
#### | Train | *** | Validate | *** | Test |

In [29]:
# 20% test, 80% train_validate
#     of the 80% train_validate: 30% validate, 70% train
#     overall: 24% validate, 56% train

# train, test = train_test_split(df, test_size = .2, 
#                                random_state = 123,
#                               stratify = df.target)

# train, validate = train_test_split(train, test_size = .3,
#                                    random_state = 123,
#                                    stratify = train.target)

Validate the split

In [30]:
# print(f'train ------> {train.shape}')
# print(f'validate ------> {validate.shape}')
# print(f'test ------> {test.shape}')

### <span style="background-color: #c48f7f"><span style="color: #ffffff">|  Split Data Function  |</span></span>

In [46]:
def split_data(df, target, seed = 123, test_size = .2, validate_size = .3):
    '''
    This function takes in a DataFrame, the name of the target column (for stratification purposes),
    and an integer for setting a seed, and splits the data into train, validate, and test DataFrames,
    stratifying each split on the target variable.

    Parameters:
        df            - DataFrame to split
        target        - name of the column to stratify on
        seed          - random_state passed to both splits (default 123)
        test_size     - share of df held out as test (default .2)
        validate_size - share of the remaining train_validate data held out as validate (default .3)

    With the defaults the result is roughly 56% train, 24% validate, 20% test.

    Returns: train, validate, test
    '''
#     first split: carve test off of the full DataFrame
    train_validate, test = train_test_split(df, test_size = test_size,
                                           random_state = seed,
                                           stratify = df[target])

#     second split: carve validate off of what is left (validate_size is a share of train_validate, not of df)
    train, validate = train_test_split(train_validate, test_size = validate_size,
                                      random_state = seed,
                                      stratify = train_validate[target])

    return train, validate, test

Test out the function

In [47]:
# train, validate, test = split_data(df, target = 'target_variable')

Validate my split

In [35]:
# print(f'train -------> {train.shape}')
# print(f'validate ----> {validate.shape}')
# print(f'test --------> {test.shape}')

## Imputing Missing Values
1. Create the imputer object, selecting the strategy used to impute
    - Mean
    - Median
    - Mode (strategy = 'most_frequent')<br><br>
2. Fit to train 
    - Compute the mean, median, or most_frequent (mode) for each of the columns that will be imputed.
    - Store that value in the imputer object<br><br>
3. Transform train: fill missing values in the train dataset with that value identified.<br><br>
4. Transform validate and test: fill missing values with that value identified.


In [39]:
# Only look at the train dataset after data split
#     train.info()

# 1. Create the SimpleImputer object (imputer instructions)---> will be stored in a variable called imputer
#     (requires: from sklearn.impute import SimpleImputer)
#     imputer = SimpleImputer(missing_values = None, strategy = 'most_frequent')

# 2. Fit the imputer columns in the training df so the imputer determines the value depending on the strategy 
# called
#     imputer = imputer.fit(train[['col_name']])


# 3. Next we will call transform on our train data set
#     train[['col_name']] = imputer.transform(train[['col_name']])

# 4. And finally calling transform on our validate and test data sets
#     validate[['col_name']] = imputer.transform(validate[['col_name']])
#     test[['col_name']] = imputer.transform(test[['col_name']])

In [40]:
# Validate there are no longer any null values in imputer column(s)
#     train.col_name.value_counts(dropna = False)

### <span style="background-color: #c48f7f"><span style="color: #ffffff">|  Imputer Function  |</span></span>

In [43]:
# def impute_mode(train, validate, test):
#     '''
#     This function takes in the train, test, and validate DataFrames and imputes the mode for the selected
#     column to impute, returning imputed train, test, and validated DataFrames
#     '''
#     imputer = SimpleImputer(missing_values = None, strategy = 'most_frequent')
#     train[['col_name']] = imputer.fit_transform(train[['col_name']])
#     validate[['col_name']] = imputer.transform(validate[['col_name']])
#     test[['col_name']] = imputer.transform(test[['col_name']])
    
#     return train, validate, test

# *********************************

# Validate the function worked properly
#     train, validate, test = impute_mode(train, validate, test)
#     train.info()

# <span style="color: #c48f7f">III. Tidy Data </span>
- Data should be tabular (made up of rows and columns)
- There is only one value per cell
- Each variable should have its own column
- Each observation should have its own row


<b>Melt</b> use when one variable is spread across multiple columns
- Wide ----> Long

<b>Pivot</b> use when one column contains multiple variables
- Long ----> Wide

In [2]:
# melt

# |df_name.melt(id_vars = ['index_column'], var_name = 'new_var_column_name', value_name = 'value_column_name')

In [3]:
# pivot

# |df_name.pivot(index = 'index_column', columns = 'column_to_pivot')

# <span style="color: #c48f7f">IV. Data Exploration </span>
<b><span style="color: #c48f7f">Exploratory Data Analysis</span> (EDA)</b> is where we develop nearly all of the insights and takeaways and learn the <i>story</i> of our data.
### Only explore train data set!
##### <span style="color: #c48f7f">process of performing initial investigations on data so as to:</span>
    - discover patterns,
    - spot anomalies,
    - test hypotheses, and
    - check assumptions
##### <span style="color: #c48f7f">with the help of:</span>
    - summary statistics and
    - graphical representations    
##### <span style="color: #c48f7f">can lead to:</span>
    - feature engineering,
    - feature elimination to reduce noise, and
    - domain based outlier handling


## Step 1: Document initial hypotheses 
Describe, in natural language, how the independent variables (predictors, features, attributes) interact with the target (y-value or dependent variable)

## Step 2: Use visualization techniques to identify drivers
### i. Univariate Stats
- Looking for outliers
- Testing for normalcy
- Looking at the scale of each variable<br><br>
###### Using:
- value_counts( ) [categorical variables] and
- histograms [numerical variables]
- frequencies

### ii. Bivariate Stats
Plot the interactions of each variable with the target and document takeaways.
- Numeric -----> Numeric
    - Scatterplot
    - Lineplot
    
- Numeric -----> Categorical
    - see https://seaborn.pydata.org/tutorial/categorical.html
    
<b><i>Use hypothesis testing where appropriate.</i></b>

### iii. Multivariate Stats
Ask additional questions of the data, such as how subgroups compare to each other and to the overall population using visualizations and/or hypothesis testing.
- sns.pairplot (with hue and/or col)
- see https://seaborn.pydata.org/tutorial/axis_grids.html

<b><i>Use hypothesis testing where appropriate.</i></b>

## Step 3: Hypothesis Testing
When a visualization isn't immediately clear or you need/want additional confirmation

## Which Hypothesis Test?
### Pearson's R
<b>corr, p = stats.pearsonr( train_df.column, train_df.column )</b><br><br>
https://ds.codeup.com/stats/more-statistical-testing-examples/#pearson-r

- Numeric ------> Numeric
- Linear relationships

### Spearman's R
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html
- Numeric -------> Numeric
- Non-linear relationships

### T-Test
<b>t, p = stats.ttest_1samp( train_df_sample, $\mu$ )<br>
t, p = stats.ttest_ind( train_df_sample1, train_df_sample2, equal_var = True/False )</b><br><br>
https://ds.codeup.com/stats/compare-means/
- Numeric --------> Categorical
- Comparing the means of two populations
- Comparing the mean of a subgroup with the mean of the total population
- When samples are normal(ish) distributed but have different variances <i>(determined using .var( ))</i>

### ANOVA
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
- Numeric --------> Categorical
- Comparing the means of more than two groups

### Mann-Whitney u-test
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
- Numeric --------> Categorical
- Data does not match the assumptions of a t-test

### $\chi^2$ Test
<b>observed = pd.crosstab( a, b )<br>
chi2, p, degf, expected = stats.chi2_contingency(observed)</b><br><br>
https://ds.codeup.com/stats/compare-group-membership/
- Categorical ----> Categorical 


## Univariate Exploration

In [50]:
# |.info() -----> determine which variables are quantitative (numeric) vs qualitative (non-numeric) by datatypes

In [54]:
# quantitative (numeric) variables
#     .describe() ----> get descriptive statistics for quantitative variables
#     start by plotting the target variable (document takeaways)
#     use list comprehension to create a for loop that saves each numeric column into a variable ---> num_cols
#         |train_df_num_cols = train_df.columns[[train_df[col].dtype == 'float' for col in train_df.columns]]
#     use a for loop to plot (hist&boxplot) each numeric column from num_cols list created in last step
#         |for col in train_df_num_cols:
#             |plt.hist(train_df[col])
#             |plt.title(col)
#             |plt.show()
#             |plt.boxplot(train_df[col])
#             |plt.title(col)
#             |plt.show()


## Bivariate Exploration
##### Analyzing each feature with respect to the target variable
<b>Document: $H_0$, $H_a$, and $\alpha$</b>
- ask and document questions and document takeaways
- numeric ------> numeric
    - scatterplot
- numeric ------> categorical
    - catplot
    - barplot
    - boxplot
- think about other feature combinations we could visualize
- could we bin any features?
- should we create any new features?

In [1]:
# qualitative (non-numeric) variables
#     |plt.hist(train_df[col])
#     |train_df[col].value_counts()
#     compute the descriptive statistics within each categorical value (from value_counts) (for each type of ...
#     ...flower) ---> can create df's for each and then concatenate
#         use a barplot to plot the different categorical values against one another
#             |plt.title('Title')
#             |sns.barplot(x = 'categorical_column', y = 'numerical_column', data = train_df)
#         plot a mean line on the barplot
#             |target_rate = train_df.target.mean()
#             |plt.axhline(target_rate, label = 'Average Target')
#         use a hypothesis test to compare the means across categorical values, if needed



## Multivariate Exploration
##### Adding an additional dimension to our data, such as the target variable as color or, separating variable values by columns
- Here we are asking more specific and targeted questions
    - How subgroups compare to one another
    - How subgroups compare to the target population

In [4]:
# plotting
#     add hue and/or cols to existing plots
#     pairplot
#         |sns.pairplot(train_df, hue = 'target', corner = True)
#         |plt.show()
#     heatmap
#         |sns.heatmap(train_df.corr(), cmap = 'color_combo', center = x, annot = True)
#         |plt.show()

# <span style="color: #c48f7f">V. Modeling </span>
Create X and y versions of train
- y: series with just the target variable
- X: DataFrame with all the feature variables

In [18]:
# X_train = train.drop(columns = ['target'])
#     - feature selection
#     - fit models
#     - make predictions
# y_train = train.target
#     - feature selection
#     - evaluate model predictions


# X_validate = validate.drop(columns = ['target'])
#     - make predictions using top models
# y_validate = validate.target
#     - evaluate model predictions made from X_validate to assess overfitting

# X_test = test.drop(columns = ['target'])
#     - make predictions using best model
# y_test = test.target
#     -evaluate model predictions made from X_test to estimate future performance on new data

## 01. Decision Tree
Create the Decision Tree Object with desired hyper-parameters

In [19]:
# clf = DecisionTreeClassifier(max_depth = 3, random_state = 123)

Fit the decision tree algorithm to the training data

In [20]:
# model.fit(X, y)

# clf = clf.fit(X_train, y_train)

Visualize the Decision Tree

In [21]:
# dot_data = export_graphviz(clf, feature_names = X_train.columns, class_names = clf.classes_, rounded = True,
#                           out_file = None)

Make predictions, classify each target by its target value

In [22]:
# y_predict = clf.predict(X_train)
# y_predict
#     - array

In [24]:
# Estimate the probability of each target variable, using the training data
# y_predict_prob = clf.predict_proba(X_train)
#     - array of arrays

## Evaluate the model
<b>train, test, and validate</b>

In [25]:
# print('Accuracy of Decision Tree Classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))

Confusion Matrix

In [26]:
# confusion_matrix(y_train, y_predict)
#     - array of arrays

Classification Report
- Precision
- Recall
- f1-score
- support

In [27]:
# print(classification_report(y_train, y_predict))

# <span style="color: #c48f7f">VI. Evaluation </span>
How we evaluate our classification model's performance
##### The methods used in this section, along with the classification_report function are to evaluate the model performance:
- Train ---------> see in-sample performance
- Validate ------> see out-of-sample performance and allow us to tune parameters
- Test -----> estimate how the best performing model will perform on new, unseen data

# Classification Report

In [17]:
# classification_report(actual, predicted)

# report = classification_report(y_train, y_predictions, output_dict = True)
# pd.DataFrame(report)

## Confusion Matrix
A cross-tabulation of our model's predictions against the actual values
- <b>Positive</b>
    - tp:
    - fp: [over-confident]
- <b>Negative</b>
    - tn:
    - fn: [under-confident]
- <b>Consequences</b>
    - fp
    - fn

## Metrics
- <b>Accuracy</b><br>
(TP + TN) / total
    - Total number of times I got it right (positive or negative)
    - Will be the same regardless of which is positive value<br><br>
    
- <b>Precision</b><br>
TP / (TP + FP)
    - <i>Of all of our positive predictions, how many were true positives</i>
    - The percentage of positive predictions we make that are true
    - Does not take negative predictions into account
    - Use when we want to minimize FP: FP is more costly than FN<br><br>
    
- <b>Recall</b><br>
TP / (TP + FN)
    - <i>Of all our actual positive cases, how many did we accurately predict?</i>
    - The percentage of actual positive cases we accurately predicted
    - Does not take actual negatives into account
    - Use when we want to minimize FN: FN is more costly than FP


In [7]:
# baseline

# df['baseline'] = df.target.mode()[0]

In [10]:
# accuracy

# model_accuracy = (df.actual == df.model).mean()
# print(f'Model Accuracy: {model_accuracy}')

# baseline_accuracy = (df.actual == df.baseline).mean()
# print(f'Baseline Accuracy: {baseline_accuracy}')

In [16]:
# precision | accuracy of positive predictions: ratio of positive predictions that were actually positive (TP)
# to all positive predictions made (TP + FP)
#     minimizing FP's
#     looking at the subset of data where we have made a positive prediction
#         - TP
#         - FP

# positive prediction subset
#     subset = df[df.model == 'positive_value']

# model_precision = (subset.actual == subset.model).mean()
# print(f'Model | Precision: {model_precision: .2%}')

In [15]:
# recall | ratio of positive cases you caught (TP) to all actual positive cases (TP + FN)
#     minimizing FN's
#     looking at the subset of data where we actually had positive's
#         - TP
#         - FN

# actual positive subset
#     subset = df[df.actual == 'positive_value' ]

# model_recall = (subset.actual == subset.model).mean()
# print(f'Model | Recall: {model_recall: .2%}')